KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > search > extractors > CmsExtractorMsExcel


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/CmsExtractorMsExcel.java,v $
3  * Date : $Date: 2005/06/23 11:11:28 $
4  * Version: $Revision: 1.8 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Mananagement System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.search.extractors;
33
34 import org.opencms.util.CmsStringUtil;
35
36 import java.io.IOException JavaDoc;
37 import java.io.InputStream JavaDoc;
38 import java.util.Iterator JavaDoc;
39 import java.util.Map JavaDoc;
40
41 import org.apache.poi.hssf.usermodel.HSSFCell;
42 import org.apache.poi.hssf.usermodel.HSSFRow;
43 import org.apache.poi.hssf.usermodel.HSSFSheet;
44 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
45 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
46
47 /**
48  * Extracts the text form an MS Excel document.<p>
49  *
50  * @author Alexander Kandzior
51  *
52  * @version $Revision: 1.8 $
53  *
54  * @since 6.0.0
55  */

56 public final class CmsExtractorMsExcel extends A_CmsTextExtractorMsOfficeBase {
57
58     /** Static member instance of the extractor. */
59     private static final CmsExtractorMsExcel INSTANCE = new CmsExtractorMsExcel();
60
61     /**
62      * Hide the public constructor.<p>
63      */

64     private CmsExtractorMsExcel() {
65
66         // noop
67
}
68
69     /**
70      * Returns an instance of this text extractor.<p>
71      *
72      * @return an instance of this text extractor
73      */

74     public static I_CmsTextExtractor getExtractor() {
75
76         return INSTANCE;
77     }
78
79     /**
80      * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
81      */

82     public I_CmsExtractionResult extractText(InputStream JavaDoc in, String JavaDoc encoding) throws Exception JavaDoc {
83
84         // first extract the table content
85
String JavaDoc result = extractTableContent(getStreamCopy(in));
86         result = removeControlChars(result);
87
88         // now extract the meta information using POI
89
POIFSReader reader = new POIFSReader();
90         reader.registerListener(this);
91         reader.read(getStreamCopy(in));
92         Map JavaDoc metaInfo = extractMetaInformation();
93
94         // return the final result
95
return new CmsExtractionResult(result, metaInfo);
96     }
97
98     /**
99      * Extracts the text from the Excel table content.<p>
100      *
101      * @param in the document input stream
102      * @return the extracted text
103      * @throws IOException if something goes wring
104      */

105     protected String JavaDoc extractTableContent(InputStream JavaDoc in) throws IOException JavaDoc {
106
107         HSSFWorkbook excelWb = new HSSFWorkbook(in);
108         StringBuffer JavaDoc result = new StringBuffer JavaDoc(4096);
109
110         int numberOfSheets = excelWb.getNumberOfSheets();
111
112         for (int i = 0; i < numberOfSheets; i++) {
113             HSSFSheet sheet = excelWb.getSheetAt(i);
114             int numberOfRows = sheet.getPhysicalNumberOfRows();
115             if (numberOfRows > 0) {
116
117                 if (CmsStringUtil.isNotEmpty(excelWb.getSheetName(i))) {
118                     // append sheet name to content
119
if (i > 0) {
120                         result.append("\n\n");
121                     }
122                     result.append(excelWb.getSheetName(i).trim());
123                     result.append(":\n\n");
124                 }
125
126                 Iterator JavaDoc rowIt = sheet.rowIterator();
127                 while (rowIt.hasNext()) {
128                     HSSFRow row = (HSSFRow)rowIt.next();
129                     if (row != null) {
130                         boolean hasContent = false;
131                         Iterator JavaDoc it = row.cellIterator();
132                         while (it.hasNext()) {
133                             HSSFCell cell = (HSSFCell)it.next();
134                             String JavaDoc text = null;
135                             try {
136                                 switch (cell.getCellType()) {
137                                     case HSSFCell.CELL_TYPE_BLANK:
138                                     case HSSFCell.CELL_TYPE_ERROR:
139                                         // ignore all blank or error cells
140
break;
141                                     case HSSFCell.CELL_TYPE_NUMERIC:
142                                         text = Double.toString(cell.getNumericCellValue());
143                                         break;
144                                     case HSSFCell.CELL_TYPE_BOOLEAN:
145                                         text = Boolean.toString(cell.getBooleanCellValue());
146                                         break;
147                                     case HSSFCell.CELL_TYPE_STRING:
148                                     default:
149                                         text = cell.getStringCellValue();
150                                         break;
151                                 }
152                             } catch (Exception JavaDoc e) {
153                                 // ignore this cell
154
}
155                             if (CmsStringUtil.isNotEmpty(text)) {
156                                 result.append(text.trim());
157                                 result.append(' ');
158                                 hasContent = true;
159                             }
160                         }
161                         if (hasContent) {
162                             // append a newline at the end of each row that has content
163
result.append('\n');
164                         }
165                     }
166                 }
167             }
168         }
169
170         return result.toString();
171     }
172
173 }
Popular Tags