// KickJava   Java API By Example, From Geeks To Geeks.

// Java > Open Source Codes > org > alfresco > repo > content > transform > PoiHssfContentTransformer


/*
 * Copyright (C) 2005 Alfresco, Inc.
 *
 * Licensed under the Mozilla Public License version 1.1
 * with a permitted attribution clause. You may obtain a
 * copy of the License at
 *
 * http://www.alfresco.org/legal/license.txt
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific
 * language governing permissions and limitations under the
 * License.
 */

package org.alfresco.repo.content.transform;

import java.io.InputStream;
import java.io.OutputStream;
import java.util.Map;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

33 /**
34  * Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to
35  * perform conversions from Excel spreadsheets to text (comma separated).
36  * <p>
37  * While most text extraction from spreadsheets only extract the first sheet of
38  * the workbook, the method used here extracts the text from <b>all the sheets</b>.
39  * This is more useful, especially when it comes to indexing spreadsheets.
40  * <p>
41  * In the case where there is only one sheet in the document, the results will be
42  * exactly the same as most extractors. Where there are multiple sheets, the results
43  * will differ, but meaningful reimporting of the text document is not possible
44  * anyway.
45  *
46  * @author Derek Hulley
47  */

48 public class PoiHssfContentTransformer extends AbstractContentTransformer
49 {
50     /**
51      * Windows carriage return line feed pair.
52      */

53     private static final String JavaDoc LINE_BREAK = "\r\n";
54     
55     /**
56      * Currently the only transformation performed is that of text extraction from XLS documents.
57      */

58     public double getReliability(String JavaDoc sourceMimetype, String JavaDoc targetMimetype)
59     {
60         if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) ||
61                 !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
62         {
63             // only support XLS -> Text
64
return 0.0;
65         }
66         else
67         {
68             return 1.0;
69         }
70     }
71
72     public void transformInternal(ContentReader reader, ContentWriter writer, Map JavaDoc<String JavaDoc, Object JavaDoc> options)
73             throws Exception JavaDoc
74     {
75         InputStream JavaDoc is = reader.getContentInputStream();
76         OutputStream JavaDoc os = writer.getContentOutputStream();
77         String JavaDoc encoding = writer.getEncoding();
78         try
79         {
80             // open the workbook
81
HSSFWorkbook workbook = new HSSFWorkbook(is);
82             // how many sheets are there?
83
int sheetCount = workbook.getNumberOfSheets();
84             // transform each sheet
85
for (int i = 0; i < sheetCount; i++)
86             {
87                 HSSFSheet sheet = workbook.getSheetAt(i);
88                 String JavaDoc sheetName = workbook.getSheetName(i);
89                 writeSheet(os, sheet, encoding);
90                 // write the sheet name
91
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
92                 PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true);
93                 PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
94                 PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
95             }
96         }
97         finally
98         {
99             if (is != null)
100             {
101                 try { is.close(); } catch (Throwable JavaDoc e) {}
102             }
103             if (os != null)
104             {
105                 try { os.close(); } catch (Throwable JavaDoc e) {}
106             }
107         }
108     }
109     
110     /**
111      * Dumps the text from the sheet to the stream in CSV format
112      */

113     private void writeSheet(OutputStream JavaDoc os, HSSFSheet sheet, String JavaDoc encoding) throws Exception JavaDoc
114     {
115         int rows = sheet.getLastRowNum();
116         // transform each row
117
for (int i = 0; i <= rows; i++)
118         {
119             HSSFRow row = sheet.getRow(i);
120             if (row != null)
121             {
122                 writeRow(os, row, encoding);
123             }
124             // break between rows
125
if (i < rows)
126             {
127                 PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
128             }
129         }
130     }
131     
132     private void writeRow(OutputStream JavaDoc os, HSSFRow row, String JavaDoc encoding) throws Exception JavaDoc
133     {
134         short firstCellNum = row.getFirstCellNum();
135         short lastCellNum = row.getLastCellNum();
136         // pad out to first cell
137
for (short i = 0; i < firstCellNum; i++)
138         {
139             PoiHssfContentTransformer.writeString(os, encoding, ",", false); // CSV up to first cell
140
}
141         // write each cell
142
for (short i = 0; i <= lastCellNum; i++)
143         {
144             HSSFCell cell = row.getCell(i);
145             if (cell != null)
146             {
147                 StringBuilder JavaDoc sb = new StringBuilder JavaDoc(10);
148                 switch (cell.getCellType())
149                 {
150                     case HSSFCell.CELL_TYPE_BLANK:
151                         // ignore
152
break;
153                     case HSSFCell.CELL_TYPE_BOOLEAN:
154                         sb.append(cell.getBooleanCellValue());
155                         break;
156                     case HSSFCell.CELL_TYPE_ERROR:
157                         sb.append("ERROR");
158                         break;
159                     case HSSFCell.CELL_TYPE_FORMULA:
160                         double dataNumber = cell.getNumericCellValue();
161                         if (Double.isNaN(dataNumber))
162                         {
163                             // treat it as a string
164
sb.append(cell.getStringCellValue());
165                         }
166                         else
167                         {
168                             // treat it as a number
169
sb.append(dataNumber);
170                         }
171                         break;
172                     case HSSFCell.CELL_TYPE_NUMERIC:
173                         sb.append(cell.getNumericCellValue());
174                         break;
175                     case HSSFCell.CELL_TYPE_STRING:
176                         sb.append(cell.getStringCellValue());
177                         break;
178                     default:
179                         throw new RuntimeException JavaDoc("Unknown HSSF cell type: " + cell);
180                 }
181                 String JavaDoc data = sb.toString();
182                 PoiHssfContentTransformer.writeString(os, encoding, data, true);
183             }
184             // comma separate if required
185
if (i < lastCellNum)
186             {
187                 PoiHssfContentTransformer.writeString(os, encoding, ",", false);
188             }
189         }
190     }
191     
192     /**
193      * Writes the given data to the stream using the encoding specified. If the encoding
194      * is not given, the default <tt>String</tt> to <tt>byte[]</tt> conversion will be
195      * used.
196      * <p>
197      * The given data string will be escaped appropriately.
198      *
199      * @param os the stream to write to
200      * @param encoding the encoding to use, or null if the default encoding is acceptable
201      * @param value the string to write
202      * @param isData true if the value represents a human-readable string, false if the
203      * value represents formatting characters, separating characters, etc.
204      * @throws Exception
205      */

206     public static void writeString(OutputStream JavaDoc os, String JavaDoc encoding, String JavaDoc value, boolean isData) throws Exception JavaDoc
207     {
208         if (value == null)
209         {
210             // nothing to do
211
return;
212         }
213         int dataLength = value.length();
214         if (dataLength == 0)
215         {
216             // nothing to do
217
return;
218         }
219         
220         // escape the string
221
StringBuilder JavaDoc sb = new StringBuilder JavaDoc(dataLength + 5); // slightly longer than the data
222
for (int i = 0; i < dataLength; i++)
223         {
224             char currentChar = value.charAt(i);
225             if (currentChar == '\"') // inverted commas
226
{
227                 sb.append("\""); // CSV escaping of inverted commas
228
}
229             // append the char
230
sb.append(currentChar);
231         }
232         // enclose in inverted commas for safety
233
if (isData)
234         {
235             sb.insert(0, "\"");
236             sb.append("\"");
237         }
238         // escaping complete
239
value = sb.toString();
240         
241         byte[] bytes = null;
242         if (encoding == null)
243         {
244             // use default encoding
245
bytes = value.getBytes();
246         }
247         else
248         {
249             bytes = value.getBytes(encoding);
250         }
251         // write to the stream
252
os.write(bytes);
253         // done
254
}
255 }
// Popular Tags