PoiHssfContentTransformer


1   /*
2    * Copyright (C) 2005 Alfresco, Inc.
3    *
4    * Licensed under the Mozilla Public License version 1.1 
5    * with a permitted attribution clause. You may obtain a
6    * copy of the License at
7    *
8    *   http://www.alfresco.org/legal/license.txt
9    *
10   * Unless required by applicable law or agreed to in writing,
11   * software distributed under the License is distributed on an
12   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific
14   * language governing permissions and limitations under the
15   * License.
16   */
17  package org.alfresco.repo.content.transform;
18  
19  import java.io.InputStream  ;
20  import java.io.OutputStream  ;
21  import java.util.Map  ;
22  
23  import org.alfresco.repo.content.MimetypeMap;
24  import org.alfresco.service.cmr.repository.ContentReader;
25  import org.alfresco.service.cmr.repository.ContentWriter;
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.poi.hssf.usermodel.HSSFCell;
29  import org.apache.poi.hssf.usermodel.HSSFRow;
30  import org.apache.poi.hssf.usermodel.HSSFSheet;
31  import org.apache.poi.hssf.usermodel.HSSFWorkbook;
32  
33  /**
34   * Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to
35   * perform conversions from Excel spreadsheets to text (comma separated).
36   * <p>
37   * While most text extraction from spreadsheets only extract the first sheet of
38   * the workbook, the method used here extracts the text from <b>all the sheets</b>.
39   * This is more useful, especially when it comes to indexing spreadsheets.
40   * <p>
41   * In the case where there is only one sheet in the document, the results will be
42   * exactly the same as most extractors.  Where there are multiple sheets, the results
43   * will differ, but meaningful reimporting of the text document is not possible
44   * anyway.
45   * 
46   * @author Derek Hulley
47   */
48  public class PoiHssfContentTransformer extends AbstractContentTransformer
49  {
50      /**
51       * Windows carriage return line feed pair.
52       */
53      private static final String   LINE_BREAK = "\r\n";
54      
55      /**
56       * Currently the only transformation performed is that of text extraction from XLS documents.
57       */
58      public double getReliability(String   sourceMimetype, String   targetMimetype)
59      {
60          if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) ||
61                  !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
62          {
63              // only support XLS -> Text
64              return 0.0;
65          }
66          else
67          {
68              return 1.0;
69          }
70      }
71  
72      public void transformInternal(ContentReader reader, ContentWriter writer,  Map  <String  , Object  > options)
73              throws Exception  
74      {
75          InputStream   is = reader.getContentInputStream();
76          OutputStream   os = writer.getContentOutputStream();
77          String   encoding = writer.getEncoding();
78          try
79          {
80              // open the workbook
81              HSSFWorkbook workbook = new HSSFWorkbook(is);
82              // how many sheets are there?
83              int sheetCount = workbook.getNumberOfSheets();
84              // transform each sheet
85              for (int i = 0; i < sheetCount; i++)
86              {
87                  HSSFSheet sheet = workbook.getSheetAt(i);
88                  String   sheetName = workbook.getSheetName(i);
89                  writeSheet(os, sheet, encoding);
90                  // write the sheet name
91                  PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
92                  PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true);
93                  PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
94                  PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
95              }
96          }
97          finally
98          {
99              if (is != null)
100             {
101                 try { is.close(); } catch (Throwable   e) {}
102             }
103             if (os != null)
104             {
105                 try { os.close(); } catch (Throwable   e) {}
106             }
107         }
108     }
109     
110     /**
111      * Dumps the text from the sheet to the stream in CSV format
112      */
113     private void writeSheet(OutputStream   os, HSSFSheet sheet, String   encoding) throws Exception  
114     {
115         int rows = sheet.getLastRowNum();
116         // transform each row
117         for (int i = 0; i <= rows; i++)
118         {
119             HSSFRow row = sheet.getRow(i);
120             if (row != null)
121             {
122                 writeRow(os, row, encoding);
123             }
124             // break between rows
125             if (i < rows)
126             {
127                 PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
128             }
129         }
130     }
131     
132     private void writeRow(OutputStream   os, HSSFRow row, String   encoding) throws Exception  
133     {
134         short firstCellNum = row.getFirstCellNum(); 
135         short lastCellNum = row.getLastCellNum();
136         // pad out to first cell
137         for (short i = 0; i < firstCellNum; i++)
138         {
139             PoiHssfContentTransformer.writeString(os, encoding, ",", false);   // CSV up to first cell
140         }
141         // write each cell
142         for (short i = 0; i <= lastCellNum; i++)
143         {
144             HSSFCell cell = row.getCell(i);
145             if (cell != null)
146             {
147                 StringBuilder   sb = new StringBuilder  (10);
148                 switch (cell.getCellType())
149                 {
150                     case HSSFCell.CELL_TYPE_BLANK:
151                         // ignore
152                         break;
153                     case HSSFCell.CELL_TYPE_BOOLEAN:
154                         sb.append(cell.getBooleanCellValue());
155                         break;
156                     case HSSFCell.CELL_TYPE_ERROR:
157                         sb.append("ERROR");
158                         break;
159                     case HSSFCell.CELL_TYPE_FORMULA:
160                         double dataNumber = cell.getNumericCellValue();
161                         if (Double.isNaN(dataNumber))
162                         {
163                             // treat it as a string
164                             sb.append(cell.getStringCellValue());
165                         }
166                         else
167                         {
168                             // treat it as a number
169                             sb.append(dataNumber);
170                         }
171                         break;
172                     case HSSFCell.CELL_TYPE_NUMERIC:
173                         sb.append(cell.getNumericCellValue());
174                         break;
175                     case HSSFCell.CELL_TYPE_STRING:
176                         sb.append(cell.getStringCellValue());
177                         break;
178                     default:
179                         throw new RuntimeException  ("Unknown HSSF cell type: " + cell);
180                 }
181                 String   data = sb.toString();
182                 PoiHssfContentTransformer.writeString(os, encoding, data, true);
183             }
184             // comma separate if required
185             if (i < lastCellNum)
186             {
187                 PoiHssfContentTransformer.writeString(os, encoding, ",", false);
188             }
189         }
190     }
191     
192     /**
193      * Writes the given data to the stream using the encoding specified.  If the encoding
194      * is not given, the default <tt>String</tt> to <tt>byte[]</tt> conversion will be
195      * used.
196      * <p>
197      * The given data string will be escaped appropriately.
198      * 
199      * @param os the stream to write to
200      * @param encoding the encoding to use, or null if the default encoding is acceptable
201      * @param value the string to write
202      * @param isData true if the value represents a human-readable string, false if the
203      *      value represents formatting characters, separating characters, etc.
204      * @throws Exception
205      */
206     public static void writeString(OutputStream   os, String   encoding, String   value, boolean isData) throws Exception  
207     {
208         if (value == null)
209         {
210             // nothing to do
211             return;
212         }
213         int dataLength = value.length();
214         if (dataLength == 0)
215         {
216             // nothing to do
217             return;
218         }
219         
220         // escape the string
221         StringBuilder   sb = new StringBuilder  (dataLength + 5);   // slightly longer than the data
222         for (int i = 0; i < dataLength; i++)
223         {
224             char currentChar = value.charAt(i);
225             if (currentChar == '\"')         // inverted commas
226             {
227                 sb.append("\"");      // CSV escaping of inverted commas 
228             }
229             // append the char
230             sb.append(currentChar);
231         }
232         // enclose in inverted commas for safety
233         if (isData)
234         {
235             sb.insert(0, "\"");
236             sb.append("\"");
237         }
238         // escaping complete
239         value = sb.toString();
240         
241         byte[] bytes = null;
242         if (encoding == null)
243         {
244             // use default encoding
245             bytes = value.getBytes();
246         }
247         else
248         {
249             bytes = value.getBytes(encoding);
250         }
251         // write to the stream
252         os.write(bytes);
253         // done
254     }
255 }
256
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags