package net.nutch.parse.msword;

import org.apache.poi.hpsf.*;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.sprm.*;
import org.apache.poi.poifs.eventfilesystem.*;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;

import java.util.*;
import java.io.*;

/**
 * Extracts the plain text and the summary-information properties of a
 * Microsoft Word document stored in an OLE2 (POIFS) container.
 *
 * <p>Text extraction reads the "WordDocument" stream and the matching
 * "0Table"/"1Table" stream, then stitches the text pieces together following
 * the character runs, skipping runs that are marked as deleted.</p>
 */
public class WordExtractor {

  /** Name of the main document stream inside the POIFS container. */
  private static final String WORD_DOCUMENT_STREAM = "WordDocument";

  /** Creates a new extractor; the extractor itself keeps no state. */
  public WordExtractor() {
  }

  /**
   * Extracts the text of a Word document.
   *
   * @param in stream positioned at the start of the OLE2 document
   * @return the document text with deleted runs left out
   * @throws FastSavedException if the header flags mark the file as fast-saved
   * @throws PasswordProtectedException if the header flags mark the file as
   *         password protected
   * @throws EOFException if a document stream is shorter than its directory
   *         entry announces
   * @throws Exception on any other I/O or parsing failure
   */
  public String extractText(InputStream in) throws Exception {
    POIFSFileSystem fsys = new POIFSFileSystem(in);
    byte[] header = readStream(fsys, WORD_DOCUMENT_STREAM);

    // Flag word of the file header.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
      throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
      throw new PasswordProtectedException("This document is password protected");
    }

    // Version identifiers 101..104 are handed to the Word 6 extractor, which
    // works from the header bytes alone.
    int nFib = LittleEndian.getShort(header, 0x2);
    if (nFib >= 101 && nFib <= 104) {
      Word6Extractor oldExtractor = new Word6Extractor();
      return oldExtractor.extractText(header);
    }

    // Flag 0x200 selects which of the two table streams belongs to the document.
    boolean useTable1 = (info & 0x200) != 0;
    String tableName = useTable1 ? "1Table" : "0Table";
    byte[] tableStream = readStream(fsys, tableName);

    // Both streams are in memory now; let the filesystem be collected while
    // the text is assembled.
    fsys = null;

    int complexOffset = LittleEndian.getInt(header, 0x1a2);
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();

    return assembleText(cbt.getTextRuns(), tpt.getTextPieces());
  }

  /**
   * Reads one complete document stream of the filesystem into memory.
   *
   * <p>The read is repeated until the buffer is full because a single
   * {@code InputStream.read} call may return fewer bytes than requested; the
   * stream is closed even when reading fails.</p>
   *
   * @param fsys filesystem that holds the stream
   * @param name name of a document entry directly below the root
   * @return the stream content, sized according to the directory entry
   * @throws IOException if the entry is missing, reading fails, or the stream
   *         ends before the announced size was read
   */
  private static byte[] readStream(POIFSFileSystem fsys, String name)
      throws IOException {
    DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
    DocumentInputStream din = fsys.createDocumentInputStream(name);
    try {
      byte[] data = new byte[entry.getSize()];
      int filled = 0;
      while (filled < data.length) {
        int count = din.read(data, filled, data.length - filled);
        if (count < 0) {
          throw new EOFException("Stream '" + name + "' ended after " + filled
              + " of " + data.length + " bytes");
        }
        filled += count;
      }
      return data;
    } finally {
      din.close();
    }
  }

  /**
   * Walks the character runs in order and copies the text each one covers out
   * of the text pieces, skipping deleted runs.
   *
   * <p>If the runs reach beyond the last text piece, the text collected so far
   * is returned instead of failing.</p>
   *
   * @param textRuns list of {@code CHPX} character runs
   * @param textPieces list of {@code TextPiece} objects
   * @return the assembled text
   */
  private String assembleText(List textRuns, List textPieces) {
    WordTextBuffer finalTextBuf = new WordTextBuffer();
    Iterator textIt = textPieces.iterator();
    if (!textIt.hasNext()) {
      // No text pieces at all: nothing to extract.
      return finalTextBuf.toString();
    }

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    for (Iterator runIt = textRuns.iterator(); runIt.hasNext();) {
      CHPX chpx = (CHPX) runIt.next();
      if (isDeleted(chpx.getGrpprl())) {
        continue;
      }

      int runStart = chpx.getStart();
      int runEnd = chpx.getEnd();

      // Skip pieces that end before this run begins.
      while (runStart >= currentTextEnd) {
        if (!textIt.hasNext()) {
          return finalTextBuf.toString();
        }
        currentPiece = (TextPiece) textIt.next();
        currentTextStart = currentPiece.getStart();
        currentTextEnd = currentPiece.getEnd();
      }

      if (runEnd < currentTextEnd) {
        // Run lies completely inside the current piece.
        finalTextBuf.append(currentPiece.substring(runStart - currentTextStart,
            runEnd - currentTextStart));
      } else if (runEnd > currentTextEnd) {
        // Run spans several pieces: take the rest of each piece it crosses...
        while (runEnd > currentTextEnd) {
          finalTextBuf.append(currentPiece.substring(runStart - currentTextStart,
              currentTextEnd - currentTextStart));
          if (!textIt.hasNext()) {
            return finalTextBuf.toString();
          }
          currentPiece = (TextPiece) textIt.next();
          currentTextStart = currentPiece.getStart();
          runStart = currentTextStart;
          currentTextEnd = currentPiece.getEnd();
        }
        // ...and the leading part of the piece in which it ends.
        finalTextBuf.append(currentPiece.substring(0, runEnd - currentTextStart));
      } else {
        // Run ends exactly where the piece ends: take it and move on.
        finalTextBuf.append(currentPiece.substring(runStart - currentTextStart,
            runEnd - currentTextStart));
        if (textIt.hasNext()) {
          currentPiece = (TextPiece) textIt.next();
          currentTextStart = currentPiece.getStart();
          currentTextEnd = currentPiece.getEnd();
        }
      }
    }
    return finalTextBuf.toString();
  }

  /**
   * Tells whether a run's property modifiers mark it as deleted.
   *
   * @param grpprl the raw property modifiers of a character run
   * @return {@code true} if an operation with code 0 and a non-zero operand is
   *         present (presumably the "deleted" revision mark - confirm against
   *         the sprm definitions)
   */
  private boolean isDeleted(byte[] grpprl) {
    SprmIterator iterator = new SprmIterator(grpprl, 0);
    while (iterator.hasNext()) {
      SprmOperation op = iterator.next();
      if (op.getOperation() == 0 && op.getOperand() != 0) {
        return true;
      }
    }
    return false;
  }

  /**
   * Extracts the summary-information properties of a document.
   *
   * @param in stream positioned at the start of the OLE2 document
   * @return the properties that are set in the document, or {@code null} if
   *         the summary information is missing or unreadable
   * @throws IOException if the container cannot be read
   */
  public Properties extractProperties(InputStream in)
      throws IOException {

    PropertiesBroker propertiesBroker = new PropertiesBroker();
    POIFSReader reader = new POIFSReader();
    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
        "\005SummaryInformation");
    reader.read(in);
    // NOTE(review): the listener appears to be invoked from within read(); if
    // so, the broker only waits when the document has no summary stream.
    return propertiesBroker.getProperties();
  }

  /**
   * Listener that converts the summary-information stream into a
   * {@link Properties} object and hands it to a {@link PropertiesBroker}.
   */
  class PropertiesReaderListener
      implements POIFSReaderListener {

    private PropertiesBroker propertiesBroker;

    /**
     * @param propertiesBroker receiver of the extracted properties
     */
    public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
      this.propertiesBroker = propertiesBroker;
    }

    /**
     * Parses the stream of the event as summary information and publishes the
     * fields that are set. If the stream cannot be parsed, {@code null} is
     * published so that a waiting consumer is released immediately.
     *
     * @param event the reader event carrying the summary-information stream
     */
    public void processPOIFSReaderEvent(POIFSReaderEvent event) {

      SummaryInformation si;
      try {
        si = (SummaryInformation) PropertySetFactory.create(event.getStream());
      } catch (Exception ex) {
        // Unreadable or unexpected property set: there is nothing to extract.
        propertiesBroker.setProperties(null);
        return;
      }

      Properties properties = new Properties();

      putString(properties, "Title", si.getTitle());
      putString(properties, "Application-Name", si.getApplicationName());
      putString(properties, "Author", si.getAuthor());
      putNumber(properties, "Character Count", si.getCharCount());
      putString(properties, "Comments", si.getComments());
      putDate(properties, "Creation-Date", si.getCreateDateTime());
      putNumber(properties, "Edit-Time", si.getEditTime());
      putString(properties, "Keywords", si.getKeywords());
      putString(properties, "Last-Author", si.getLastAuthor());
      putDate(properties, "Last-Printed", si.getLastPrinted());
      putDate(properties, "Last-Save-Date", si.getLastSaveDateTime());
      putNumber(properties, "Page-Count", si.getPageCount());
      putString(properties, "Revision-Number", si.getRevNumber());
      putNumber(properties, "Security", si.getSecurity());
      putString(properties, "Subject", si.getSubject());
      putString(properties, "Template", si.getTemplate());
      putNumber(properties, "Word-Count", si.getWordCount());

      propertiesBroker.setProperties(properties);
    }

    /** Stores a text value unless it is {@code null}. */
    private void putString(Properties target, String key, String value) {
      if (value != null) {
        target.setProperty(key, value);
      }
    }

    /** Stores a date as milliseconds since the epoch unless it is {@code null}. */
    private void putDate(Properties target, String key, Date value) {
      if (value != null) {
        target.setProperty(key, String.valueOf(value.getTime()));
      }
    }

    /** Stores a number in decimal form unless it is zero. */
    private void putNumber(Properties target, String key, long value) {
      if (value != 0) {
        target.setProperty(key, String.valueOf(value));
      }
    }
  }

  /**
   * Hand-over point for the extracted properties: the listener stores the
   * result, the consumer waits for it for a limited time.
   */
  class PropertiesBroker {

    private Properties properties;
    /** Set once a result (possibly {@code null}) has been handed over. */
    private boolean delivered;
    private int timeoutMillis = 2 * 1000;

    /**
     * Returns the delivered properties, waiting up to the timeout for them.
     * Waiting stops early when the thread is interrupted; the interrupt status
     * is restored in that case.
     *
     * @return the properties, or {@code null} if none were delivered in time
     *         or the delivered result was {@code null}
     */
    public synchronized Properties getProperties() {

      long deadline = System.currentTimeMillis() + timeoutMillis;

      while (!delivered) {
        long remaining = deadline - System.currentTimeMillis();
        if (remaining <= 0) {
          // wait(0) would block without limit, so stop here.
          break;
        }
        try {
          wait(remaining);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          break;
        }
      }

      return properties;
    }

    /**
     * Hands over the result and wakes up waiting consumers.
     *
     * @param properties the extracted properties; {@code null} signals that
     *        nothing could be extracted
     */
    public synchronized void setProperties(Properties properties) {
      this.properties = properties;
      this.delivered = true;
      notifyAll();
    }
  }
}