package org.textmining.text.extraction;

import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.model.*;
import org.textmining.text.extraction.sprm.*;

import java.util.*;
import java.io.*;

/**
 * Extracts the text of a Word document stored in a POIFS (OLE2) container.
 *
 * <p>The document header is read from the "WordDocument" stream. Headers
 * whose version field is 101-104 are handed to {@link Word6Extractor}; all
 * other documents are decoded here by walking the character runs and the
 * text piece table.
 */
public class WordExtractor
{

  /**
   * Creates a new extractor. The extractor keeps no state between calls.
   */
  public WordExtractor()
  {
  }

  /**
   * Extracts the text of the Word document read from the given stream.
   *
   * <p>Failures while opening the container or validating the header are
   * reported to the caller. Once the character runs are being merged with
   * the text pieces the extraction is best-effort: if the tables turn out to
   * be inconsistent, the text gathered so far is returned.
   *
   * @param in stream positioned at the start of the document file.
   * @return the extracted text; empty if the document has no text pieces.
   * @throws FastSavedException if the header flags mark the file as fast-saved.
   * @throws PasswordProtectedException if the header flags mark the file as
   *         password protected.
   * @throws Exception if the container or one of its streams cannot be read.
   */
  public String extractText(InputStream in) throws Exception
  {
    POIFSFileSystem fsys = new POIFSFileSystem(in);
    byte[] header = readStream(fsys, "WordDocument");

    // Flag word of the header; the individual bits are tested below.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0)
    {
      throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0)
    {
      throw new PasswordProtectedException("This document is password protected");
    }

    // Version identifier; these values are delegated to the Word 6 extractor.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib)
    {
      case 101:
      case 102:
      case 103:
      case 104:
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    // The 0x200 flag selects which of the two table streams is in use.
    boolean useTable1 = (info & 0x200) != 0;
    String tableName = useTable1 ? "1Table" : "0Table";

    int complexOffset = LittleEndian.getInt(header, 0x1a2);
    byte[] tableStream = readStream(fsys, tableName);

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();
    List textRuns = cbt.getTextRuns();

    // The container is no longer needed; drop it before building the text.
    fsys = null;

    return mergeRuns(textRuns, textPieces);
  }

  /**
   * Reads a whole named stream from the root of the container into memory.
   *
   * @param fsys the opened container.
   * @param name name of the root-level stream to read.
   * @return the stream contents, sized according to the directory entry.
   * @throws IOException if the entry is missing or cannot be read.
   */
  private static byte[] readStream(POIFSFileSystem fsys, String name) throws IOException
  {
    DocumentEntry entry = (DocumentEntry)fsys.getRoot().getEntry(name);
    byte[] data = new byte[entry.getSize()];

    DocumentInputStream din = fsys.createDocumentInputStream(name);
    try
    {
      // A single read() is not guaranteed to fill the buffer, so loop.
      int filled = 0;
      while (filled < data.length)
      {
        int count = din.read(data, filled, data.length - filled);
        if (count <= 0)
        {
          break;
        }
        filled += count;
      }
    }
    finally
    {
      din.close();
    }
    return data;
  }

  /**
   * Walks the character runs in order and appends, for every run that is not
   * flagged as deleted, the matching slice of the text pieces.
   *
   * @param textRuns list of {@link CHPX} character runs.
   * @param textPieces list of {@link TextPiece} objects.
   * @return the text collected; partial if the tables are inconsistent.
   */
  private String mergeRuns(List textRuns, List textPieces)
  {
    WordTextBuffer finalTextBuf = new WordTextBuffer();
    Iterator textIt = textPieces.iterator();
    if (!textIt.hasNext())
    {
      // No text pieces means there is nothing to extract.
      return finalTextBuf.toString();
    }

    try
    {
      TextPiece currentPiece = (TextPiece)textIt.next();
      int currentTextStart = currentPiece.getStart();
      int currentTextEnd = currentPiece.getEnd();

      Iterator runIt = textRuns.iterator();
      while (runIt.hasNext())
      {
        CHPX chpx = (CHPX)runIt.next();
        if (isDeleted(chpx.getGrpprl()))
        {
          continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // Skip pieces that end before this run begins.
        while (runStart >= currentTextEnd)
        {
          if (!textIt.hasNext())
          {
            // The run lies beyond the last piece; nothing more to extract.
            return finalTextBuf.toString();
          }
          currentPiece = (TextPiece)textIt.next();
          currentTextStart = currentPiece.getStart();
          currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd)
        {
          // Run lies entirely inside the current piece.
          finalTextBuf.append(currentPiece.substring(runStart - currentTextStart,
                                                     runEnd - currentTextStart));
        }
        else if (runEnd > currentTextEnd)
        {
          // Run spans several pieces: take the tail of the current piece,
          // then continue from the start of each following piece.
          while (runEnd > currentTextEnd)
          {
            finalTextBuf.append(currentPiece.substring(runStart - currentTextStart,
                                                       currentTextEnd - currentTextStart));
            if (!textIt.hasNext())
            {
              return finalTextBuf.toString();
            }
            currentPiece = (TextPiece)textIt.next();
            currentTextStart = currentPiece.getStart();
            runStart = currentTextStart;
            currentTextEnd = currentPiece.getEnd();
          }
          finalTextBuf.append(currentPiece.substring(0, runEnd - currentTextStart));
        }
        else
        {
          // Run ends exactly where the piece ends; move on to the next piece.
          finalTextBuf.append(currentPiece.substring(runStart - currentTextStart,
                                                     runEnd - currentTextStart));
          if (textIt.hasNext())
          {
            currentPiece = (TextPiece)textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
          }
        }
      }
    }
    catch (Exception e)
    {
      // Deliberate best effort: inconsistent offsets in a damaged document
      // should not discard the text already recovered. Errors such as
      // OutOfMemoryError are intentionally not caught here.
    }
    return finalTextBuf.toString();
  }

  /**
   * Tells whether a character run's property modifiers flag it as deleted.
   *
   * <p>A run counts as deleted when any modifier has operation code 0 and a
   * non-zero operand. NOTE(review): presumably this is the "revision mark
   * delete" character property - confirm against the sprm definitions.
   *
   * @param grpprl the raw property modifier list of the run.
   * @return true if the run should be left out of the extracted text.
   */
  private boolean isDeleted(byte[] grpprl)
  {
    SprmIterator iterator = new SprmIterator(grpprl);
    while (iterator.hasNext())
    {
      SprmOperation op = iterator.next();
      if (op.getOperation() == 0 && op.getOperand() != 0)
      {
        return true;
      }
    }
    return false;
  }
}