1 package org.enhydra.snapper.parsers; 2 3 11 12 13 import java.io.File ; 14 import java.io.InputStream ; 15 import java.io.FileInputStream ; 16 17 import java.util.Iterator ; 18 import java.util.Map ; 19 20 import org.textmining.text.extraction.WordExtractor; 21 22 import org.apache.poi.poifs.filesystem.POIFSFileSystem; 23 import org.apache.poi.poifs.filesystem.DirectoryEntry; 24 import org.apache.poi.poifs.filesystem.Entry; 25 import org.apache.poi.poifs.filesystem.DocumentEntry; 26 import org.apache.poi.poifs.filesystem.DocumentNode; 27 import org.apache.poi.poifs.filesystem.DocumentInputStream; 28 import org.pdfbox.pdmodel.PDDocument; 29 import org.pdfbox.util.PDFTextStripper; 30 31 public class MsgParser 32 implements org.enhydra.snapper.api.Parser { 33 static final private String [] propertiesParsed = { 34 "0037", "1000", "3703", "3704"}; 35 static final private int propertiesNumber = propertiesParsed.length; 38 static final private String ATTACH_MIME = "370E"; 40 static final private String ATTACH_DATA = "3701"; 41 static final private String ASCII = "001E"; 42 static final private String UNICODE = "001F"; 43 static final private String TITLE = "0037"; 44 static final private String NULL_STRING = ""; 45 static final private String NAMEBG = "__substg1."; 46 static final private char[] FORBID = { 47 (char) 0}; 48 private String fileName; 50 private String parsedText = NULL_STRING; 51 private boolean titleNotSet; 52 public String title; 53 private String properties = NULL_STRING; 54 public Map customproperties; 55 56 public void parse() { 57 try { 58 titleNotSet = true; 59 FileInputStream fis = new FileInputStream (fileName); 60 POIFSFileSystem fs; 61 fs = new POIFSFileSystem(fis); 62 DirectoryEntry root = fs.getRoot(); 63 DirectoryEntry dir = root; 64 parsedText = directoryParse(dir); 65 dir = null; 66 root = null; 67 fs = null; 68 fis.close(); 69 fis = null; 70 } 71 catch (Exception e) { 72 ParserManager.logger.debug("*** File could not be parsed within " + fileName); 73 } 75 76 } 77 78 private String parseStream(InputStream stream) throws java.io.IOException { 79 POIFSFileSystem fs; 80 fs = new POIFSFileSystem(stream); 81 DirectoryEntry root = fs.getRoot(); 82 DirectoryEntry dir = root; 83 return directoryParse(dir); 84 } 85 86 private String directoryParse(DirectoryEntry dir) throws java.io.IOException { 87 String attachMime = NULL_STRING; 88 String textFound = NULL_STRING; 89 for (Iterator iter = dir.getEntries(); iter.hasNext(); ) { 90 Entry entry = (Entry) iter.next(); 91 String entryName = entry.getName(); 92 93 if (entry instanceof DirectoryEntry) { 94 textFound = textFound + directoryParse( (DirectoryEntry) entry); 97 } 98 else if (entry instanceof DocumentEntry) { 99 String property = NULL_STRING; 101 String type = NULL_STRING; 102 if (entryName.startsWith(NAMEBG)) { 103 property = entryName.substring(12, 16); 104 type = entryName.substring(16, 20); 105 } 106 107 112 for (int i = 0; i < propertiesNumber; i++) { 113 if (property.equals(propertiesParsed[i])) { 114 String text = readDoc( (DocumentNode) entry, type); 115 if (property.equals(TITLE) && titleNotSet) { 116 title = TextFilter.filterForbiddenCharacters(text, FORBID); 117 titleNotSet = false; 118 } 119 textFound = textFound + text + " "; 120 break; 122 } 123 else { 124 if (property.equals(ATTACH_MIME)) { 125 attachMime = readDoc( (DocumentNode) entry, type).trim(); 126 break; 128 } 129 else if (property.equals(ATTACH_DATA)) { 130 String attachText = parseAttachment( (DocumentNode) entry, 131 attachMime); 132 textFound = textFound + attachText; 133 break; 135 } 136 } 137 } 138 } 139 else { 140 } 144 } 145 return TextFilter.filterForbiddenCharacters(textFound, FORBID); 146 } 147 148 private String readDoc(DocumentNode document, String type) throws java.io. 149 IOException { 150 int size = document.getSize(); 151 DocumentInputStream stream = new DocumentInputStream( 152 document); 153 byte[] buf = new byte[size]; 154 int nRead = stream.read(buf); 155 160 if (type.equals(ASCII)) { 162 return new String (buf); 163 } 164 else if (type.equals(UNICODE)) { 165 return new String (buf, "UTF-16LE"); 166 } 167 else { 168 return NULL_STRING; 169 } 170 171 } 172 173 private String parseAttachment(DocumentNode doc, String attachMime) throws 174 java.io.IOException { 175 DocumentInputStream stream = new DocumentInputStream(doc); 176 if (attachMime.equals("application/msword")) { 177 return parseWord(stream); 179 } 180 if (attachMime.equals("application/vnd.ms-excel")) { 181 return parseExcel(stream); 183 } 184 else if (attachMime.equals("application/octet-stream")) { 185 return parseStream(stream); 187 } 188 else if (attachMime.equals("application/pdf")) { 189 return parsePdf(stream); 191 }else if (attachMime.equals("application/vnd.ms-powerpoint")) { 192 PowerParser powerParser=new PowerParser(); 194 return powerParser.parse(stream); 195 } 196 return NULL_STRING; 197 } 198 199 private String parseExcel(InputStream stream) throws java.io.IOException { 200 try { 202 ExcelParser ep = new ExcelParser(); 203 ep.parse(stream); 204 return ep.getParsedText(); 205 206 } 207 catch (Exception e) { 208 ParserManager.logger.debug("*** File could not be parsed within " + fileName); 209 return NULL_STRING; 210 } 211 } 212 213 private String parseWord(InputStream stream) throws java.io.IOException { 214 try { 216 return new WordExtractor().extractText(stream); 217 } 218 catch (Exception e) { 219 ParserManager.logger.debug("*** File could not be parsed within " + fileName); 220 return NULL_STRING; 221 } 222 } 223 224 private String parsePdf(InputStream stream) throws java.io.IOException { 225 try { 226 PDDocument document = null; 227 String text = null; 228 document = PDDocument.load( (InputStream ) stream); 229 PDFTextStripper stripper = new PDFTextStripper(); 230 String pdfFile = null; 231 String textFile = null; 232 int startPage = 1; 233 int endPage = Integer.MAX_VALUE; 234 if (!document.isEncrypted()) { 235 stripper.setStartPage(startPage); 236 stripper.setEndPage(endPage); 237 text = stripper.getText(document); 238 } 239 document.close(); 240 return text; 241 } 242 catch (Exception e) { 243 ParserManager.logger.debug("*** File could not be parsed within " + fileName); 244 return NULL_STRING; 245 } 246 } 247 248 public void setFileName(String fileName) { 249 this.fileName = fileName; 250 } 251 252 public String getParsedText() { 253 return parsedText; 254 } 255 256 public String getTitle() { 257 return title; 258 } 259 260 public String getProperties() { 261 return properties; 262 } 263 264 public String parse(InputStream is) throws java.io.IOException { return "";} 265 266 } | Popular Tags |