KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > snapper > parsers > MsgParser


1 package org.enhydra.snapper.parsers;
2
3 /**
4  * <p>Title: </p>
5  * <p>Description: </p>
6  * <p>Copyright: Copyright (c) 2005</p>
7  * <p>Company: </p>
8  * @author not attributable
9  * @version 1.0
10  */

11
12
13 import java.io.File JavaDoc;
14 import java.io.InputStream JavaDoc;
15 import java.io.FileInputStream JavaDoc;
16
17 import java.util.Iterator JavaDoc;
18 import java.util.Map JavaDoc;
19
20 import org.textmining.text.extraction.WordExtractor;
21
22 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
23 import org.apache.poi.poifs.filesystem.DirectoryEntry;
24 import org.apache.poi.poifs.filesystem.Entry;
25 import org.apache.poi.poifs.filesystem.DocumentEntry;
26 import org.apache.poi.poifs.filesystem.DocumentNode;
27 import org.apache.poi.poifs.filesystem.DocumentInputStream;
28 import org.pdfbox.pdmodel.PDDocument;
29 import org.pdfbox.util.PDFTextStripper;
30
31 public class MsgParser
32     implements org.enhydra.snapper.api.Parser {
33   static final private String JavaDoc[] propertiesParsed = {
34       "0037", "1000", "3703", "3704"};
35   //static final private String[] attachmentsParsed = {
36
// "application/msword"};
37
static final private int propertiesNumber = propertiesParsed.length;
38   //static final private int attachmentsNumber = attachmentsParsed.length;
39
static final private String JavaDoc ATTACH_MIME = "370E";
40   static final private String JavaDoc ATTACH_DATA = "3701";
41   static final private String JavaDoc ASCII = "001E";
42   static final private String JavaDoc UNICODE = "001F";
43   static final private String JavaDoc TITLE = "0037";
44   static final private String JavaDoc NULL_STRING = "";
45   static final private String JavaDoc NAMEBG = "__substg1.";
46   static final private char[] FORBID = {
47       (char) 0};
48   //static final private char [] FORBID= {};
49
private String JavaDoc fileName;
50   private String JavaDoc parsedText = NULL_STRING;
51   private boolean titleNotSet;
52   public String JavaDoc title;
53   private String JavaDoc properties = NULL_STRING;
54   public Map JavaDoc customproperties;
55
56   public void parse() {
57     try {
58       titleNotSet = true;
59       FileInputStream JavaDoc fis = new FileInputStream JavaDoc(fileName);
60       POIFSFileSystem fs;
61       fs = new POIFSFileSystem(fis);
62       DirectoryEntry root = fs.getRoot();
63       DirectoryEntry dir = root;
64       parsedText = directoryParse(dir);
65       dir = null;
66       root = null;
67       fs = null;
68       fis.close();
69       fis = null;
70     }
71     catch (Exception JavaDoc e) {
72         ParserManager.logger.debug("*** File could not be parsed within " + fileName);
73      // e.printStackTrace();
74
}
75
76   }
77
78   private String JavaDoc parseStream(InputStream JavaDoc stream) throws java.io.IOException JavaDoc {
79     POIFSFileSystem fs;
80     fs = new POIFSFileSystem(stream);
81     DirectoryEntry root = fs.getRoot();
82     DirectoryEntry dir = root;
83     return directoryParse(dir);
84   }
85
86   private String JavaDoc directoryParse(DirectoryEntry dir) throws java.io.IOException JavaDoc {
87     String JavaDoc attachMime = NULL_STRING;
88     String JavaDoc textFound = NULL_STRING;
89     for (Iterator JavaDoc iter = dir.getEntries(); iter.hasNext(); ) {
90       Entry entry = (Entry) iter.next();
91       String JavaDoc entryName = entry.getName();
92
93       if (entry instanceof DirectoryEntry) {
94         //System.out.println("DEN: " + entryName);
95
// .. recurse into this directory
96
textFound = textFound + directoryParse( (DirectoryEntry) entry);
97       }
98       else if (entry instanceof DocumentEntry) {
99         // entry is a document, which you can read
100
String JavaDoc property = NULL_STRING;
101         String JavaDoc type = NULL_STRING;
102         if (entryName.startsWith(NAMEBG)) {
103           property = entryName.substring(12, 16);
104           type = entryName.substring(16, 20);
105         }
106
107         //System.out.println("FEN: " + entryName + " " + property + " " + type);
108
/*
109                  String x=readDoc( (DocumentNode,type) entry);
110                  System.out.println("CON: " + x);
111          */

112         for (int i = 0; i < propertiesNumber; i++) {
113           if (property.equals(propertiesParsed[i])) {
114             String JavaDoc text = readDoc( (DocumentNode) entry, type);
115             if (property.equals(TITLE) && titleNotSet) {
116               title = TextFilter.filterForbiddenCharacters(text, FORBID);
117               titleNotSet = false;
118             }
119             textFound = textFound + text + " ";
120             // System.out.println("CON: " + text);
121
break;
122           }
123           else {
124             if (property.equals(ATTACH_MIME)) {
125               attachMime = readDoc( (DocumentNode) entry, type).trim();
126               //System.out.println("ATT: " + attachMime);
127
break;
128             }
129             else if (property.equals(ATTACH_DATA)) {
130               String JavaDoc attachText = parseAttachment( (DocumentNode) entry,
131                                                   attachMime);
132               textFound = textFound + attachText;
133               //System.out.println("ATD: " + attachText + " - " + attachMime);
134
break;
135             }
136           }
137         }
138       }
139       else {
140         // currently, either an Entry is a DirectoryEntry or a DocumentEntry,
141
// but in the future, there may be other entry subinterfaces. The
142
// internal data structure certainly allows for a lot more entry types.
143
}
144     }
145     return TextFilter.filterForbiddenCharacters(textFound, FORBID);
146   }
147
148   private String JavaDoc readDoc(DocumentNode document, String JavaDoc type) throws java.io.
149
JavaDoc      IOException {
150     int size = document.getSize();
151     DocumentInputStream stream = new DocumentInputStream(
152         document);
153     byte[] buf = new byte[size];
154     int nRead = stream.read(buf);
155     /*
156          for(int i=0;i<nRead;i++){
157       System.out.println("CHR: "+i+" "+buf[i] );
158          }
159      */

160     //System.out.println("TYP: "+type+" "+ASCII);
161
if (type.equals(ASCII)) {
162       return new String JavaDoc(buf);
163     }
164     else if (type.equals(UNICODE)) {
165       return new String JavaDoc(buf, "UTF-16LE");
166     }
167     else {
168       return NULL_STRING;
169     }
170
171   }
172
173   private String JavaDoc parseAttachment(DocumentNode doc, String JavaDoc attachMime) throws
174       java.io.IOException JavaDoc {
175     DocumentInputStream stream = new DocumentInputStream(doc);
176     if (attachMime.equals("application/msword")) {
177       //System.out.println("PRW");
178
return parseWord(stream);
179     }
180     if (attachMime.equals("application/vnd.ms-excel")) {
181         //System.out.println("PRW");
182
return parseExcel(stream);
183       }
184     else if (attachMime.equals("application/octet-stream")) {
185       //System.out.println("PRO");
186
return parseStream(stream);
187     }
188     else if (attachMime.equals("application/pdf")) {
189       //System.out.println("PRO");
190
return parsePdf(stream);
191     }else if (attachMime.equals("application/vnd.ms-powerpoint")) {
192       //System.out.println("PWP");
193
PowerParser powerParser=new PowerParser();
194       return powerParser.parse(stream);
195     }
196     return NULL_STRING;
197   }
198   
199   private String JavaDoc parseExcel(InputStream JavaDoc stream) throws java.io.IOException JavaDoc {
200       //DocumentInputStream stream = new DocumentInputStream(doc);
201
try {
202         ExcelParser ep = new ExcelParser();
203         ep.parse(stream);
204         return ep.getParsedText();
205   
206       }
207       catch (Exception JavaDoc e) {
208           ParserManager.logger.debug("*** File could not be parsed within " + fileName);
209         return NULL_STRING;
210       }
211     }
212
213   private String JavaDoc parseWord(InputStream JavaDoc stream) throws java.io.IOException JavaDoc {
214     //DocumentInputStream stream = new DocumentInputStream(doc);
215
try {
216       return new WordExtractor().extractText(stream);
217     }
218     catch (Exception JavaDoc e) {
219         ParserManager.logger.debug("*** File could not be parsed within " + fileName);
220       return NULL_STRING;
221     }
222   }
223
224   private String JavaDoc parsePdf(InputStream JavaDoc stream) throws java.io.IOException JavaDoc {
225     try {
226       PDDocument document = null;
227       String JavaDoc text = null;
228       document = PDDocument.load( (InputStream JavaDoc) stream);
229       PDFTextStripper stripper = new PDFTextStripper();
230       String JavaDoc pdfFile = null;
231       String JavaDoc textFile = null;
232       int startPage = 1;
233       int endPage = Integer.MAX_VALUE;
234       if (!document.isEncrypted()) {
235         stripper.setStartPage(startPage);
236         stripper.setEndPage(endPage);
237         text = stripper.getText(document);
238       }
239       document.close();
240       return text;
241     }
242     catch (Exception JavaDoc e) {
243         ParserManager.logger.debug("*** File could not be parsed within " + fileName);
244       return NULL_STRING;
245     }
246   }
247
248   public void setFileName(String JavaDoc fileName) {
249     this.fileName = fileName;
250   }
251
252   public String JavaDoc getParsedText() {
253     return parsedText;
254   }
255
256   public String JavaDoc getTitle() {
257     return title;
258   }
259
260   public String JavaDoc getProperties() {
261     return properties;
262   }
263   
264   public String JavaDoc parse(InputStream JavaDoc is) throws java.io.IOException JavaDoc{ return "";}
265
266 }
Popular Tags