KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > SnowMailClient > html > HTMLTextExtractor


1 package SnowMailClient.html;
2
3 import snow.utils.storage.*;
4
5 import java.text.*;
6 import javax.swing.text.*;
7 import javax.swing.text.html.*;
8 import javax.swing.text.html.parser.*;
9 import java.io.*;
10 import java.util.*;
11
/**
 * Extracts the plain text of an HTML page using the HTML parser embedded in
 * the JDK ({@link ParserDelegator}), with this class acting as the
 * {@link HTMLEditorKit.ParserCallback} handler.
 *
 * <p>While parsing, the handler also collects the image sources, the
 * {@code <a href>} targets, the font faces and the names of all tags it has
 * no explicit handling for.
 *
 * <p>Line breaks are not written immediately: block-level tags only count a
 * pending break in {@code newLines}; the breaks (at most two in a row) and the
 * current indentation are written right before the next piece of output.
 */
public final class HTMLTextExtractor extends HTMLEditorKit.ParserCallback
{
  /** Line separator written into the extracted text. */
  private static final String LINE_BREAK = "\r\n";

  /** Upper bound of consecutive line breaks written for any number of pending ones. */
  private static final int MAX_LINE_BREAKS = 2;

  /** Indentation (in spaces) added by a list ({@code <ul>}, {@code <ol>}). */
  private static final int LIST_INDENT = 5;

  /** Indentation (in spaces) added by a {@code <blockquote>}. */
  private static final int BLOCKQUOTE_INDENT = 5;

  /** Indentation (in spaces) added by a {@code <center>}. */
  private static final int CENTER_INDENT = 10;

  /** File parsed by {@link #main(String[])} when no path is given on the command line. */
  private static final String DEFAULT_TEST_FILE =
      "C:/sources/Schmortopf_IDE/Internet/Schmortopf/versionhistory.htm";

  // the text only
  private final StringBuilder textOnly = new StringBuilder();

  // when true, " [image src] " and " [link href] " markers are written into the text
  boolean includePicturesAndReferences = false;

  private final Vector<String> unknownTags = new Vector<String>();
  private final Vector<String> aHrefs = new Vector<String>();
  private final Vector<String> images = new Vector<String>();
  private final Vector<String> fonts = new Vector<String>();

  // NOTE: nothing in this class adds to the three lists below; their getters
  // always return empty vectors.
  private final Vector<String> scriptsTagInfos = new Vector<String>();
  private final Vector<String> metaTagInfos = new Vector<String>();
  private final Vector<String> linkTagInfos = new Vector<String>();

  // internal data

  // number of line breaks requested since the last output
  int newLines = 0;
  // current indentation in spaces, written after a line break; never negative
  int indent = 0;


  /**
   * Parses the given HTML content; the results are available from the getters
   * as soon as the constructor returns.
   *
   * @param cont the HTML source to parse; {@code null} is treated as an empty document
   * @param includePicturesAndReferences whether " [image ...] " and " [link ...] "
   *     markers are written into the extracted text. According to the original
   *     author this should be false when parsing words for spam, because the
   *     pictures and references are stored separately and are read through
   *     {@link #getImageSrcs()} and {@link #getLinksHREFs()}.
   * @throws Exception if the parser fails
   */
  public HTMLTextExtractor(String cont, boolean includePicturesAndReferences) throws Exception
  {
    this.includePicturesAndReferences = includePicturesAndReferences;
    ParserDelegator pd = new ParserDelegator();

    StringReader r = new StringReader(cont == null ? "" : cont);
    try
    {
      // third argument (ignoreCharSet) = true: don't stop when the charset changes.
      // Original author's note: almost every page crashes the parser if false.
      pd.parse(r, this, true);
    }
    finally
    {
      r.close();
    }
  } // Constructor


  //
  // Parse results
  //

  /** @return the extracted text */
  public String getTextOnly() { return textOnly.toString(); }

  /** @return the names of the tags that have no explicit handling (live internal list, not a copy) */
  public Vector<String> getUnknownTags() { return unknownTags; }

  /** @return the a href references (live internal list, not a copy) */
  public Vector<String> getLinksHREFs() { return aHrefs; }

  /** @return the img src names (live internal list, not a copy) */
  public Vector<String> getImageSrcs() { return images; }

  /** @return the face attributes of the font tags (live internal list, not a copy) */
  public Vector<String> getFontFaces() { return fonts; }

  /** @return the script tag infos; never filled by this class */
  public Vector<String> getScriptTagInfos() { return scriptsTagInfos; }

  /** @return the meta tag infos; never filled by this class */
  public Vector<String> getMetaTagInfos() { return metaTagInfos; }

  /** @return the link tag infos; never filled by this class */
  public Vector<String> getLinkTagInfos() { return linkTagInfos; }


  //
  // Parser's handles...
  //

  /** Appends the text, preceded by the pending line breaks and the indentation. */
  @Override
  public void handleText(char[] data, int pos)
  {
    flushPendingBreaks();
    textOnly.append(data);
  }

  /** Comments are ignored. */
  @Override
  public void handleComment(char[] data, int pos)
  {
  }

  /** Ignored; the original author observed this call only at the end of the file. */
  @Override
  public void handleEndOfLineString(String eol)
  {
  }

  /** Parser errors are ignored; the original author notes that a lot of them are reported. */
  @Override
  public void handleError(String errorMsg, int pos)
  {
  }

  /**
   * Handles tags reported without an end tag: {@code <br>} and {@code <p>}
   * request a line break, {@code <img>} is recorded, {@code <meta>},
   * {@code <link>}, {@code <hr>} and "tbody" are ignored, and everything else
   * is recorded as an unknown tag.
   */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
  {
    if(t==HTML.Tag.BR || t==HTML.Tag.P)
    {
      newLines++;
    }
    else if(t==HTML.Tag.META || t==HTML.Tag.LINK || t==HTML.Tag.HR)
    {
      // ignore
    }
    else if(t==HTML.Tag.IMG)
    {
      String src = attributeValue(a, HTML.Attribute.SRC);
      if(src!=null)
      {
        images.add(src);
        if(includePicturesAndReferences)
        {
          // write the pending breaks first, otherwise the marker would end up
          // at the end of the previous line
          flushPendingBreaks();
          textOnly.append(" [image "+src+"] ");
        }
      }
    }
    else if(!t.toString().equals("tbody"))
    {
      // unknown tags...
      unknownTags.add(""+t);
    }
  }


  /**
   * Handles an opening tag: block-level tags request line breaks, lists,
   * blockquotes and centered blocks increase the indentation, table cells
   * write a tab, links and font faces are recorded, pure formatting tags are
   * ignored and everything else is recorded as an unknown tag.
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
  {
    if(t==HTML.Tag.P || t==HTML.Tag.LI)
    {
      newLines++;
    }
    else if(t==HTML.Tag.UL || t==HTML.Tag.OL)
    {
      indent += LIST_INDENT;
    }
    else if(t==HTML.Tag.CENTER)
    {
      newLines++;
      indent += CENTER_INDENT;
    }
    else if(t==HTML.Tag.BLOCKQUOTE)
    {
      newLines++;
      indent += BLOCKQUOTE_INDENT;
    }
    else if(t==HTML.Tag.SCRIPT)
    {
      // ignore
    }
    else if(t==HTML.Tag.TITLE || t==HTML.Tag.H1 || t==HTML.Tag.H2 || t==HTML.Tag.H3
         || t==HTML.Tag.H4 || t==HTML.Tag.H5 || t==HTML.Tag.H6
         || t==HTML.Tag.TABLE
         || t==HTML.Tag.TR )
    {
      newLines++;
    }
    else if(t==HTML.Tag.TD)
    {
      // the row's line break must come before the cell separator, not after it
      flushPendingBreaks();
      textOnly.append("\t");
    }
    else if(t==HTML.Tag.A)
    {
      String ref = attributeValue(a, HTML.Attribute.HREF);
      if(ref!=null)
      {
        aHrefs.add(ref);
        if(includePicturesAndReferences)
        {
          flushPendingBreaks();
          textOnly.append(" [link "+ref+"] ");
        }
      }
    }
    else if(t==HTML.Tag.FONT)
    {
      String face = attributeValue(a, HTML.Attribute.FACE);
      if(face!=null)
      {
        fonts.add(face);
      }
    }
    else if(t==HTML.Tag.B || t==HTML.Tag.I || t==HTML.Tag.U
         || t==HTML.Tag.BODY || t==HTML.Tag.PRE
         || t==HTML.Tag.HTML || t==HTML.Tag.HEAD
         || t==HTML.Tag.SUP || t==HTML.Tag.SUB
         || t==HTML.Tag.CODE )
    {
      // just ignore
    }
    else
    {
      unknownTags.add(""+t);
    }
  }

  /**
   * Handles a closing tag: block-level tags request a line break, lists,
   * blockquotes and centered blocks give their indentation back, formatting
   * and table row/cell tags are ignored and everything else is recorded as an
   * unknown tag.
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos)
  {
    if(t==HTML.Tag.TITLE || t==HTML.Tag.H1 || t==HTML.Tag.H2 || t==HTML.Tag.H3
       || t==HTML.Tag.H4 || t==HTML.Tag.H5 || t==HTML.Tag.H6
       || t==HTML.Tag.P || t==HTML.Tag.PRE || t==HTML.Tag.TABLE )
    {
      newLines++;
    }
    else if(t==HTML.Tag.UL || t==HTML.Tag.OL)
    {
      decreaseIndent(LIST_INDENT);
      newLines++;
    }
    else if(t==HTML.Tag.BLOCKQUOTE)
    {
      decreaseIndent(BLOCKQUOTE_INDENT);
      newLines++;
    }
    else if(t==HTML.Tag.CENTER)
    {
      decreaseIndent(CENTER_INDENT);
      newLines++;
    }
    else if(t==HTML.Tag.TR || t==HTML.Tag.TD
       || t==HTML.Tag.LI || t==HTML.Tag.A || t==HTML.Tag.HEAD
       || t==HTML.Tag.B || t==HTML.Tag.I || t==HTML.Tag.U
       || t==HTML.Tag.BODY || t==HTML.Tag.HTML
       || t==HTML.Tag.FONT || t==HTML.Tag.BASEFONT
       || t==HTML.Tag.SUP || t==HTML.Tag.SUB
       || t==HTML.Tag.CODE)
    {
      // ignore
    }
    else
    {
      unknownTags.add(""+t);
    }
  }


  //
  // Helpers
  //

  /**
   * Writes the pending line breaks (at most {@link #MAX_LINE_BREAKS}) followed
   * by the current indentation and resets the pending counter. Does nothing
   * when no break is pending.
   */
  private void flushPendingBreaks()
  {
    if(newLines<=0)
    {
      return;
    }
    int breaks = Math.min(newLines, MAX_LINE_BREAKS);
    for(int i=0; i<breaks; i++)
    {
      textOnly.append(LINE_BREAK);
    }
    for(int i=0; i<indent; i++)
    {
      textOnly.append(' ');
    }
    newLines = 0;
  }

  /**
   * Reduces the indentation without letting it become negative, so that an
   * end tag without matching start tag cannot cancel the indentation of a
   * later, well-formed block.
   */
  private void decreaseIndent(int amount)
  {
    indent = Math.max(0, indent - amount);
  }

  /** Returns the attribute value as a string, or null when the attribute is absent. */
  private static String attributeValue(AttributeSet attributes, Object key)
  {
    Object value = attributes.getAttribute(key);
    return value == null ? null : value.toString();
  }

  /** Reads the whole file into a string, decoding it with the platform default charset. */
  private static String readFile(File file) throws IOException
  {
    FileReader in = new FileReader(file);
    try
    {
      StringBuilder content = new StringBuilder();
      char[] buffer = new char[4096];
      int read;
      while((read = in.read(buffer)) != -1)
      {
        content.append(buffer, 0, read);
      }
      return content.toString();
    }
    finally
    {
      in.close();
    }
  }


  /**
   * Manual test: extracts and prints the text of an HTML file.
   *
   * @param a optional; {@code a[0]} is the path of the HTML file to parse.
   *     Without arguments the historical hard-coded test file is used.
   */
  public static void main(String[] a)
  {
    String path = (a!=null && a.length>0) ? a[0] : DEFAULT_TEST_FILE;
    try
    {
      String cont = readFile(new File(path));
      HTMLTextExtractor t = new HTMLTextExtractor(cont, true);
      System.out.println("\n========== TEXT. ===========\n"+t.getTextOnly());
    }
    catch(Exception e)
    {
      e.printStackTrace();
    }
  }
} // HTMLTextExtractor
Popular Tags