HTMLTextExtractor


1   package SnowMailClient.html;
2   
3   import snow.utils.storage.*;
4   
5   import java.text.*;
6   import javax.swing.text.*;
7   import javax.swing.text.html.*;
8   import javax.swing.text.html.parser.*;
9   import java.io.*;
10  import java.util.*;
11  
12  /** Use java embedded HTML parser with our handler implementation HTMLEditorKit.ParserCallback
13      to extract the text of a html page,
14      also make a list of image references,
15      hrefs, font faces and unknown tags.
16  
17  */
18  public final class HTMLTextExtractor extends HTMLEditorKit.ParserCallback
19  {
20    // the text only
21    private StringBuffer   textOnly = new StringBuffer  ();
22    boolean includePicturesAndReferences = false;
23  
24    private Vector<String  > unknownTags = new Vector<String  >();
25    private Vector<String  > aHrefs = new Vector<String  >();
26    private Vector<String  > images = new Vector<String  >();
27    private Vector<String  > fonts = new Vector<String  >();
28  
29    private Vector<String  > scriptsTagInfos = new Vector<String  >();
30    private Vector<String  > metaTagInfos = new Vector<String  >();
31    private Vector<String  > linkTagInfos = new Vector<String  >();
32  
33    // internal data
34  
35    int newLines = 0;
36    int indent = 0;
37  
38  
39    /** @param includePicturesAndReferences should be false
40       when parsing words for spam, because the pics and refs are put in the
41       words DB with semantics (url*...) and get from this class with getLinks...
42    */
43    public HTMLTextExtractor(String   cont, boolean includePicturesAndReferences) throws Exception  
44    {
45        this.includePicturesAndReferences = includePicturesAndReferences;
46        ParserDelegator pd = new ParserDelegator();
47  
48        StringReader r = new StringReader(cont);
49        pd.parse(r,
50          this,
51          true);   // don't stop when charset changes !!!  almost every pages cause parser crash if false
52  
53    } // Constructor
54  
55    // Parse results
56    //
57  
58    public String   getTextOnly() { return textOnly.toString(); }
59    public Vector<String  > getUnknownTags()  { return unknownTags; }
60  
61    /** the a href references
62    */
63    public Vector<String  > getLinksHREFs()    { return aHrefs; }
64  
65    /** the img src names
66    */
67    public Vector<String  > getImageSrcs()   { return images; }
68    public Vector<String  > getFontFaces()   { return fonts; }
69  
70    public Vector<String  > getScriptTagInfos() { return scriptsTagInfos; }
71    public Vector<String  > getMetaTagInfos() { return metaTagInfos; }
72    public Vector<String  > getLinkTagInfos() { return linkTagInfos; }
73  
74  
75    // Parser's handles...
76    //
77  
78    public void handleText(char[] data, int pos)
79    {
80      if(newLines>0)
81      {
82        // maximum two new lines
83        for(int i=0; i<newLines; i++)
84        {
85           textOnly.append("\r\n");
86           if(i==1) break;
87        }
88  
89        // indentation
90        for(int i=0; i<indent; i++)
91        {
92          textOnly.append(" ");
93        }
94        newLines=0;
95      }
96      textOnly.append(new String  (data));
97    }
98  
99    public void handleComment(char[] data, int pos)
100   {
101     //System.out.println("COMMENT "+new String(data));
102   }
103 
104   public void handleEndOfLineString(String   eol)
105   {
106     // seems to happend only at the end of the file
107     //textOnly.append("EOL");
108   }
109 
110   public void handleError(String   errorMsg, int pos)
111   {
112     // a lot of errors are encountered
113     //textOnly.append(" [Error "+errorMsg+"] ");
114   }
115 
116   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
117   {
118     if(t==HTML.Tag.BR || t==HTML.Tag.P)
119     {
120       //textOnly.append("\r\n");
121       newLines++;
122     }
123     else if(t==HTML.Tag.META)
124     {
125       //System.out.println("META");
126     }
127     else if(t==HTML.Tag.LINK)
128     {
129       //ignore
130     }
131     else if (t==HTML.Tag.HR)
132     {
133       //ignore ### horizontal rule
134     }
135     else if(t==HTML.Tag.IMG)
136     {
137       String   src = (String  ) a.getAttribute(HTML.Attribute.SRC);
138       if(src!=null)
139       {
140         images.add(src);
141         if(includePicturesAndReferences)
142         {
143           textOnly.append(" [image "+src+"] ");
144         }
145       }
146     }
147     else
148     {
149       // unknown tags...
150       if(t.toString().equals("tbody"))
151       {
152         // ignore
153       }
154       else
155       {
156         //System.out.println("Unknown simple Tag "+t);
157         unknownTags.add(""+t);
158       }
159     }
160   }
161 
162 
163 
164   public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
165   {
166     //System.out.println(""+t);
167 
168     if(t==HTML.Tag.P)
169     {
170       newLines++;
171     }
172     else if(t==HTML.Tag.UL || t==HTML.Tag.OL)
173     {
174       indent += 5;
175     }
176     else if(t==HTML.Tag.LI)
177     {
178       newLines++;
179     }
180     else if(t==HTML.Tag.CENTER)
181     {
182       newLines++;
183       indent+=10;
184     }
185     else if(t==HTML.Tag.BLOCKQUOTE)
186     {
187       newLines++;
188       indent+=5;
189     }
190     else if(t==HTML.Tag.SCRIPT)
191     {
192       //System.out.println("SCRIPT ! ");
193     }
194     else if(t==HTML.Tag.TITLE || t==HTML.Tag.H1 || t==HTML.Tag.H2 || t==HTML.Tag.H3
195          || t==HTML.Tag.H4 || t==HTML.Tag.H5 || t==HTML.Tag.H6
196          || t==HTML.Tag.TABLE
197          || t==HTML.Tag.TR    )
198     {
199       newLines++;
200     }
201     else if(t==HTML.Tag.TD)
202     {
203       textOnly.append("\t");
204     }
205     else if(t==HTML.Tag.A)
206     {
207       String   ref = (String  ) a.getAttribute(HTML.Attribute.HREF);
208       if(ref!=null)
209       {
210         this.aHrefs.add(ref);
211         if(includePicturesAndReferences)
212         {
213           textOnly.append(" [link "+ref+"] ");
214         }
215       }
216     }
217     else if(t==HTML.Tag.FONT)
218     {
219       String   face = (String  ) a.getAttribute(HTML.Attribute.FACE);
220       if(face!=null)
221       {
222         this.fonts.add(face);
223       }
224     }
225     else if(t==HTML.Tag.B    || t==HTML.Tag.I || t==HTML.Tag.U
226          || t==HTML.Tag.BODY || t==HTML.Tag.PRE
227          || t==HTML.Tag.HTML || t==HTML.Tag.HEAD
228          || t==HTML.Tag.SUP  || t==HTML.Tag.SUB
229          || t==HTML.Tag.CODE )
230     {
231       // just ignore
232     }
233     else
234     {
235       //System.out.println("Unknown start tag "+t);
236       unknownTags.add(""+t);
237     }
238 
239     //if(t.breaksFlow()) textOnly.append("\r\n");
240   }
241 
242   public void handleEndTag(HTML.Tag t, int pos)
243   {
244     if(t==HTML.Tag.TITLE || t==HTML.Tag.H1 || t==HTML.Tag.H2 || t==HTML.Tag.H3
245        || t==HTML.Tag.H4 || t==HTML.Tag.H5 || t==HTML.Tag.H6)
246     {
247       // new line
248       newLines ++;
249       //textOnly.append("\r\n");
250     }
251     else if(t==HTML.Tag.UL || t==HTML.Tag.OL)
252     {
253       indent -= 5;
254       newLines ++;
255     }
256     else if(t==HTML.Tag.BLOCKQUOTE)
257     {
258       indent -= 5;
259       newLines ++;
260     }
261     else if(t==HTML.Tag.CENTER)
262     {
263       indent -= 10;
264       newLines ++;
265     }
266     else if(t==HTML.Tag.P
267         || t==HTML.Tag.UL || t==HTML.Tag.OL
268         || t==HTML.Tag.PRE || t==HTML.Tag.TABLE    )
269     {
270       newLines ++;
271     }
272     else if(t==HTML.Tag.TR || t==HTML.Tag.TD)
273     {
274       // ignore
275     }
276     else if(t==HTML.Tag.LI || t==HTML.Tag.A || t==HTML.Tag.HEAD
277        || t==HTML.Tag.B || t==HTML.Tag.I || t==HTML.Tag.U
278        || t==HTML.Tag.BODY || t==HTML.Tag.HTML
279        || t==HTML.Tag.FONT || t==HTML.Tag.BASEFONT
280        || t==HTML.Tag.SUP || t==HTML.Tag.SUB
281        || t==HTML.Tag.CODE)
282     {
283       // ignore
284     }
285     else
286     {
287       //System.out.println("Unknown end tag "+t);
288       unknownTags.add(""+t);
289     }
290 
291     //if(t.breaksFlow()) textOnly.append("\r\n");
292   }
293 
294 
295 
296   public static void main(String  [] a)
297   {
298     try
299     {
300       //String cont = new String(FileUtils.getFileContent(new File("c:/data/test.htm")));
301       String   cont = new String  (FileUtils.getFileContent(new File("C:/sources/Schmortopf_IDE/Internet/Schmortopf/versionhistory.htm")));
302       //String cont = new String(FileUtils.getFileContent(new File("c:/proj/test.htm")));
303       HTMLTextExtractor t = new  HTMLTextExtractor(cont, true);
304       System.out.println("\n========== TEXT. ===========\n"+t.getTextOnly());
305     }
306     catch(Exception   e)
307     {
308       e.printStackTrace();
309     }
310 
311   }
312 }
313  // HTMLTextExtractor
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags