KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > SnowMailClient > html > HTMLCleaner


1 package SnowMailClient.html;
2
3 import snow.utils.storage.*;
4 import SnowMailClient.utils.StringUtils;
5
6 import java.text.*;
7 import javax.swing.text.*;
8 import javax.swing.text.html.*;
9 import javax.swing.text.html.parser.*;
10 import java.io.*;
11 import java.util.*;
12
13 /** Clean an HTML source...
14
15     We use HTMLEditorKit.ParserCallback to parse and rewrite the whole html
16
17       0) remove unknown tags
18       1) remove <SCRIPT> ... </SCRIPT>
19       2) remove <LINK ...>
20       3) remove <META ...>
21       4) remove comments <! .. >
22       5) remove some attributes, as Colors, background, ...
23
24       We have so a clear structure of the tables, images and text, black on white.
25 */

26 public final class HTMLCleaner extends HTMLEditorKit.ParserCallback
27 {
28   // the clean HTML text recomposed
29
private StringBuffer JavaDoc cleanHTMLText = new StringBuffer JavaDoc();
30
31   private Object JavaDoc[] authorizedTags = new Object JavaDoc[]{ // Tag OR String !
32
HTML.Tag.HTML, HTML.Tag.HEAD, HTML.Tag.TITLE, HTML.Tag.BODY,
33      HTML.Tag.H1, HTML.Tag.H2, HTML.Tag.H3, HTML.Tag.H4, HTML.Tag.H5, HTML.Tag.H6,
34      HTML.Tag.BR, HTML.Tag.P, HTML.Tag.CENTER,
35      HTML.Tag.PRE, HTML.Tag.TEXTAREA, HTML.Tag.BLOCKQUOTE,
36      HTML.Tag.HR,
37      HTML.Tag.B, HTML.Tag.I, HTML.Tag.U, HTML.Tag.EM, HTML.Tag.SUB, HTML.Tag.SUP, HTML.Tag.STRIKE, HTML.Tag.STRONG,
38      HTML.Tag.UL, HTML.Tag.LI, HTML.Tag.OL, HTML.Tag.DL, HTML.Tag.DT, HTML.Tag.DD, // lists
39
//HTML.Tag.FONT,
40
//HTML.Tag.BASEFONT,
41
HTML.Tag.TABLE, HTML.Tag.TD, HTML.Tag.TR, HTML.Tag.TH, HTML.Tag.TT, // tables
42
HTML.Tag.A,
43      HTML.Tag.IMG, // treated manually in handleSingleTag
44
HTML.Tag.ADDRESS,
45      HTML.Tag.AREA,
46      HTML.Tag.CITE,
47      //HTML.Tag.NOFRAMES, HTML.Tag.FRAME, HTML.Tag.FRAMESET,
48
//HTML.Tag.MAP,
49
//HTML.Tag.OBJECT
50
//HTML.Tag.MENU,
51
//HTML.Tag.META,
52
//HTML.Tag.INPUT,
53
//HTML.Tag.IMPLIED,
54
//HTML.Tag.OPTION,
55
//HTML.Tag.PARAM,
56
//HTML.Tag.APPLET,
57
//HTML.Tag.STYLE
58
};
59
60   private Object JavaDoc[] tagsToIgnoreContent = new Object JavaDoc[]{
61      HTML.Tag.COMMENT,
62      HTML.Tag.SCRIPT,
63      HTML.Tag.OBJECT,
64      HTML.Tag.APPLET
65   };
66
67
68   private Object JavaDoc[] forbiddenAttributes = new Object JavaDoc[]
69   {
70     "background", // background images
71
"color", "bgcolor",
72     "text", "vlink", "alink", "link" // colors defined in the header
73
};
74
75
76
77   int ignoreContentDepth = 0; // when >0, the content is ignored. Start/end Tags iterates this counter
78

79   private TagComparator tagComparator = new TagComparator();
80   private boolean excludeImages;
81
82   public HTMLCleaner(String JavaDoc cont, boolean excludeImages) throws Exception JavaDoc
83   {
84       this.excludeImages = excludeImages;
85       // for quick binary search
86
Arrays.sort(authorizedTags, tagComparator);
87       Arrays.sort(forbiddenAttributes, tagComparator);
88       Arrays.sort(tagsToIgnoreContent, tagComparator);
89
90
91       ParserDelegator pd = new ParserDelegator();
92       StringReader r = new StringReader(cont);
93       pd.parse(r,
94         this,
95         true); // don't stop when charset changes !!! almost every pages cause parser crash if false
96

97   } // Constructor
98

99   public static String JavaDoc cleanHTML(String JavaDoc html, boolean excludeImages) throws Exception JavaDoc
100   {
101     HTMLCleaner hc = new HTMLCleaner(html, excludeImages);
102     return hc.getCleanHTMLText();
103   }
104
105   static class TagComparator implements Comparator<Object JavaDoc>
106   {
107     public int compare(Object JavaDoc o1, Object JavaDoc o2)
108     {
109        String JavaDoc tn1 = ""+o1; // instance of HTML.Tag or String
110
String JavaDoc tn2 = ""+o2;
111
112        return tn1.compareTo(tn2);
113     }
114   }
115
116   private boolean isAuthorizedTag(String JavaDoc t)
117   {
118      int pos = Arrays.binarySearch(authorizedTags, t, tagComparator);
119      if(pos<0) return false;
120      return true;
121   }
122
123   private boolean isAuthorizedAttribute(String JavaDoc t)
124   {
125      int pos = Arrays.binarySearch(forbiddenAttributes, t, tagComparator);
126      if(pos>=0) return false;
127      return true;
128   }
129
130   private boolean isIgnoreContentTag(String JavaDoc t)
131   {
132      int pos = Arrays.binarySearch(tagsToIgnoreContent, t, tagComparator);
133      if(pos>=0) return true;
134      return false;
135   }
136
137   // Parse results
138
//
139

140   public String JavaDoc getCleanHTMLText() { return cleanHTMLText.toString(); }
141
142
143   // Parser's handles...
144
//
145

146   public void handleText(char[] data, int pos)
147   {
148     if(ignoreContentDepth>0) return;
149
150     cleanHTMLText.append(new String JavaDoc(data));
151   }
152
153   public void handleComment(char[] data, int pos)
154   {
155     //System.out.println("COMMENT "+new String(data));
156
}
157
158   public void handleEndOfLineString(String JavaDoc eol)
159   {
160     // seems to happend only at the end of the file
161
//textOnly.append("EOL");
162
}
163
164   public void handleError(String JavaDoc errorMsg, int pos)
165   {
166     // a lot of errors are encountered
167
//textOnly.append(" [Error "+errorMsg+"] ");
168
}
169
170   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
171   {
172     if(t==HTML.Tag.IMG)
173     {
174       if(this.excludeImages)
175       {
176         String JavaDoc src = (String JavaDoc) a.getAttribute(HTML.Attribute.SRC);
177         if(StringUtils.startsWithIgnoresCaseAndBlanks(src, "cid:"))
178         {
179           // allow embedded images
180
this.cleanHTMLText.append(" <img SRC="+src+"> ");
181         }
182         else
183         {
184           this.cleanHTMLText.append(" [image "+src+"] ");
185         }
186         return;
187       }
188     }
189
190     //System.out.println(""+t);
191
if(this.isAuthorizedTag(t.toString()))
192     {
193       cleanHTMLText.append("\r\n<"+t+""+argumentsToString(a)+">");
194     }
195   }
196
197   public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
198   {
199     if(isIgnoreContentTag(t.toString()))
200     {
201       ignoreContentDepth++;
202     }
203     else if(this.isAuthorizedTag(t.toString()))
204     {
205       cleanHTMLText.append("\r\n<"+t+""+argumentsToString(a)+">");
206     }
207   }
208
209   public void handleEndTag(HTML.Tag t, int pos)
210   {
211     if(isIgnoreContentTag(t.toString()))
212     {
213       ignoreContentDepth--;
214     }
215     else if(this.isAuthorizedTag(t.toString()))
216     {
217       cleanHTMLText.append("\r\n</"+t+">");
218     }
219   }
220
221
222   // Utils
223
//
224

225   private String JavaDoc argumentsToString(MutableAttributeSet a)
226   {
227     Enumeration names = a.getAttributeNames();
228     StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
229     while(names.hasMoreElements())
230     {
231       Object JavaDoc ato = names.nextElement();
232
233       if(ato instanceof HTML.Attribute)
234       {
235          HTML.Attribute at = (HTML.Attribute) ato;
236          if(this.isAuthorizedAttribute(at.toString()))
237          {
238            String JavaDoc val = (String JavaDoc) a.getAttribute(at);
239
240            sb.append(" ");
241            sb.append( at.toString() );
242            sb.append( "=\"" );
243            sb.append( val );
244            sb.append( "\"" );
245          }
246          else
247          {
248            //System.out.println("not accepted attr "+at.toString());
249
}
250       }
251       else
252       {
253          // ###
254
//System.out.println("??? "+ato+" "+(String) a.getAttribute(ato));
255
}
256     }
257     return sb.toString();
258   }
259
260   // Test
261
//
262

263   public static void main(String JavaDoc[] a)
264   {
265     try
266     {
267       //String cont = new String(FileUtils.getFileContent(new File("c:/data/test.htm")));
268
//String cont = new String(FileUtils.getFileContent(new File("C:/sources/Schmortopf_IDE/Internet/Schmortopf/versionhistory.htm")));
269
String JavaDoc cont = new String JavaDoc(FileUtils.getFileContent(new File("C:/sources/other/mail/client/www.snowraver.org/java/SnowMail/main.htm")));
270       //String cont = new String(FileUtils.getFileContent(new File("c:/proj/test.htm")));
271
HTMLCleaner t = new HTMLCleaner(cont, false);
272       System.out.println("\n========== TEXT ===========\n"+t.getCleanHTMLText());
273       FileUtils.saveToFile(t.getCleanHTMLText().getBytes(), new File("C:/sources/other/mail/client/www.snowraver.org/java/SnowMail/aaa.htm"));
274     }
275     catch(Exception JavaDoc e)
276     {
277       e.printStackTrace();
278     }
279   }
280 }
Popular Tags