package SnowMailClient.html;

import snow.utils.storage.*;
import SnowMailClient.utils.StringUtils;

import java.text.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import java.io.*;
import java.util.*;

/**
 * Parser callback that rebuilds an HTML document keeping only a whitelist of tags.
 *
 * <p>The input is parsed with {@link ParserDelegator}; while parsing, this callback
 * appends to an internal buffer:
 * <ul>
 *   <li>start, end and simple tags whose name is listed in {@code AUTHORIZED_TAGS},
 *       together with their attributes except those listed in
 *       {@code FORBIDDEN_ATTRIBUTES};</li>
 *   <li>text content, re-escaped so that it cannot be read back as markup;</li>
 *   <li>nothing at all for whatever is nested inside a tag listed in
 *       {@code TAGS_TO_IGNORE_CONTENT}.</li>
 * </ul>
 *
 * <p>When {@code excludeImages} is set, an {@code IMG} whose source starts with
 * {@code cid:} is kept as a minimal {@code <img>} tag and every other image is
 * replaced by a textual {@code [image src]} placeholder.
 *
 * <p>Instances hold mutable parsing state and are used for exactly one document:
 * the constructor performs the parsing, {@link #getCleanHTMLText()} returns the result.
 */
public final class HTMLCleaner extends HTMLEditorKit.ParserCallback
{
  /** Orders tags / attribute names by their string form; shared by all lookup tables. */
  private static final TagComparator TAG_COMPARATOR = new TagComparator();

  /** Tags that are copied to the output (sorted with {@link #TAG_COMPARATOR}). */
  private static final Object[] AUTHORIZED_TAGS = sorted(new Object[]{
    HTML.Tag.HTML, HTML.Tag.HEAD, HTML.Tag.TITLE, HTML.Tag.BODY,
    HTML.Tag.H1, HTML.Tag.H2, HTML.Tag.H3, HTML.Tag.H4, HTML.Tag.H5, HTML.Tag.H6,
    HTML.Tag.BR, HTML.Tag.P, HTML.Tag.CENTER,
    HTML.Tag.PRE, HTML.Tag.TEXTAREA, HTML.Tag.BLOCKQUOTE,
    HTML.Tag.HR,
    HTML.Tag.B, HTML.Tag.I, HTML.Tag.U, HTML.Tag.EM, HTML.Tag.SUB, HTML.Tag.SUP,
    HTML.Tag.STRIKE, HTML.Tag.STRONG,
    HTML.Tag.UL, HTML.Tag.LI, HTML.Tag.OL, HTML.Tag.DL, HTML.Tag.DT, HTML.Tag.DD,
    HTML.Tag.TABLE, HTML.Tag.TD, HTML.Tag.TR, HTML.Tag.TH, HTML.Tag.TT, HTML.Tag.A,
    HTML.Tag.IMG, HTML.Tag.ADDRESS,
    HTML.Tag.AREA,
    HTML.Tag.CITE,
  });

  /** Tags whose whole content is dropped (sorted with {@link #TAG_COMPARATOR}). */
  private static final Object[] TAGS_TO_IGNORE_CONTENT = sorted(new Object[]{
    HTML.Tag.COMMENT,
    HTML.Tag.SCRIPT,
    HTML.Tag.OBJECT,
    HTML.Tag.APPLET
  });

  /** Attribute names that are never copied (sorted with {@link #TAG_COMPARATOR}). */
  private static final Object[] FORBIDDEN_ATTRIBUTES = sorted(new Object[]{
    "background", "color", "bgcolor",
    "text", "vlink", "alink", "link"
  });

  /** Accumulates the cleaned document while the parser runs. */
  private final StringBuilder cleanHTMLText = new StringBuilder();

  /** Number of currently open ignore-content tags; output is suppressed while it is positive. */
  int ignoreContentDepth = 0;

  private final boolean excludeImages;

  /**
   * Parses {@code cont} immediately; the cleaned result is available from
   * {@link #getCleanHTMLText()} once the constructor returns.
   *
   * @param cont          the HTML source to clean
   * @param excludeImages if true, images are replaced as described in the class comment
   * @throws Exception if the parser fails while reading the content
   */
  public HTMLCleaner(String cont, boolean excludeImages) throws Exception
  {
    this.excludeImages = excludeImages;

    ParserDelegator pd = new ParserDelegator();
    StringReader r = new StringReader(cont);
    // third argument: ignore the charset declared inside the document
    pd.parse(r, this, true);
  }

  /**
   * Convenience entry point: cleans {@code html} and returns the result.
   *
   * @param html          the HTML source to clean
   * @param excludeImages if true, images are replaced as described in the class comment
   * @return the cleaned HTML text
   * @throws Exception if the parser fails while reading the content
   */
  public static String cleanHTML(String html, boolean excludeImages) throws Exception
  {
    HTMLCleaner hc = new HTMLCleaner(html, excludeImages);
    return hc.getCleanHTMLText();
  }

  /** Compares two objects by the natural order of their string representations. */
  static class TagComparator implements Comparator<Object>
  {
    public int compare(Object o1, Object o2)
    {
      String tn1 = "" + o1;
      String tn2 = "" + o2;
      return tn1.compareTo(tn2);
    }
  }

  /** Returns the given table sorted so that it can be binary-searched with {@link #TAG_COMPARATOR}. */
  private static Object[] sorted(Object[] table)
  {
    Arrays.sort(table, TAG_COMPARATOR);
    return table;
  }

  /** @return true if the tag named {@code t} is copied to the output. */
  private boolean isAuthorizedTag(String t)
  {
    return Arrays.binarySearch(AUTHORIZED_TAGS, t, TAG_COMPARATOR) >= 0;
  }

  /** @return true unless the attribute named {@code t} is in the forbidden list. */
  private boolean isAuthorizedAttribute(String t)
  {
    return Arrays.binarySearch(FORBIDDEN_ATTRIBUTES, t, TAG_COMPARATOR) < 0;
  }

  /** @return true if everything nested inside the tag named {@code t} must be dropped. */
  private boolean isIgnoreContentTag(String t)
  {
    return Arrays.binarySearch(TAGS_TO_IGNORE_CONTENT, t, TAG_COMPARATOR) >= 0;
  }

  /**
   * Escapes the characters that would otherwise be interpreted as markup.
   *
   * @param text        the raw (entity-decoded) text
   * @param inAttribute if true, double quotes are escaped as well so the result can be
   *                    placed inside a double-quoted attribute value
   * @return the escaped text
   */
  private static String escape(String text, boolean inAttribute)
  {
    StringBuilder out = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++)
    {
      char c = text.charAt(i);
      switch (c)
      {
        case '&':
          out.append("&amp;");
          break;
        case '<':
          out.append("&lt;");
          break;
        case '>':
          out.append("&gt;");
          break;
        case '"':
          if (inAttribute)
          {
            out.append("&quot;");
          }
          else
          {
            out.append(c);
          }
          break;
        default:
          out.append(c);
      }
    }
    return out.toString();
  }

  /** @return the cleaned HTML accumulated during parsing. */
  public String getCleanHTMLText()
  {
    return cleanHTMLText.toString();
  }

  /**
   * Appends text content unless it lies inside an ignore-content tag.
   * The text is escaped: writing it verbatim would let characters such as
   * {@code <} be read back as markup by whoever renders the cleaned output.
   */
  @Override
  public void handleText(char[] data, int pos)
  {
    if (ignoreContentDepth > 0)
    {
      return;
    }
    cleanHTMLText.append(escape(new String(data), false));
  }

  /** Comments are dropped from the output. */
  @Override
  public void handleComment(char[] data, int pos)
  {
  }

  /** The end-of-line style of the input is not carried over to the output. */
  @Override
  public void handleEndOfLineString(String eol)
  {
  }

  /** Parse errors are deliberately ignored: cleaning is best-effort on malformed HTML. */
  @Override
  public void handleError(String errorMsg, int pos)
  {
  }

  /**
   * Handles a tag without content. Images get the special treatment described in the
   * class comment when {@code excludeImages} is set; any other authorized tag is copied
   * with its authorized attributes.
   */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
  {
    if (ignoreContentDepth > 0)
    {
      // nested inside script/object/applet/...: dropped like the surrounding text
      return;
    }

    if (t == HTML.Tag.IMG && excludeImages)
    {
      Object srcValue = a.getAttribute(HTML.Attribute.SRC);
      String src = (srcValue == null) ? null : srcValue.toString();
      if (src != null && StringUtils.startsWithIgnoresCaseAndBlanks(src, "cid:"))
      {
        // quoted and escaped so the source cannot smuggle extra attributes or tags
        cleanHTMLText.append(" <img SRC=\"").append(escape(src, true)).append("\"> ");
      }
      else
      {
        cleanHTMLText.append(" [image ").append(escape(String.valueOf(src), false)).append("] ");
      }
      return;
    }

    if (isAuthorizedTag(t.toString()))
    {
      cleanHTMLText.append("\r\n<" + t + "" + argumentsToString(a) + ">");
    }
  }

  /**
   * Handles an opening tag: an ignore-content tag starts (or deepens) suppression,
   * an authorized tag outside any suppressed region is copied, anything else is dropped.
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
  {
    String name = t.toString();
    if (isIgnoreContentTag(name))
    {
      ignoreContentDepth++;
    }
    else if (ignoreContentDepth == 0 && isAuthorizedTag(name))
    {
      cleanHTMLText.append("\r\n<" + t + "" + argumentsToString(a) + ">");
    }
  }

  /**
   * Handles a closing tag, mirroring {@link #handleStartTag}.
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos)
  {
    String name = t.toString();
    if (isIgnoreContentTag(name))
    {
      // never go below zero: a stray end tag must not cancel a later start tag
      if (ignoreContentDepth > 0)
      {
        ignoreContentDepth--;
      }
    }
    else if (ignoreContentDepth == 0 && isAuthorizedTag(name))
    {
      cleanHTMLText.append("\r\n</" + t + ">");
    }
  }

  /**
   * Serializes the authorized attributes of a tag as {@code  name="value"} pairs.
   * Keys that are not {@link HTML.Attribute} instances and forbidden attribute names
   * are skipped; values are escaped for use inside double quotes.
   *
   * @param a the attributes reported by the parser
   * @return the attribute string, empty when nothing is kept, otherwise starting with a space
   */
  private String argumentsToString(MutableAttributeSet a)
  {
    StringBuilder sb = new StringBuilder();
    Enumeration<?> names = a.getAttributeNames();
    while (names.hasMoreElements())
    {
      Object key = names.nextElement();
      if (!(key instanceof HTML.Attribute))
      {
        continue;
      }
      String attributeName = key.toString();
      if (!isAuthorizedAttribute(attributeName))
      {
        continue;
      }
      // String.valueOf instead of a cast: tolerates null and non-String values
      String val = String.valueOf(a.getAttribute(key));

      sb.append(" ");
      sb.append(attributeName);
      sb.append("=\"");
      sb.append(escape(val, true));
      sb.append("\"");
    }
    return sb.toString();
  }

  /**
   * Manual test: cleans one file and writes the result to another.
   *
   * @param a optional arguments: {@code a[0]} input file, {@code a[1]} output file;
   *          the historical hard-coded paths are used for whichever is missing
   */
  public static void main(String[] a)
  {
    String inputPath = (a != null && a.length > 0)
        ? a[0]
        : "C:/sources/other/mail/client/www.snowraver.org/java/SnowMail/main.htm";
    String outputPath = (a != null && a.length > 1)
        ? a[1]
        : "C:/sources/other/mail/client/www.snowraver.org/java/SnowMail/aaa.htm";
    try
    {
      String cont = new String(FileUtils.getFileContent(new File(inputPath)));
      HTMLCleaner t = new HTMLCleaner(cont, false);
      System.out.println("\n========== TEXT ===========\n" + t.getCleanHTMLText());
      FileUtils.saveToFile(t.getCleanHTMLText().getBytes(), new File(outputPath));
    }
    catch (Exception e)
    {
      e.printStackTrace();
    }
  }
}