package SnowMailClient.html;

import snow.utils.storage.*;

import java.text.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import java.io.*;
import java.util.*;

/**
 * Extracts a plain-text rendering from an HTML document using the Swing HTML parser.
 *
 * <p>The whole document is parsed in the constructor; afterwards the getters expose the
 * extracted text together with the link targets, image sources, font faces and the names
 * of tags this class does not handle explicitly.
 *
 * <p>Layout is approximated: block-level tags request line breaks (at most two consecutive
 * breaks are ever emitted), lists / blockquotes / centered blocks add indentation, and
 * table cells are separated by tab characters.
 */
public final class HTMLTextExtractor extends HTMLEditorKit.ParserCallback
{
  /** Line separator written into the extracted text. */
  private static final String LINE_BREAK = "\r\n";

  /** Upper bound on consecutive line breaks written, however many were requested. */
  private static final int MAX_CONSECUTIVE_LINE_BREAKS = 2;

  /** Indentation (in spaces) added by UL, OL and BLOCKQUOTE. */
  private static final int BLOCK_INDENT = 5;

  /** Indentation (in spaces) added by CENTER. */
  private static final int CENTER_INDENT = 10;

  private final StringBuilder textOnly = new StringBuilder();

  /** When true, "[image src]" and "[link href]" markers are inserted into the text. */
  boolean includePicturesAndReferences = false;

  private final Vector<String> unknownTags = new Vector<String>();
  private final Vector<String> aHrefs = new Vector<String>();
  private final Vector<String> images = new Vector<String>();
  private final Vector<String> fonts = new Vector<String>();

  // These three are exposed through getters but are not filled by any callback below.
  private final Vector<String> scriptsTagInfos = new Vector<String>();
  private final Vector<String> metaTagInfos = new Vector<String>();
  private final Vector<String> linkTagInfos = new Vector<String>();

  /** Number of line breaks requested since the last output; flushed lazily. */
  int newLines = 0;

  /** Current indentation in spaces, applied after a flushed line break. */
  int indent = 0;

  /**
   * Parses the given HTML content immediately.
   *
   * @param cont the HTML source to parse; must not be null
   * @param includePicturesAndReferences if true, image sources and link targets are
   *        additionally written into the extracted text as "[image ...]" / "[link ...]"
   * @throws Exception if the underlying parser fails
   */
  public HTMLTextExtractor(String cont, boolean includePicturesAndReferences) throws Exception
  {
    this.includePicturesAndReferences = includePicturesAndReferences;
    ParserDelegator pd = new ParserDelegator();

    StringReader r = new StringReader(cont);
    // Third argument: ignore any charset directive found in the document.
    pd.parse(r, this, true);
  }

  /** @return the plain text extracted from the document */
  public String getTextOnly() { return textOnly.toString(); }

  /** @return names of the tags that were encountered but are not handled explicitly */
  public Vector<String> getUnknownTags() { return unknownTags; }

  /** @return the HREF attribute values of all A tags, in document order */
  public Vector<String> getLinksHREFs() { return aHrefs; }

  /** @return the SRC attribute values of all IMG tags, in document order */
  public Vector<String> getImageSrcs() { return images; }

  /** @return the FACE attribute values of all FONT tags, in document order */
  public Vector<String> getFontFaces() { return fonts; }

  /** @return script tag infos; currently never populated by this class */
  public Vector<String> getScriptTagInfos() { return scriptsTagInfos; }

  /** @return meta tag infos; currently never populated by this class */
  public Vector<String> getMetaTagInfos() { return metaTagInfos; }

  /** @return link tag infos; currently never populated by this class */
  public Vector<String> getLinkTagInfos() { return linkTagInfos; }

  /**
   * Appends a run of document text, preceded by any pending line breaks and indentation.
   */
  @Override
  public void handleText(char[] data, int pos)
  {
    flushPendingLayout();
    textOnly.append(data);
  }

  /** Comments are ignored. */
  @Override
  public void handleComment(char[] data, int pos)
  {
  }

  /** The document's end-of-line convention is ignored. */
  @Override
  public void handleEndOfLineString(String eol)
  {
  }

  /** Parser errors are ignored: extraction is best-effort. */
  @Override
  public void handleError(String errorMsg, int pos)
  {
  }

  /**
   * Handles tags without a separate end tag: BR and P request a line break, IMG sources
   * are collected, META / LINK / HR / tbody are skipped, anything else is recorded as
   * unknown.
   */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
  {
    if (t == HTML.Tag.BR || t == HTML.Tag.P)
    {
      newLines++;
    }
    else if (t == HTML.Tag.META || t == HTML.Tag.LINK || t == HTML.Tag.HR)
    {
      // recognised, nothing to extract
    }
    else if (t == HTML.Tag.IMG)
    {
      String src = attributeAsString(a, HTML.Attribute.SRC);
      if (src != null)
      {
        images.add(src);
        if (includePicturesAndReferences)
        {
          // Flush first so the marker lands after the pending line break, not before it.
          flushPendingLayout();
          textOnly.append(" [image " + src + "] ");
        }
      }
    }
    else if (t.toString().equals("tbody"))
    {
      // recognised, nothing to extract
    }
    else
    {
      unknownTags.add("" + t);
    }
  }

  /**
   * Handles opening tags: updates pending line breaks and indentation, collects link
   * targets and font faces, and records tags that are not handled explicitly.
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
  {
    if (t == HTML.Tag.P)
    {
      newLines++;
    }
    else if (t == HTML.Tag.UL || t == HTML.Tag.OL)
    {
      indent += BLOCK_INDENT;
    }
    else if (t == HTML.Tag.LI)
    {
      newLines++;
    }
    else if (t == HTML.Tag.CENTER)
    {
      newLines++;
      indent += CENTER_INDENT;
    }
    else if (t == HTML.Tag.BLOCKQUOTE)
    {
      newLines++;
      indent += BLOCK_INDENT;
    }
    else if (t == HTML.Tag.SCRIPT)
    {
      // recognised, nothing to extract
    }
    else if (t == HTML.Tag.TITLE || t == HTML.Tag.H1 || t == HTML.Tag.H2 || t == HTML.Tag.H3
          || t == HTML.Tag.H4 || t == HTML.Tag.H5 || t == HTML.Tag.H6
          || t == HTML.Tag.TABLE
          || t == HTML.Tag.TR)
    {
      newLines++;
    }
    else if (t == HTML.Tag.TD)
    {
      // Flush first so the row's line break precedes the cell separator.
      flushPendingLayout();
      textOnly.append("\t");
    }
    else if (t == HTML.Tag.A)
    {
      String ref = attributeAsString(a, HTML.Attribute.HREF);
      if (ref != null)
      {
        aHrefs.add(ref);
        if (includePicturesAndReferences)
        {
          // Flush first so the marker lands after the pending line break, not before it.
          flushPendingLayout();
          textOnly.append(" [link " + ref + "] ");
        }
      }
    }
    else if (t == HTML.Tag.FONT)
    {
      String face = attributeAsString(a, HTML.Attribute.FACE);
      if (face != null)
      {
        fonts.add(face);
      }
    }
    else if (t == HTML.Tag.B || t == HTML.Tag.I || t == HTML.Tag.U
          || t == HTML.Tag.BODY || t == HTML.Tag.PRE
          || t == HTML.Tag.HTML || t == HTML.Tag.HEAD
          || t == HTML.Tag.SUP || t == HTML.Tag.SUB
          || t == HTML.Tag.CODE)
    {
      // recognised, nothing to extract
    }
    else
    {
      unknownTags.add("" + t);
    }
  }

  /**
   * Handles closing tags: requests line breaks after block elements, undoes the
   * indentation added by the matching start tag, and records tags that are not handled
   * explicitly.
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos)
  {
    if (t == HTML.Tag.TITLE || t == HTML.Tag.H1 || t == HTML.Tag.H2 || t == HTML.Tag.H3
     || t == HTML.Tag.H4 || t == HTML.Tag.H5 || t == HTML.Tag.H6)
    {
      newLines++;
    }
    else if (t == HTML.Tag.UL || t == HTML.Tag.OL || t == HTML.Tag.BLOCKQUOTE)
    {
      decreaseIndent(BLOCK_INDENT);
      newLines++;
    }
    else if (t == HTML.Tag.CENTER)
    {
      decreaseIndent(CENTER_INDENT);
      newLines++;
    }
    else if (t == HTML.Tag.P || t == HTML.Tag.PRE || t == HTML.Tag.TABLE)
    {
      newLines++;
    }
    else if (t == HTML.Tag.TR || t == HTML.Tag.TD
          || t == HTML.Tag.LI || t == HTML.Tag.A || t == HTML.Tag.HEAD
          || t == HTML.Tag.B || t == HTML.Tag.I || t == HTML.Tag.U
          || t == HTML.Tag.BODY || t == HTML.Tag.HTML
          || t == HTML.Tag.FONT || t == HTML.Tag.BASEFONT
          || t == HTML.Tag.SUP || t == HTML.Tag.SUB
          || t == HTML.Tag.CODE)
    {
      // recognised, nothing to do
    }
    else
    {
      unknownTags.add("" + t);
    }
  }

  /**
   * Writes the pending line breaks (capped at {@link #MAX_CONSECUTIVE_LINE_BREAKS})
   * followed by the current indentation, then clears the pending count. Does nothing
   * when no line break is pending.
   */
  private void flushPendingLayout()
  {
    if (newLines <= 0)
    {
      return;
    }
    int breaks = Math.min(newLines, MAX_CONSECUTIVE_LINE_BREAKS);
    for (int i = 0; i < breaks; i++)
    {
      textOnly.append(LINE_BREAK);
    }
    for (int i = 0; i < indent; i++)
    {
      textOnly.append(" ");
    }
    newLines = 0;
  }

  /** Reduces the indentation, never below zero so unbalanced end tags cannot skew later blocks. */
  private void decreaseIndent(int amount)
  {
    indent = Math.max(0, indent - amount);
  }

  /** Returns the attribute's value as a string, or null when the attribute is absent. */
  private static String attributeAsString(AttributeSet attributes, HTML.Attribute key)
  {
    Object value = attributes.getAttribute(key);
    return value == null ? null : value.toString();
  }

  /**
   * Manual smoke test: extracts and prints the text of an HTML file.
   *
   * @param a optional; a[0] is the path of the HTML file to read. Without arguments the
   *          original hard-coded development path is used.
   */
  public static void main(String[] a)
  {
    try
    {
      String path = (a != null && a.length > 0)
          ? a[0]
          : "C:/sources/Schmortopf_IDE/Internet/Schmortopf/versionhistory.htm";
      String cont = new String(FileUtils.getFileContent(new File(path)));
      HTMLTextExtractor t = new HTMLTextExtractor(cont, true);
      System.out.println("\n========== TEXT. ===========\n" + t.getTextOnly());
    }
    catch (Exception e)
    {
      e.printStackTrace();
    }
  }
}