1 8 9 package org.enhydra.oyster.util; 10 11 import org.enhydra.oyster.exception.SMIMEException; 12 import java.util.Vector ; 13 import java.net.MalformedURLException ; 14 import java.net.URL ; 15 import java.io.InputStream ; 16 import java.io.ByteArrayOutputStream ; 17 import java.io.File ; 18 import org.w3c.dom.Attr ; 19 import org.w3c.dom.Document ; 20 import org.w3c.dom.NamedNodeMap ; 21 import org.w3c.dom.Node ; 22 import org.w3c.dom.NodeList ; 23 import org.w3c.tidy.Tidy; 24 25 26 42 public class HtmlAnalyzer { 43 44 47 private String plainText = ""; 48 49 52 private boolean pTagEnable = true; 53 54 60 private String absolutPath = null; 61 62 66 private Document doc; 67 68 72 private int indent = 0; 73 74 78 private int olNumber = 1; 79 80 85 private String ul_ol = ""; 86 87 91 private final String indentString = 92 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"; 93 97 private Vector sourceLinks = new Vector (0,1); 98 99 104 private boolean enableSwapping = true; 105 106 120 public HtmlAnalyzer(InputStream content0, String externalPlainText0) throws SMIMEException 121 { 122 Tidy tidy = new Tidy(); 123 tidy.setWraplen(1000); 124 tidy.setShowWarnings(false); 125 tidy.setUpperCaseTags(true); 126 doc = (tidy.parseDOM(content0, null)); 127 enableSwapping = false; 128 if(externalPlainText0 == null) { 129 analyze(doc); 130 plainText = plainText + "\r\n"; 131 } 132 else { 133 analyzeLight(doc); 134 plainText = new String (externalPlainText0); 135 plainText = plainText + "\r\n"; 136 } 137 } 138 139 140 158 public HtmlAnalyzer(InputStream content0, String path0, String externalPlainText0) 159 throws SMIMEException 160 { 161 if(path0 != null) { 162 absolutPath = new String (path0); 163 if(absolutPath.charAt( absolutPath.length()-1) == '\\' || 164 absolutPath.charAt( absolutPath.length()-1) == '/' ) 165 absolutPath = absolutPath.substring(0,absolutPath.length()-1); 166 167 absolutPath = absolutPath.replace('/', File.separatorChar); 168 absolutPath = absolutPath.replace('\\', File.separatorChar) + File.separator; 169 } 170 171 Tidy tidy = new Tidy(); 172 tidy.setWraplen(1000); 173 tidy.setShowWarnings(false); 174 tidy.setUpperCaseTags(true); 175 doc = (tidy.parseDOM(content0, null)); 176 if(externalPlainText0 == null) { 177 analyze(doc); 178 plainText = plainText + "\r\n"; 179 } 180 else { 181 analyzeLight(doc); 182 plainText = new String (externalPlainText0); 183 plainText = plainText + "\r\n"; 184 } 185 } 186 187 194 public Vector getSwappedAdresses() { 195 return sourceLinks; 196 } 197 198 202 public String getPlainText() { 203 return plainText; 204 } 205 206 217 public String getHtmlText() throws SMIMEException { 218 String returnString; 219 220 Tidy tidy = new Tidy(); 221 tidy.setWraplen(1000); 222 ByteArrayOutputStream out = new ByteArrayOutputStream (); 223 224 tidy.pprint(doc,out); 225 226 try { 227 returnString = out.toString("ISO-8859-1"); 228 out.close(); 229 } 230 catch(Exception e) { 231 throw SMIMEException.getInstance(this, e, "getHtmlText" ); 232 } 233 234 return returnString; 235 } 236 237 238 247 private void analyze(Node node0) throws SMIMEException { 248 249 if ( node0 == null ) { 250 return; 251 } 252 String brLine = "\r\n"; 253 int type = node0.getNodeType(); 254 255 boolean pTagEnable_old = true; 256 int indent_old = 0; 257 int olNumber_old = 1; 258 String ul_ol_old = ""; 259 260 switch (type) { 261 case Node.DOCUMENT_NODE: analyze(((Document )node0).getDocumentElement()); 263 break; 264 265 case Node.ELEMENT_NODE: String elName = node0.getNodeName(); 267 if (elName.equalsIgnoreCase("br")) { 268 plainText = plainText + brLine; 269 if(indent > 0) 270 plainText = plainText + 271 indentString.substring(0,indent-1); 272 } 273 else if(elName.equalsIgnoreCase("hr")) { 274 plainText = plainText + brLine + 275 "==================================================" + 276 brLine; 277 } 278 else if(elName.equalsIgnoreCase("p")) { 279 if(pTagEnable) { 280 plainText = plainText + brLine + brLine; 281 if(indent > 0) 282 plainText = plainText + 283 indentString.substring(0,indent-1); 284 } 285 pTagEnable = true; 286 } 287 else if(elName.equalsIgnoreCase("ul")) { 288 pTagEnable_old = pTagEnable; 289 pTagEnable = false; 290 ul_ol_old = ul_ol; 291 ul_ol = elName; 292 indent_old = indent; 293 indent++; 294 } 295 else if(elName.equalsIgnoreCase("ol")) { 296 pTagEnable_old = pTagEnable; 297 pTagEnable = false; 298 ul_ol_old = ul_ol; 299 ul_ol = elName; 300 indent_old = indent; 301 indent++; 302 olNumber_old = olNumber; 303 } 304 else if(elName.equalsIgnoreCase("li")) { 305 pTagEnable = false; 306 if (ul_ol.equalsIgnoreCase("ul")) { 307 plainText = plainText + brLine + 308 indentString.substring(0,indent-1) + 309 ">> "; 310 } 311 else if (ul_ol.equalsIgnoreCase("ol")) { 312 plainText = plainText + brLine + 313 indentString.substring(0,indent-1) + 314 olNumber + ". "; 315 olNumber++; 316 } 317 } 318 else if(elName.equalsIgnoreCase("blockquote")) { 319 pTagEnable_old = pTagEnable; 320 pTagEnable = false; 321 indent_old = indent; 322 indent++; 323 plainText = plainText + brLine + 324 indentString.substring(0,indent); 325 } 326 else if(elName.equalsIgnoreCase("q")) { 327 pTagEnable_old = pTagEnable; 328 pTagEnable = false; 329 plainText = plainText + "\""; 330 } 331 else if(elName.equalsIgnoreCase("table")) { 332 plainText = plainText + brLine + 333 "**************************************************" + brLine + 334 "--------------------------------------------------" + brLine + 335 "-- -- -- -- -- -- -- -- -- -- -- -- --" + brLine; 336 } 337 else if(elName.equalsIgnoreCase("tr")) { 338 plainText = plainText + brLine; 339 } 340 else if(elName.equalsIgnoreCase("td")) { 341 plainText = plainText + brLine; 342 } 343 NamedNodeMap attrs = node0.getAttributes(); 345 for ( int i = 0; i < attrs.getLength(); i++ ) { 346 attrs.item(i).getNodeName().toUpperCase(); 347 if( enableSwapping && 348 ( (attrs.item(i).getNodeName()).equalsIgnoreCase("src") || 349 (attrs.item(i).getNodeName()).equalsIgnoreCase("background")) ) { 350 String resource = attrs.item(i).getNodeValue(); 351 String cid = null; 352 if (resource.substring(0,5).equalsIgnoreCase("*****")) { 354 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) { 355 if (sourceLinks.elementAt(j) instanceof String && 356 ((String )sourceLinks.elementAt(j)).equals(resource) ) 357 cid = (String )sourceLinks.elementAt(j+1); 358 } 359 if(cid == null) { 360 cid = MimeAssist.generateID(); 361 sourceLinks.add(resource); 362 sourceLinks.add(cid); 363 } 364 attrs.item(i).setNodeValue("cid:"+cid); 365 } 366 else { 367 File fRes = existenceOfResource(resource); 368 if (fRes!=null) { 369 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) { 370 if (sourceLinks.elementAt(j) instanceof File && 371 ((File )sourceLinks.elementAt(j)).compareTo(fRes) == 0 ) 372 cid = (String )sourceLinks.elementAt(j+1); 373 } 374 if(cid == null) { 375 cid = MimeAssist.generateID(); 376 sourceLinks.add(fRes); 377 sourceLinks.add(cid); 378 } 379 attrs.item(i).setNodeValue("cid:"+cid); 380 } 381 } 382 } 383 } 384 NodeList children = node0.getChildNodes(); if ( children != null ) { 387 int len = children.getLength(); 388 for ( int i = 0; i < len; i++ ) { 389 analyze(children.item(i)); 390 } 391 } 392 if(elName.equalsIgnoreCase("ul")) { 394 pTagEnable = pTagEnable_old; 395 ul_ol = ul_ol_old; 396 indent = indent_old; 397 } 398 else if(elName.equalsIgnoreCase("ol")) { 399 pTagEnable = pTagEnable_old; 400 ul_ol = ul_ol_old; 401 indent = indent_old; 402 olNumber = olNumber_old; 403 } 404 else if(elName.equalsIgnoreCase("table")) { 405 plainText = plainText + brLine + 406 "**************************************************"; 407 } 408 else if(elName.equalsIgnoreCase("tr")) { 409 plainText = plainText + brLine + 410 "--------------------------------------------------"; 411 } 412 else if(elName.equalsIgnoreCase("td")) { 413 plainText = plainText + brLine + 414 "-- -- -- -- -- -- -- -- -- -- -- -- --"; 415 } 416 else if(elName.equalsIgnoreCase("blockquote")) { 417 indent = indent_old; 418 pTagEnable = pTagEnable_old; 419 } 420 else if(elName.equalsIgnoreCase("q")) { 421 plainText = plainText + "\""; 422 pTagEnable = pTagEnable_old; 423 } 424 425 break; 426 427 case Node.TEXT_NODE: 428 String nodeVal = node0.getNodeValue(); 429 plainText = plainText + nodeVal; 430 break; 431 } 432 433 } 434 435 444 private void analyzeLight(Node node0) throws SMIMEException { 445 446 if ( node0 == null ) { 447 return; 448 } 449 int type = node0.getNodeType(); 450 451 switch (type) { 452 case Node.DOCUMENT_NODE: analyzeLight(((Document )node0).getDocumentElement()); 454 break; 455 456 case Node.ELEMENT_NODE: 458 NamedNodeMap attrs = node0.getAttributes(); 460 for ( int i = 0; i < attrs.getLength(); i++ ) { 461 attrs.item(i).getNodeName().toUpperCase(); 462 if( enableSwapping && 463 ( (attrs.item(i).getNodeName()).equalsIgnoreCase("src") || 464 (attrs.item(i).getNodeName()).equalsIgnoreCase("background")) ) { 465 String resource = attrs.item(i).getNodeValue(); 466 String cid = null; 467 if (resource.substring(0,5).equalsIgnoreCase("*****")) { 469 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) { 470 if (sourceLinks.elementAt(j) instanceof String && 471 ((String )sourceLinks.elementAt(j)).equals(resource) ) 472 cid = (String )sourceLinks.elementAt(j+1); 473 } 474 if(cid == null) { 475 cid = MimeAssist.generateID(); 476 sourceLinks.add(resource); 477 sourceLinks.add(cid); 478 } 479 attrs.item(i).setNodeValue("cid:"+cid); 480 } 481 else { 482 File fRes = existenceOfResource(resource); 483 if (fRes!=null) { 484 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) { 485 if (sourceLinks.elementAt(j) instanceof File && 486 ((File )sourceLinks.elementAt(j)).compareTo(fRes) == 0 ) 487 cid = (String )sourceLinks.elementAt(j+1); 488 } 489 if(cid == null) { 490 cid = MimeAssist.generateID(); 491 sourceLinks.add(fRes); 492 sourceLinks.add(cid); 493 } 494 attrs.item(i).setNodeValue("cid:"+cid); 495 } 496 } 497 } 498 } 499 NodeList children = node0.getChildNodes(); if ( children != null ) { 502 int len = children.getLength(); 503 for ( int i = 0; i < len; i++ ) { 504 analyzeLight(children.item(i)); 505 } 506 } 507 break; 508 509 case Node.TEXT_NODE: 510 break; 511 } 512 513 } 514 515 516 517 525 private File existenceOfResource(String resource0) throws SMIMEException { 526 527 boolean resourceIsUrl = true; 528 String resource = new String (resource0); 529 URL url = null; 530 531 try { 532 url = new URL (resource0); 533 } 534 catch(MalformedURLException e) { 535 resourceIsUrl = false; 536 } 537 538 if( resourceIsUrl == true && (!url.getProtocol().equalsIgnoreCase("file")) ) 539 return null; 540 else if( resourceIsUrl == true && url.getProtocol().equalsIgnoreCase("file") ) { 541 resource = url.getFile(); 542 } 543 544 resource= replaceHex(resource); 545 resource = resource.replace('/', File.separatorChar); 546 resource = resource.replace('\\', File.separatorChar); 547 File fRes = new File (resource); 548 549 try { 550 if(fRes.exists()) 551 return fRes.getAbsoluteFile().getCanonicalFile(); 552 553 fRes = new File (absolutPath + resource); 554 if(fRes.exists()) 555 return fRes.getAbsoluteFile().getCanonicalFile(); 556 557 fRes = new File (absolutPath + resource); 558 if(fRes.exists()) 559 return fRes.getAbsoluteFile().getCanonicalFile(); 560 } 561 catch(Exception e) { 562 throw SMIMEException.getInstance(this, e, "existenceOfResource"); 563 } 564 565 return null; 566 } 567 568 575 private String replaceHex(String resources0) { 576 while(resources0.indexOf("%20")!=-1) { 577 resources0 = resources0.substring(0, resources0.indexOf("%20")) + " " + 578 resources0.substring(resources0.indexOf("%20")+3); 579 } 580 return resources0; 581 } 582 583 584 } | Popular Tags |