package org.columba.mail.parser.text;

import java.io.BufferedReader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for converting between HTML and plain text in mail bodies:
 * stripping tags, decoding and encoding character entities, wrapping text in
 * an HTML skeleton and turning e-mail addresses / URLs into links.
 *
 * <p>All methods are stateless and work on {@code String}s with regular
 * expressions; this is a pragmatic text converter, not a full HTML parser.
 * Methods that take a {@code String} document return {@code null} when given
 * {@code null}.
 */
public final class HtmlParser {

    private static final Logger LOG = Logger.getLogger("org.columba.mail.parser.text");

    /** Flags for patterns that must match tags in any case and across line breaks. */
    private static final int HTML_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    /** {@code <br>}, {@code <br/>}, {@code <br />} and the (invalid but common) {@code </br>}. */
    private static final Pattern BREAK_TO_NL_PATTERN =
            Pattern.compile("</?br\\s*/?>", Pattern.CASE_INSENSITIVE);
    private static final Pattern P_TO_DOUBLE_NL_PATTERN =
            Pattern.compile("</p\\s*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern DIV_TO_DOUBLE_NL_PATTERN =
            Pattern.compile("</div\\s*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern H_TO_DOUBLE_NL_PATTERN =
            Pattern.compile("</h\\d\\s*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern WHITE_SPACE_REMOVAL_PATTERN = Pattern.compile("\\s+");

    /** Blanks around a line break; deliberately does not include the line break itself. */
    private static final Pattern LINE_PADDING_PATTERN = Pattern.compile("[ \\t]*\\n[ \\t]*");
    /** More than one blank line in a row. */
    private static final Pattern EXCESS_NEWLINES_PATTERN = Pattern.compile("\\n{3,}");

    /** Everything from the {@code <html>} tag up to and including the opening {@code <body>} tag. */
    private static final Pattern HEADER_REMOVAL_PATTERN =
            Pattern.compile("<html(?:\\s[^>]*)?>.*?<body(?:\\s[^>]*)?>", HTML_FLAGS);
    /** Script and style elements including their (non-textual) content. */
    private static final Pattern SCRIPT_STYLE_PATTERN =
            Pattern.compile("<(script|style)\\b[^>]*>.*?</\\1\\s*>", HTML_FLAGS);
    private static final Pattern STRIP_TAGS_PATTERN = Pattern.compile("<[^>]*>");
    /** Reluctant match so comments may contain hyphens and line breaks. */
    private static final Pattern COMMENTS_REMOVAL_PATTERN = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    private static final Pattern HTML_OPEN_PATTERN =
            Pattern.compile("<html(?:\\s[^>]*)?>", Pattern.CASE_INSENSITIVE);
    private static final Pattern HTML_CLOSE_PATTERN =
            Pattern.compile("</html\\s*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern DOCTYPE_PATTERN =
            Pattern.compile("<!doctype[^>]*>", Pattern.CASE_INSENSITIVE);
    private static final Pattern BODY_OPEN_PATTERN =
            Pattern.compile("<body(?:\\s[^>]*)?>", Pattern.CASE_INSENSITIVE);
    private static final Pattern BODY_CLOSE_PATTERN =
            Pattern.compile("</body\\s*>", Pattern.CASE_INSENSITIVE);

    /** A complete anchor element; its attributes and content are never re-linked. */
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a(?:\\s[^>]*)?>.*?</a\\s*>", HTML_FLAGS);

    /*
     * The host part is a dot/hyphen separated list without nested quantifiers,
     * so matching cannot backtrack exponentially on hostile input.
     */
    private static final String EMAIL_STR =
            "[a-zA-Z0-9_+.-]+@[a-zA-Z0-9]+(?:[.-][a-zA-Z0-9]+)*\\.[a-zA-Z]{2,}";
    private static final Pattern EMAIL_PATTERN = Pattern.compile(EMAIL_STR);

    private static final String PROT = "(?:https?|ftp)";
    /** One URL character: no whitespace, no markup delimiter, not the start of an escaped delimiter. */
    private static final String URL_CHAR = "(?:(?!&(?:lt|gt|quot);)[^\\s<>\"])";
    /*
     * A URL has to end in a word character or a slash, which keeps trailing
     * punctuation and closing brackets of the surrounding sentence out of the link.
     */
    private static final String URL_STR = "\\b" + PROT + "://" + URL_CHAR + "*[\\w/]";
    private static final Pattern URL_PATTERN = Pattern.compile(URL_STR, Pattern.CASE_INSENSITIVE);

    /** Named entities that have no Latin-1 code point order, with their characters. */
    private static final String[] BASIC_ENTITY_NAMES = {"quot", "amp", "lt", "gt", "apos"};
    private static final String[] BASIC_ENTITY_CHARS = {"\"", "&", "<", ">", "'"};

    /** Code point of the first entry of {@link #LATIN1_ENTITY_NAMES}. */
    private static final int LATIN1_BASE = 0xA0;
    /** Named entities for U+00A0 .. U+00FF, in code point order. */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /** Named ({@code &eacute;}), decimal ({@code &#233;}) and hexadecimal ({@code &#xE9;}) references. */
    private static final Pattern ENTITY_PATTERN =
            Pattern.compile("&(#[xX][0-9a-fA-F]+|#\\d+|[a-zA-Z][a-zA-Z0-9]*);");

    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("\\bcharset\\s*=\\s*[\"']?([\\w-]+)", Pattern.CASE_INSENSITIVE);

    /** HTML written for a tab character. */
    private static final String TAB_REPLACEMENT = "&nbsp;&nbsp;&nbsp;&nbsp;";
    /** Run of ordinary spaces that is turned back into a tab when decoding. */
    private static final String SPACES_PER_TAB = "    ";

    /** Utility class; not instantiable. */
    private HtmlParser() {
    }

    /**
     * Converts an HTML document to plain text layout: drops the header up to
     * the {@code <body>} tag, comments and script/style elements, collapses
     * whitespace, turns line breaks into {@code \n} and the end of paragraphs,
     * divs and headings into a blank line, and removes all remaining tags.
     * Entities are left untouched; see {@link #restoreSpecialCharacters}.
     *
     * @param s HTML source, may be {@code null}
     * @return the text without tags, trimmed; {@code null} if {@code s} is {@code null}
     */
    public static String stripHtmlTags(String s) {
        if (s == null) {
            return null;
        }

        s = HEADER_REMOVAL_PATTERN.matcher(s).replaceAll("");
        // Must go before the generic tag stripping: their content may contain '>'.
        s = COMMENTS_REMOVAL_PATTERN.matcher(s).replaceAll("");
        s = SCRIPT_STYLE_PATTERN.matcher(s).replaceAll("");

        // Source line breaks carry no meaning in HTML.
        s = WHITE_SPACE_REMOVAL_PATTERN.matcher(s).replaceAll(" ");

        s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
        s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
        s = DIV_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
        s = H_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");

        s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");

        // Trim every line, but keep (at most one) blank line between blocks.
        s = LINE_PADDING_PATTERN.matcher(s).replaceAll("\n");
        s = EXCESS_NEWLINES_PATTERN.matcher(s).replaceAll("\n\n");

        return s.trim();
    }

    /**
     * Removes all tags without any other clean-up.
     *
     * @param s         HTML source, may be {@code null}
     * @param breakToNl if {@code true}, line break tags become {@code \n} and
     *                  closing paragraph tags become a blank line before the
     *                  tags are removed
     * @return the text without tags; {@code null} if {@code s} is {@code null}
     */
    public static String stripHtmlTags(String s, boolean breakToNl) {
        if (s == null) {
            return null;
        }

        if (breakToNl) {
            s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
            s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
        }

        return STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Decodes character references in a single pass, so the output of one
     * replacement is never decoded again ({@code &amp;lt;} yields
     * {@code &lt;}, not {@code <}). Supported are the named entities for
     * {@code " & < > '} and U+00A0..U+00FF as well as decimal and hexadecimal
     * numeric references. Unknown or invalid references are left as they are.
     * Finally every run of four ordinary spaces is replaced by a tab.
     *
     * @param charset charset of the document; retained for API compatibility
     *                only, because numeric references denote Unicode code
     *                points independent of the document encoding
     * @param s       text containing character references, may be {@code null}
     * @return the decoded text; {@code null} if {@code s} is {@code null}
     */
    public static String restoreSpecialCharacters(Charset charset, String s) {
        if (s == null) {
            return null;
        }

        StringBuilder result = new StringBuilder(s.length());
        Matcher matcher = ENTITY_PATTERN.matcher(s);
        int pos = 0;

        while (matcher.find()) {
            String decoded = decodeEntity(matcher.group(1));
            if (decoded != null) {
                // Plain appends: the decoded text may contain '$' or '\'.
                result.append(s, pos, matcher.start()).append(decoded);
                pos = matcher.end();
            }
        }
        result.append(s, pos, s.length());

        return result.toString().replace(SPACES_PER_TAB, "\t");
    }

    /** Returns the text for a reference body such as {@code eacute} or {@code #233}, or {@code null} if unknown. */
    private static String decodeEntity(String reference) {
        if (reference.charAt(0) == '#') {
            return decodeNumericEntity(reference);
        }
        for (int i = 0; i < BASIC_ENTITY_NAMES.length; i++) {
            if (BASIC_ENTITY_NAMES[i].equals(reference)) {
                return BASIC_ENTITY_CHARS[i];
            }
        }
        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            if (LATIN1_ENTITY_NAMES[i].equals(reference)) {
                return String.valueOf((char) (LATIN1_BASE + i));
            }
        }
        return null;
    }

    /** Decodes {@code #NNN} / {@code #xHHH}; {@code null} if it is not a usable Unicode code point. */
    private static String decodeNumericEntity(String reference) {
        boolean hex = reference.length() > 1
                && (reference.charAt(1) == 'x' || reference.charAt(1) == 'X');
        int codePoint;
        try {
            codePoint = hex
                    ? Integer.parseInt(reference.substring(2), 16)
                    : Integer.parseInt(reference.substring(1));
        } catch (NumberFormatException tooLarge) {
            return null;
        }
        boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint == 0 || surrogate || !Character.isValidCodePoint(codePoint)) {
            return null;
        }
        return new String(Character.toChars(codePoint));
    }

    /**
     * Looks for a {@code charset=...} declaration (for example in a
     * {@code <meta>} tag) and returns the corresponding charset.
     *
     * @param htmlSource HTML source, may be {@code null}
     * @return the declared charset, or the platform default charset if none
     *         is declared or the declared one is illegal or unsupported
     */
    public static Charset getHtmlCharset(String htmlSource) {
        if (htmlSource != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(htmlSource);
            if (matcher.find()) {
                String name = matcher.group(1);
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException unusable) {
                    // Covers illegal names and unsupported charsets; mail often
                    // declares bogus charsets, so fall back instead of failing.
                    LOG.fine("Ignoring unusable charset declaration: " + name);
                }
            }
        }

        return Charset.defaultCharset();
    }

    /**
     * Converts an HTML document to plain text by stripping the tags
     * ({@link #stripHtmlTags(String)}) and decoding the character references
     * ({@link #restoreSpecialCharacters}).
     *
     * @param html HTML source, may be {@code null}
     * @return the plain text; {@code null} if {@code html} is {@code null}
     */
    public static String htmlToText(String html) {
        if (html == null) {
            return null;
        }

        // Read the declaration first: stripping removes the header that holds it.
        Charset charset = getHtmlCharset(html);
        String text = stripHtmlTags(html);

        return restoreSpecialCharacters(charset, text);
    }

    /**
     * Wraps plain text in a minimal HTML document. The text is escaped with
     * {@link #substituteSpecialCharacters}; {@code title}, {@code css} and
     * {@code charset} are inserted verbatim and must already be valid for
     * their position in the document.
     *
     * @param text    plain text for the body; {@code null} gives an empty body
     * @param title   content of the {@code <title>} element, or {@code null} for none
     * @param css     style sheet for the header, or {@code null} for none
     * @param charset charset name for the content type declaration, or
     *                {@code null} to omit the declaration
     * @return the HTML document
     */
    public static String textToHtml(String text, String title, String css, String charset) {
        String body = (text == null) ? "" : substituteSpecialCharacters(text);

        StringBuilder buf = new StringBuilder(body.length() + 256);
        buf.append("<html><head>");

        if (charset != null) {
            buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=")
                    .append(charset).append("\">");
        }

        if (title != null) {
            buf.append("<title>").append(title).append("</title>");
        }

        if (css != null) {
            buf.append("<style type=\"text/css\"><!-- ").append(css).append(" --></style>");
        }

        buf.append("</head><body><p>").append(body).append("</p></body></html>");

        return buf.toString();
    }

    /**
     * Escapes plain text for use as HTML body text: {@code < > & "} become
     * entities, runs of spaces and tabs are preserved with {@code &nbsp;},
     * and every line (terminated by {@code \n}, {@code \r} or {@code \r\n},
     * or by the end of the text) is followed by {@code <br>} and a newline.
     *
     * @param s plain text, may be {@code null}
     * @return the escaped text; {@code null} if {@code s} is {@code null}
     */
    public static String substituteSpecialCharacters(String s) {
        return escapeLines(s, "<br>\n");
    }

    /**
     * Same escaping as {@link #substituteSpecialCharacters}, but for header
     * field values: line terminators are dropped and nothing is inserted in
     * their place.
     *
     * @param s header field value, may be {@code null}
     * @return the escaped value; {@code null} if {@code s} is {@code null}
     */
    public static String substituteSpecialCharactersInHeaderfields(String s) {
        return escapeLines(s, "");
    }

    /** Escapes {@code s} line by line and appends {@code lineSuffix} after every line. */
    private static String escapeLines(String s, String lineSuffix) {
        if (s == null) {
            return null;
        }

        int length = s.length();
        StringBuilder sb = new StringBuilder(length + 16);
        int lineStart = 0;

        while (lineStart < length) {
            int lineEnd = lineStart;
            while (lineEnd < length && s.charAt(lineEnd) != '\n' && s.charAt(lineEnd) != '\r') {
                lineEnd++;
            }

            appendEscaped(sb, s, lineStart, lineEnd);
            sb.append(lineSuffix);

            lineStart = lineEnd;
            if (lineStart < length) {
                // "\r\n" is a single terminator.
                boolean crlf = s.charAt(lineStart) == '\r'
                        && lineStart + 1 < length && s.charAt(lineStart + 1) == '\n';
                lineStart += crlf ? 2 : 1;
            }
        }

        return sb.toString();
    }

    /** Appends the escaped form of {@code s[from, to)}, a range that contains no line terminator. */
    private static void appendEscaped(StringBuilder sb, String s, int from, int to) {
        int i = from;

        while (i < to) {
            char c = s.charAt(i);

            switch (c) {
            case '<':
                sb.append("&lt;");
                i++;
                break;

            case '>':
                sb.append("&gt;");
                i++;
                break;

            case '&':
                sb.append("&amp;");
                i++;
                break;

            case '"':
                sb.append("&quot;");
                i++;
                break;

            case ' ':
                i += appendSpaces(sb, s, i);
                break;

            case '\t':
                sb.append(TAB_REPLACEMENT);
                i++;
                break;

            default:
                sb.append(c);
                i++;
                break;
            }
        }
    }

    /**
     * Appends the replacement for the spaces starting at {@code i} and
     * returns how many of them were consumed. Every chunk ends in an ordinary
     * space so the browser can still wrap the line, while the preceding
     * {@code &nbsp;} keep the run from collapsing.
     */
    private static int appendSpaces(StringBuilder sb, String s, int i) {
        boolean exactlyThree = s.startsWith("   ", i) && !s.startsWith("    ", i);
        if (exactlyThree) {
            sb.append("&nbsp;&nbsp; ");
            return 3;
        }
        if (s.startsWith("  ", i)) {
            sb.append("&nbsp; ");
            return 2;
        }
        sb.append(' ');
        return 1;
    }

    /**
     * Makes sure the document has an opening and a closing {@code html} tag.
     * A missing opening tag is inserted after the doctype declaration if
     * there is one, otherwise at the very beginning; a missing closing tag is
     * appended. Tags are recognised in any case and with attributes.
     *
     * @param input HTML source, may be {@code null}
     * @return the source with both tags present; {@code null} if {@code input} is {@code null}
     */
    public static String validateHTMLString(String input) {
        if (input == null) {
            return null;
        }

        StringBuilder output = new StringBuilder(input);

        if (!HTML_OPEN_PATTERN.matcher(input).find()) {
            Matcher doctype = DOCTYPE_PATTERN.matcher(input);
            int index = doctype.find() ? doctype.end() : 0;
            output.insert(index, "<html>");
        }

        if (!HTML_CLOSE_PATTERN.matcher(input).find()) {
            output.append("</html>");
        }

        return output.toString();
    }

    /**
     * Turns e-mail addresses into {@code mailto:} links, leaving addresses
     * that are already part of an anchor element alone.
     *
     * @param s HTML text, may be {@code null}
     * @return the text with linked addresses; {@code null} if {@code s} is {@code null}
     */
    public static String substituteEmailAddress(String s) {
        return substituteEmailAddress(s, false);
    }

    /**
     * Turns e-mail addresses into {@code mailto:} links.
     *
     * @param s           HTML text, may be {@code null}
     * @param ignoreLinks if {@code true}, existing anchor elements are not
     *                    looked at and every address is linked; this is only
     *                    correct for text that contains no links yet. If
     *                    {@code false}, anchor elements are copied unchanged.
     * @return the text with linked addresses; {@code null} if {@code s} is {@code null}
     */
    public static String substituteEmailAddress(String s, boolean ignoreLinks) {
        if (s == null) {
            return null;
        }
        if (ignoreLinks) {
            StringBuilder out = new StringBuilder(s.length() + 64);
            appendLinked(out, s, EMAIL_PATTERN, "mailto:", "a");
            return out.toString();
        }

        return linkOutsideAnchors(s, EMAIL_PATTERN, "mailto:");
    }

    /**
     * Turns {@code http}, {@code https} and {@code ftp} URLs into links
     * without looking at existing anchor elements; only correct for text
     * that contains no links yet.
     *
     * @param s HTML text, may be {@code null}
     * @return the text with linked URLs; {@code null} if {@code s} is {@code null}
     */
    public static String substituteURL(String s) {
        if (s == null) {
            return null;
        }

        StringBuilder out = new StringBuilder(s.length() + 64);
        appendLinked(out, s, URL_PATTERN, "", "A");

        return out.toString();
    }

    /**
     * Turns {@code http}, {@code https} and {@code ftp} URLs into links.
     *
     * @param s           HTML text, may be {@code null}
     * @param ignoreLinks if {@code true}, behaves like {@link #substituteURL(String)};
     *                    if {@code false}, anchor elements are copied unchanged
     * @return the text with linked URLs; {@code null} if {@code s} is {@code null}
     */
    public static String substituteURL(String s, boolean ignoreLinks) {
        if (s == null) {
            return null;
        }
        if (ignoreLinks) {
            return substituteURL(s);
        }

        return linkOutsideAnchors(s, URL_PATTERN, "");
    }

    /** Links every match of {@code target} in {@code s} except inside complete anchor elements. */
    private static String linkOutsideAnchors(String s, Pattern target, String hrefPrefix) {
        StringBuilder out = new StringBuilder(s.length() + 64);
        Matcher anchors = ANCHOR_PATTERN.matcher(s);
        int pos = 0;

        while (anchors.find()) {
            appendLinked(out, s.substring(pos, anchors.start()), target, hrefPrefix, "a");
            out.append(anchors.group());
            pos = anchors.end();
        }
        appendLinked(out, s.substring(pos), target, hrefPrefix, "a");

        String result = out.toString();
        if (LOG.isLoggable(Level.FINE)) {
            // Message content: keep it out of the default log level.
            LOG.fine("Result:\n" + result);
        }

        return result;
    }

    /** Appends {@code text} to {@code out}, wrapping every match of {@code target} in a {@code tag} element. */
    private static void appendLinked(StringBuilder out, String text, Pattern target,
            String hrefPrefix, String tag) {
        Matcher matcher = target.matcher(text);
        int pos = 0;

        while (matcher.find()) {
            String hit = matcher.group();
            out.append(text, pos, matcher.start());
            out.append('<').append(tag).append(" HREF=\"").append(hrefPrefix).append(hit)
                    .append("\">").append(hit).append("</").append(tag).append('>');
            pos = matcher.end();
        }
        out.append(text, pos, text.length());
    }

    /**
     * Returns what is between the opening and the closing {@code body} tag.
     * Without an opening tag the result starts at the beginning of the
     * document, without a closing tag it extends to the end.
     *
     * @param html HTML source, may be {@code null}
     * @return the content of the body; {@code null} if {@code html} is {@code null}
     */
    public static String getHtmlBody(String html) {
        if (html == null) {
            return null;
        }

        Matcher open = BODY_OPEN_PATTERN.matcher(html);
        int start = open.find() ? open.end() : 0;

        // Only a closing tag behind the opening one can end the body.
        Matcher close = BODY_CLOSE_PATTERN.matcher(html);
        int end = close.find(start) ? close.start() : html.length();

        return html.substring(start, end);
    }

    /**
     * Removes all HTML comments, including multi-line ones.
     *
     * @param html HTML source, may be {@code null}
     * @return the source without comments; {@code null} if {@code html} is {@code null}
     */
    public static String removeComments(String html) {
        if (html == null) {
            return null;
        }

        return COMMENTS_REMOVAL_PATTERN.matcher(html).replaceAll("");
    }
}