HtmlParser


1   //The contents of this file are subject to the Mozilla Public License Version 1.1
2   //(the "License"); you may not use this file except in compliance with the
3   //License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
4   //
5   //Software distributed under the License is distributed on an "AS IS" basis,
6   //WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
7   //for the specific language governing rights and
8   //limitations under the License.
9   //
10  //The Original Code is "The Columba Project"
11  //
12  //The Initial Developers of the Original Code are Frederik Dietz and Timo Stich.
13  //Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
14  //
15  //All Rights Reserved.
16  package org.columba.mail.parser.text;
17  
18  import java.io.BufferedReader  ;
19  import java.io.StringReader  ;
20  import java.nio.ByteBuffer  ;
21  import java.nio.charset.Charset  ;
22  import java.util.logging.Logger  ;
23  import java.util.regex.Matcher  ;
24  import java.util.regex.Pattern  ;
25  
26  
27  /**
28   * Contains different utility functions for manipulating Html based
29   * text. This includes functionality for removing and restoring
30   * special entities (such as &, <, >, ...) and functionality for
31   * removing html tags from the text.
32   *
33   * @author Karl Peder Olesen (karlpeder), 20030623
34   *
35   */
36  public final class HtmlParser {
37  
38      /**
39       * Utility classes should not have a public constructor.
40       */
41      private HtmlParser() {
42      }
43  
44      private static final Logger   LOG = Logger.getLogger("org.columba.mail.parser.text");
45  
46      private static final Pattern   BREAK_TO_NL_PATTERN = Pattern.compile("</?br>",
47              Pattern.CASE_INSENSITIVE);
48      private static final Pattern   P_TO_DOUBLE_NL_PATTERN = Pattern.compile("</p>",
49              Pattern.CASE_INSENSITIVE);
50      private static final Pattern   DIV_TO_DOUBLE_NL_PATTERN = Pattern.compile("</div>",
51              Pattern.CASE_INSENSITIVE);
52      private static final Pattern   H_TO_DOUBLE_NL_PATTERN = Pattern.compile("</h\\d>",
53              Pattern.CASE_INSENSITIVE);
54      private static final Pattern   WHITE_SPACE_REMOVAL_PATTERN = Pattern.compile("\\s+",
55              Pattern.CASE_INSENSITIVE);
56      private static final Pattern   TRIM_SPACE_PATTERN = Pattern.compile("\n\\s+",
57              Pattern.CASE_INSENSITIVE);
58      private static final Pattern   HEADER_REMOVAL_PATTERN = Pattern.compile("<html[^<]*<body[^>]*>",
59              Pattern.CASE_INSENSITIVE);
60      private static final Pattern   STRIP_TAGS_PATTERN = Pattern.compile("<[^>]*>",
61              Pattern.CASE_INSENSITIVE);
62      private static final Pattern   COMMENTS_REMOVAL_PATTERN = Pattern.compile("<!--[^-]*-->",
63              Pattern.CASE_INSENSITIVE);
64      private static final String   EMAIL_STR = "([a-zA-Z0-9_+\\.-]+@([a-zA-Z0-9]+([\\.-][a-zA-Z0-9]+)*)+\\.[a-zA-Z]{2,4})";
65  //do the bug [997599] "\\b([^\\s@]+@[^\\s]+)\\b";
66      private static final Pattern   EMAIL_PATTERN = Pattern.compile(EMAIL_STR);
67      private static final Pattern   EMAIL_PATTERN_INC_LINK = Pattern.compile(
68              "<a[\\s\\n]*href=(\\\")?(mailto:)" + 
69              EMAIL_STR 
70              + "[^<]*</a>",
71              Pattern.CASE_INSENSITIVE);
72  
73      private static final String   PROT = "(http|https|ftp)";
74      private static final String   PUNC = ".,:;?!\\-";
75      private static final String   ANY = "\\S";
76      private static final String   URL_STR = "\\b" + "(" + "(\\w*(:\\S*)?@)?" + PROT
77          + "://" + "[" + ANY + "]+" + ")" + "\\b";
78  
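    /*
     * URL_STR, piece by piece:
     *   \\b                 start at a word boundary
     *   (
     *   (\\w*(:\\S*)?@)?    optional [user[:pass]]@ construct
     *   PROT + "://"        protocol followed by ://
     *   [ANY]+              one or more non-whitespace characters ...
     *   )
     *   \\b                 ... ending at a word boundary
     *
     * An earlier note described the end as the lookahead (?=\\s|$), i.e.
     * "until we find whitespace or end of string"; URL_STR as written above
     * ends with \\b instead.
     */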
88      private static final Pattern   URL_PATTERN = Pattern.compile(URL_STR,
89              Pattern.CASE_INSENSITIVE);
90      private static final String   URL_REPAIR_STR = "(.*://.*?)" + "(" + "(&gt;).*|"
91          + "([" + PUNC + "]*)" + "(<br>)?" + ")$";
92  
    /*
     * URL_REPAIR_STR, piece by piece:
     *   (.*://.*?)          "something" containing ://
     *                       (could be .*? but then the pattern would match whitespace)
     *   (
     *   (&gt;).*            an html-encoded > followed by anything
     *   |                   or
     *   ([PUNC]*)           any run of punctuation characters
     *   (<br>)?             0 or 1 trailing <br>
     *   )$                  end of string
     */
103     private static final Pattern   URL_REPAIR_PATTERN = Pattern.compile(URL_REPAIR_STR);
104     private static final Pattern   URL_PATTERN_INC_LINK = Pattern.compile(
105             "<a( |\\n)*?href=(\\\")?" + URL_STR + "(.|\\n)*?</a>",
106             Pattern.CASE_INSENSITIVE);
107 
    // TODO (@author fdietz): Add more special entities - e.g. further accented characters
109 
110     /** Special entities recognized by restore special entities */
111     // The form of the entities must be a regexp!
112     private static final String  [] SPECIAL_ENTITIES = {
113             "&quot;", "&amp;", "&lt;", "&gt;",
114             "&nbsp;","&iexcl;","&cent;","&pound;","&curren;","&yen;","&brvbar;","&sect;",
115             "&uml;","&copy;","&ordf;","&laquo;","&not;","&shy;","&reg;","&macr;",
116             "&deg;","&plusmn;","&sup2;","&sup3;","&acute;","&micro;","&para;","&middot;",
117             "&cedil;","&sup1;","&ordm;","&raquo;","&frac14;","&frac12;","&frac34;","&iquest;",
118             "&Agrave;","&Aacute;","&Acirc;","&Atilde;","&Auml;","&Aring;","&AElig;","&Ccedil;",
119             "&Egrave;","&Eacute;","&Ecirc;","&Euml;","&Igrave;","&Iacute;","&Icirc;","&Iuml;",
120             "&ETH;","&Ntilde;","&Ograve;","&Oacute;","&Ocirc;","&Otilde;","&Ouml;","&times;",
121             "&Oslash;","&Ugrave;","&Uacute;","&Ucirc;","&Uuml;","&Yacute;","&THORN;","&szlig;",
122             "&agrave;","&aacute;","&acirc;","&atilde;","&auml;","&aring;","&aelig;","&ccedil;",
123             "&egrave;","&eacute;","&ecirc;","&euml;","&igrave;","&iacute;","&icirc;","&iuml;",
124             "&eth;","&ntilde;","&ograve;","&oacute;","&ocirc;","&otilde;","&ouml;","&divide;",
125             "&oslash;","&ugrave;","&uacute;","&ucirc;","&uuml;","&yacute;","&thorn;","&yuml;"    };
126 
127     /** Normal chars corresponding to the defined special entities */
128     private static final String  [] ENTITY_STRINGS = {
129             "\"", "&", "<", ">",
130             "\u00a0","\u00a1","\u00a2","\u00a3","\u00a4","\u00a5","\u00a6","\u00a7",
131             "\u00a8","\u00a9","\u00aa","\u00ab","\u00ac","\u00ad","\u00ae","\u00af",
132             "\u00b0","\u00b1","\u00b2","\u00b3","\u00b4","\u00b5","\u00b6","\u00b7",
133             "\u00b8","\u00b9","\u00ba","\u00bb","\u00bc","\u00bd","\u00be","\u00bf",
134             "\u00c0","\u00c1","\u00c2","\u00c3","\u00c4","\u00c5","\u00c6","\u00c7",
135             "\u00c8","\u00c9","\u00ca","\u00cb","\u00cc","\u00cd","\u00ce","\u00cf",
136             "\u00d0","\u00d1","\u00d2","\u00d3","\u00d4","\u00d5","\u00d6","\u00d7",
137             "\u00d8","\u00d9","\u00da","\u00db","\u00dc","\u00dd","\u00de","\u00df",
138             "\u00e0","\u00e1","\u00e2","\u00e3","\u00e4","\u00e5","\u00e6","\u00e7",
139             "\u00e8","\u00e9","\u00ea","\u00eb","\u00ec","\u00ed","\u00ee","\u00ef",
140             "\u00f0","\u00f1","\u00f2","\u00f3","\u00f4","\u00f5","\u00f6","\u00f7",
141             "\u00f8","\u00f9","\u00fa","\u00fb","\u00fc","\u00fd","\u00fe","\u00ff"
142             };
143 
144     private static final Pattern   SPECIAL_PATTERN = Pattern.compile("&#(\\d+);");    
145 
146     private static final Pattern   CHARSET_PATTERN=Pattern.compile("\\bcharset=([\\w-_\\d]+)\\b");
147     
148     
149     
150     /**
151      * Strips html tags and removes extra spaces which occurs due
152      * to e.g. indentation of the html and the head section, which does
153      * not contain any textual information.
154      * <br>
155      * The conversion rutine does the following:<br>
156      * 1. Removes the header from the html file, i.e. everything from
157      *    the html tag until and including the starting body tag.<br>
158      * 2. Replaces multiple consecutive whitespace characters with a single
159      *    space (since extra whitespace should be ignored in html).<br>
160      * 3. Replaces ending br tags with a single newline character<br>
161      * 4. Replaces ending p, div and heading tags with two newlines characters;
162      *    resulting in a single empty line btw. paragraphs.<br>
163      * 5. Strips remaining html tags.<br>
164      * <br>
165      * NB: The tag stripping is done using a very simple regular expression,
166      * which removes everything between &lt and &gt. Therefore too much text
167      * could in some (hopefully rare!?) cases be removed.
168      *
169      * @param        s                Input string
170      * @return        Input stripped for html tags
171      * @author        Karl Peder Olesen (karlpeder)
172      */
173     public static String   stripHtmlTags(String   s) {
174         // initial check of input:
175         if (s == null) {
176             return null;
177         }
178 
179         // remove header
180         s = HEADER_REMOVAL_PATTERN.matcher(s).replaceAll("");
181 
182         // remove extra whitespace
183         s = WHITE_SPACE_REMOVAL_PATTERN.matcher(s).replaceAll(" ");
184 
185         // replace br, p and heading tags with newlines
186         s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
187         s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
188         s = DIV_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
189         s = H_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
190 
191         // strip remaining tags
192         s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
193 
194         // tag stripping can leave some double spaces at line beginnings
195         s = TRIM_SPACE_PATTERN.matcher(s).replaceAll("\n").trim();
196 
197         return s;
198     }
199 
200     /**
201      * Strips html tags. The method used is very simple:
202      * Everything between tag-start (&lt) and tag-end (&gt) is removed.
203      * Optionaly br tags are replaced by newline and ending p tags with
204      * double newline.
205      *
206      * @param        s                        input string
207      * @param        breakToNl        if true, newlines are inserted for br and p tags
208      * @return        output without html tags (null on error)
209      * @author        karlpeder, 20030623
210      *                         (moved from org.columba.mail.gui.message.util.DocumentParser)
211      *
212      * @deprecated        Please use the more advanced and correct
213      *              @see stripHtmlTags(String) method
214      */
215     public static String   stripHtmlTags(String   s, boolean breakToNl) {
216         // initial check of input:
217         if (s == null) {
218             return null;
219         }
220 
221         if (breakToNl) {
222             // replace <br> and </br> with newline
223             s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
224 
225             // replace </p> with double newline
226             s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
227         }
228 
229         // strip tags
230         s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
231 
232         return s;
233     }
234 
235     /**
236      * Performs in large terms the reverse of
237      * substituteSpecialCharacters (though br tags are not
238      * converted to newlines, this should be handled separately).
239      * More preciesly it changes special entities like
240      * amp, nbsp etc. to their real counter parts: &, space etc.
241      * <br>
242      * This includes transformation of special (language specific) chars
243      * such as the Danish ? ? ? ? ? ?.
244      *
245      * @param        s        input string
246      * @return        output with special entities replaced with their
247      *                         "real" counter parts (null on error)
248      * @author  karlpeder, 20030623
249      *                         (moved from org.columba.mail.gui.message.util.DocumentParser)
250      */
251     public static String   restoreSpecialCharacters(Charset   charset, String   s) {
252 
253         //First replace all special entities
254         for( int i=0; i<SPECIAL_ENTITIES.length; i++) {
255             s = s.replaceAll(SPECIAL_ENTITIES[i],ENTITY_STRINGS[i]);
256         }
257 
258         StringBuffer   result = new StringBuffer  (s.length());
259         
260         //replace the other entities
261         Matcher   matcher = SPECIAL_PATTERN.matcher(s);
262         while( matcher.find()) {
263             matcher.appendReplacement(result, charset.decode( ByteBuffer.wrap(new byte[]{ (byte) Integer.parseInt(matcher.group(1))})).toString());         
264         }
265         matcher.appendTail(result);
266         
267         //Convert 4 WS in a row to a tab
268         return result.toString().replaceAll("    ","\t");
269     }
270 
271     public static Charset   getHtmlCharset(String   htmlSource) {
272         Matcher   matcher = CHARSET_PATTERN.matcher(htmlSource);
273         if( matcher.find() ) {
274             try {
275                 return Charset.forName(matcher.group(1));
276             } catch (RuntimeException   e) {
277             }
278         }
279         
280         return Charset.forName(System.getProperty("file.encoding"));
281     }
282     
283     /**
284      * Strips html tags. and replaces special entities with their
285      * "normal" counter parts, e.g. <code>&gt; => ></code>.<br>
286      * Calling this method is the same as calling first stripHtmlTags
287      * and then restoreSpecialCharacters.
288      *
289      * @param        html        input string
290      * @return        output without html tags and special entities
291      *                         (null on error)
292      * @author        karlpeder, 20030623
293      *                         (moved from org.columba.mail.parser.text.BodyTextParser)
294      */
295     public static String   htmlToText(String   html) {
296         // stripHtmlTags called with true ~ p & br => newlines
297         Charset   charset = getHtmlCharset(html);
298         
299         String   text = stripHtmlTags(html);
300 
301         return restoreSpecialCharacters(charset, text);
302     }
303     /**
304      * Replaces special chars - <,>,&,\t,\n," - with the special
305      * entities used in html (amp, nbsp, ...). Then the complete
306      * text is surrounded with proper html tags: Starting- and
307      * ending html tag, header section and body section.
308      * The complete body section is sorround with p tags.
309      * <br>
310      * This is the same as first calling substituteSpecialCharacters
311      * and then add starting and ending html tags etc.
312      * <br>
313      * Further more urls and email adresses are converted into links
314      * Optionally a title and css definition is inserted in the
315      * html header.
316      * <br>
317      *
318      * TODO (@author fdietz): Add support for smilies and coloring of quoted text
319      *
320      * @param        text        Text to convert to html
321      * @param        title        Title to include in header, not used if null
322      * @param        css                Style sheet def. to include in header,
323      *                                         not used if null.
324      *                                         The input shall not include the style tag
325      * @return        Text converted to html
326      * @author        Karl Peder Olesen (karlpeder), 20030916
327      */
328     public static String   textToHtml(String   text, String   title, String   css, String   charset) {
329         // convert special characters
330         String   html = HtmlParser.substituteSpecialCharacters(text);
331 
332         // parse for urls / email adresses and substite with HTML-code
333         // html = HtmlParser.substituteURL(html);
334         // html = HtmlParser.substituteEmailAddress(html);
335 
336         // insert surrounding html tags
337         StringBuffer   buf = new StringBuffer  ();
338         buf.append("<html><head>");
339         buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=" + charset +"\">");
340         
341         if (title != null) {
342             buf.append("<title>");
343             buf.append(title);
344             buf.append("</title>");
345         }
346 
347         if (css != null) {
348             buf.append("<style type=\"text/css\"><!-- ");
349             buf.append(css);
350             buf.append(" --></style>");
351         }
352 
353         buf.append("</head><body><p>");
354         buf.append(html);
355         buf.append("</p></body></html>");
356 
357         return buf.toString();
358     }
359 
360     /**
361      * Substitute special characters like:
362      * <,>,&,\t,\n,"
363      * with special entities used in html (amp, nbsp, ...)
364      *
365      * @param        s        input string containing special characters
366      * @return        output with special characters substituted
367      *                         (null on error)
368      */
369     public static String   substituteSpecialCharacters(String   s) {
370         StringBuffer   sb = new StringBuffer  (s.length());
371         StringReader   sr = new StringReader  (s);
372         BufferedReader   br = new BufferedReader  (sr);
373         String   ss = null;
374 
375         try {
376             while ((ss = br.readLine()) != null) {
377                 int i = 0;
378 
379                 while (i < ss.length()) {
380                     switch (ss.charAt(i)) {
381                     case '<':
382                         sb.append("&lt;");
383                         i++;
384 
385                         break;
386 
387                     case '>':
388                         sb.append("&gt;");
389                         i++;
390 
391                         break;
392 
393                     case '&':
394                         sb.append("&amp;");
395                         i++;
396 
397                         break;
398 
399                     case '"':
400                         sb.append("&quot;");
401                         i++;
402 
403                         break;
404 
405                     case ' ':
406 
407                         //sb.append("&nbsp;");
408                         if (ss.substring(i).startsWith("    ")) {
409                             sb.append("&nbsp; ");
410                             i = i + 2;
411                         } else if (ss.substring(i).startsWith("   ")) {
412                             sb.append("&nbsp;&nbsp; ");
413                             i = i + 3;
414                         } else if (ss.substring(i).startsWith("  ")) {
415                             sb.append("&nbsp; ");
416                             i = i + 2;
417                         } else {
418                             sb.append(' ');
419                             i++;
420                         }
421 
422                         break;
423 
424                     case '\t':
425                         sb.append("&nbsp;&nbsp;&nbsp;&nbsp;");
426                         i++;
427 
428                         break;
429 
430                     case '\n':
431                         sb.append("<br>");
432                         i++;
433 
434                         break;
435 
436                     default:
437                         sb.append(ss.charAt(i));
438                         i++;
439 
440                         break;
441                     }
442                 }
443 
444                 sb.append("<br>\n");
445             }
446         } catch (Exception   e) {
447             LOG.severe("Error substituting special characters: "
448                     + e.getMessage());
449 
450             return null; // error
451         }
452 
453         return sb.toString();
454     }
455 
456     /**
457      *
458      * substitute special characters like:
459      * <,>,&,\t,\n
460      * with special entities used in html<br>
461      * This is the same as substituteSpecialCharacters, but
462      * here an extra newline character is not inserted.
463      *
464      * @param        s        input string containing special characters
465      * @return        output with special characters substituted
466      *                         (null on error)
467      */
468     public static String   substituteSpecialCharactersInHeaderfields(String   s) {
469         StringBuffer   sb = new StringBuffer  (s.length());
470         StringReader   sr = new StringReader  (s);
471         BufferedReader   br = new BufferedReader  (sr);
472         String   ss = null;
473 
474         // TODO (@author karlpeder): Extend handling of special entities as in restoreSpecialCharacters
475 
476         /*
477  * *20030623, karlpeder* " and space handled also
478  */
479         try {
480             while ((ss = br.readLine()) != null) {
481                 int i = 0;
482 
483                 while (i < ss.length()) {
484                     switch (ss.charAt(i)) {
485                     case '<':
486                         sb.append("&lt;");
487                         i++;
488 
489                         break;
490 
491                     case '>':
492                         sb.append("&gt;");
493                         i++;
494 
495                         break;
496 
497                     case '&':
498                         sb.append("&amp;");
499                         i++;
500 
501                         break;
502 
503                     case '"':
504                         sb.append("&quot;");
505                         i++;
506 
507                         break;
508 
509                         /*
510                     case '\'':
511                         sb.append("&apos;");
512                         i++;
513 
514                         break;*/
515 
516                     case ' ':
517 
518                         if (ss.substring(i).startsWith("    ")) {
519                             sb.append("&nbsp; ");
520                             i = i + 2;
521                         } else if (ss.substring(i).startsWith("   ")) {
522                             sb.append("&nbsp;&nbsp; ");
523                             i = i + 3;
524                         } else if (ss.substring(i).startsWith("  ")) {
525                             sb.append("&nbsp; ");
526                             i = i + 2;
527                         } else {
528                             sb.append(' ');
529                             i++;
530                         }
531 
532                         break;
533 
534                     case '\t':
535                         sb.append("&nbsp;&nbsp;&nbsp;&nbsp;");
536                         i++;
537 
538                         break;
539 
540                     case '\n':
541                         sb.append("<br>");
542                         i++;
543 
544                         break;
545 
546                     default:
547                         sb.append(ss.charAt(i));
548                         i++;
549 
550                         break;
551                     }
552                 }
553             }
554         } catch (Exception   e) {
555             LOG.severe("Error substituting special characters: "
556                     + e.getMessage());
557 
558             return null; // error
559         }
560 
561         return sb.toString();
562     }
563 
564     /**
565      * Tries to fix broken html-strings by inserting
566      * html start- and end tags if missing, and by
567      * removing content after the html end tag.
568      *
569      * @param        input        html content to be validated
570      * @return        content with extra tags inserted if necessary
571      */
572     public static String   validateHTMLString(String   input) {
573         StringBuffer   output = new StringBuffer  (input);
574         int index = 0;
575 
576         String   lowerCaseInput = input.toLowerCase();
577 
578         // Check for missing  <html> tag
579         if (lowerCaseInput.indexOf("<html>") == -1) {
580             if (lowerCaseInput.indexOf("<!doctype") != -1) {
581                 index = lowerCaseInput.indexOf("\n",
582                         lowerCaseInput.indexOf("<!doctype")) + 1;
583             }
584 
585             output.insert(index, "<html>");
586         }
587 
588         // Check for missing  </html> tag
589         if (lowerCaseInput.indexOf("</html>") == -1) {
590             output.append("</html>");
591         }
592 
593         // remove characters after </html> tag
594         index = lowerCaseInput.indexOf("</html>");
595 
596         if (lowerCaseInput.length() >= (index + 7)) {
597             lowerCaseInput = lowerCaseInput.substring(0, index + 7);
598         }
599 
600         return output.toString();
601     }
602 
603     /**
604      * parse text and transform every email-address
605      * in a HTML-conform address
606      *
607      * @param        s        input text
608      * @return        text with email-adresses transformed to links
609      *                         (null on error)
610      */
611     public static String   substituteEmailAddress(String   s) {
612         // due to bug CA-174 changed: return EMAIL_PATTERN.matcher(s).replaceAll("<A HREF=\"mailto:$1\">$1</A>");
613         return substituteEmailAddress(s, false);
614     }
615 
616     /**
617      * Transforms email-addresses into HTML just as
618      * substituteEmailAddress(String), but tries to ignore email-addresses,
619      * which are already links, if the ignore links flag is set.
620      * <br>
621      * This extended functionality is necessary when parsing a text which
622      * is already (partly) html.
623      * <br>
624      * FIXME: Can this be done smarter, i.e. directly with reg. expr. without manual parsing??
625      *
626      * @param         s                                input text
627      * @param        ignoreLinks                if true link tags are ignored. This gives a
628      *                                                         wrong result if some e-mail adresses are
629      *                                                         already links (but uses reg. expr. directly,
630      *                                                         and is therefore faster)
631      * @return        text with email-adresses transformed to links
632      */
633     public static String   substituteEmailAddress(String   s, boolean ignoreLinks) {
634         if (ignoreLinks) {
635             // Do not take existing link tags into account
636             return substituteEmailAddress(s);
637         }
638 
639         // initialisation
640         Matcher   noLinkMatcher = EMAIL_PATTERN.matcher(s);
641         Matcher   withLinkMatcher = EMAIL_PATTERN_INC_LINK.matcher(s);
642         int pos = 0; // current position in s
643         int length = s.length();
644         StringBuffer   buf = new StringBuffer  ();
645 
646         while (pos < length) {
647             if (noLinkMatcher.find(pos)) {
648                 // an email adress was found - check whether its already a link
649                 int s1 = noLinkMatcher.start();
650                 int e1 = noLinkMatcher.end();
651                 boolean insertLink;
652 
653                 if (withLinkMatcher.find(pos)) {
654                     // found an email address with links - is it the same?
655                     int s2 = withLinkMatcher.start();
656                     int e2 = withLinkMatcher.end();
657                     
658                     if ((s2 < s1) && (e2 > e1)) {
659                         // same email adress - just append and continue
660                         buf.append(s.substring(pos, e2));
661                         pos = e2;
662                         insertLink = false; // already handled
663                     } else {
664                         // not the same
665                         insertLink = true;
666                     }
667                 } else {
668                     // no match with mailto link tags
669 
670                     insertLink = true;
671                     
672                     // can be an email address in a link BUG CA-174
673                     // fix that with looking for an open link in the same line before
674                     // on the way from left to the current position of the email at s1
675                     // find the last open link <a
676                     Matcher   openLink = Pattern.compile("<a", Pattern.CASE_INSENSITIVE).matcher(s);
677                     Matcher   closeLink = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE).matcher(s);
678                     int linkPos = 0;
679                     int savedLinkPos = -1;
680                     while (linkPos < s1) {
681                         savedLinkPos = linkPos;
682                         if (openLink.find(linkPos)) 
683                             linkPos = openLink.end();
684                         else
685                             break;
686                     }
687                     
688                     // found an open link
689                     if (savedLinkPos > -1) {
690                         // check if it is closed
691                         if (closeLink.find(savedLinkPos)) {
692                             // if the closing mark is after the s1 mark do not insert a link
693                             if (closeLink.end() >= s1) {
694                                 buf.append(s.substring(pos, e1));
695                                 pos = e1;
696                                 insertLink = false; // already handled
697                             }
698                         }
699                     }
700                 }
701 
702                 // shall we insert a link?
703                 if (insertLink) {
704                     String   email = s.substring(s1, e1);
705                     String   link = "<a HREF=\"mailto:" + email + "\">" + email
706                             + "</a>";
707                     buf.append(s.substring(pos, s1));
708                     buf.append(link);
709                     pos = e1;
710                 }
711             } else {
712                 // no more matches - append rest of string
713                 buf.append(s.substring(pos));
714                 pos = length;
715             }
716         }
717 
718         // return result
719         String   result = buf.toString();
720         LOG.info("Result:\n" + result);
721 
722         return result;
723     }
724 
725     /**
726      * parse text and transform every url
727      * in a HTML-conform url
728      *
729      * @param        s        input text
730      * @return        text with urls transformed to links
731      *                         (null on error)
732      */
733     public static String   substituteURL(String   s) {
734         String   match;
735         Matcher   m = URL_PATTERN.matcher(s);
736         StringBuffer   sb = new StringBuffer  ();
737 
738         int pos = 0;
739         while (m.find()) {
740             match = m.group();
741             
742             sb.append(s.substring(pos, m.start()));
743             String   temp = "";
744             // Test if there is a trailing html tag
745             if( match.matches(".*<\\w+$") && s.length() > m.end() && s.charAt(m.end()) == '>') {
746                 temp = match.substring(match.lastIndexOf('<'));
747                 match = match.substring(0,match.lastIndexOf('<'));
748             }            
749             sb.append("<A HREF=\"" + match + "\">"+ match + "</A>");
750             sb.append(temp);
751             pos = m.end();
752         }
753 
754         sb.append(s.substring(pos));
755         
756         return sb.toString();
757     }
758 
759     /**
760      * Transforms urls into HTML just as substituteURL(String),
761      * but tries to ignore urls, which are already links, if the ignore
762      * links flag is set.
763      * <br>
764      * This extended functionality is necessary when parsing a text which
765      * is already (partly) html.
766      * <br>
767      * FIXME: Can this be done smarter, i.e. directly with reg. expr. without manual parsing??
768      *
769      * @param         s                                input text
770      * @param        ignoreLinks                if true link tags are ignored. This gives a
771      *                                                         wrong result if some urls are already links
772      *                                                         (but uses reg. expr. directly, and is
773      *                                                         therefore faster)
774      * @return        text with urls
775      */
776     public static String   substituteURL(String   s, boolean ignoreLinks) {
777         if (ignoreLinks) {
778             // Do not take existing link tags into account
779             return substituteURL(s);
780         }
781 
782         // initialisation
783         Matcher   noLinkMatcher = URL_PATTERN.matcher(s);
784         Matcher   withLinkMatcher = URL_PATTERN_INC_LINK.matcher(s);
785         int pos = 0; // current position in s
786         int length = s.length();
787         StringBuffer   buf = new StringBuffer  ();
788 
789         while (pos < length) {
790             if (noLinkMatcher.find(pos)) {
791                 // an url - check whether its already a link
792                 int s1 = noLinkMatcher.start();
793                 int e1 = noLinkMatcher.end();
794                 boolean insertLink;
795 
796                 if (withLinkMatcher.find(pos)) {
797                     // found an url with links - is it the same?
798                     int s2 = withLinkMatcher.start();
799                     int e2 = withLinkMatcher.end();
800 
801                     if ((s2 < s1) && (e2 > e1)) {
802                         // same url - just append and continue
803                         buf.append(s.substring(pos, e2));
804                         pos = e2;
805                         insertLink = false; // already handled
806                     } else {
807                         // not the same
808                         insertLink = true;
809                     }
810                 } else {
811                     // no match with link tags
812                     insertLink = true;
813                 }
814 
815                 // shall we insert a link?
816                 if (insertLink) {
817                     String   url = s.substring(s1, e1);
818                     String   link = "<a HREF=\"" + url + "\">" + url + "</a>";
819                     buf.append(s.substring(pos, s1));
820                     buf.append(link);
821                     pos = e1;
822                 }
823             } else {
824                 // no more matches - append rest of string
825                 buf.append(s.substring(pos));
826                 pos = length;
827             }
828         }
829 
830         // return result
831         String   result = buf.toString();
832         LOG.info("Result:\n" + result);
833 
834         return result;
835     }
836 
837     /**
838      * Extracts the body of a html document, i.e. the html contents
839      * between (and not including) body start and end tags.
840      *
841      * @param        html        The html document to extract the body from
842      * @return       The body of the html document
843      *
844      * @author        Karl Peder Olesen (karlpeder)
845      */
846     public static String   getHtmlBody(String   html) {
847         // locate body start- and end tags
848         String   lowerCaseContent = html.toLowerCase();
849         int tagStart = lowerCaseContent.indexOf("<body");
850 
851         // search for closing bracket separately to account for attributes in tag
852         int tagStartClose = lowerCaseContent.indexOf(">", tagStart) + 1;
853         int tagEnd = lowerCaseContent.indexOf("</body>");
854 
855         // correct limits if body tags where not found
856         if (tagStartClose < 0) {
857             tagStartClose = 0;
858         }
859 
860         if ((tagEnd < 0) || (tagEnd > lowerCaseContent.length())) {
861             tagEnd = lowerCaseContent.length();
862         }
863 
864         // return body
865         return html.substring(tagStartClose, tagEnd);
866     }
867 
868     /**
869      * Parses a html documents and removes all html comments found.
870      *
871      * @param        html        The html document
872      * @return        Html document without comments
873      *
874      * @author        Karl Peder Olesen (karlpeder)
875      */
876     public static String   removeComments(String   html) {
877         // remove comments
878         return COMMENTS_REMOVAL_PATTERN.matcher(html).replaceAll("");
879     }
880 }
881
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags