KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > columba > mail > parser > text > HtmlParser


1 //The contents of this file are subject to the Mozilla Public License Version 1.1
2
//(the "License"); you may not use this file except in compliance with the
3
//License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
4
//
5
//Software distributed under the License is distributed on an "AS IS" basis,
6
//WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
7
//for the specific language governing rights and
8
//limitations under the License.
9
//
10
//The Original Code is "The Columba Project"
11
//
12
//The Initial Developers of the Original Code are Frederik Dietz and Timo Stich.
13
//Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
14
//
15
//All Rights Reserved.
16
package org.columba.mail.parser.text;
17
18 import java.io.BufferedReader JavaDoc;
19 import java.io.StringReader JavaDoc;
20 import java.nio.ByteBuffer JavaDoc;
21 import java.nio.charset.Charset JavaDoc;
22 import java.util.logging.Logger JavaDoc;
23 import java.util.regex.Matcher JavaDoc;
24 import java.util.regex.Pattern JavaDoc;
25
26
27 /**
28  * Contains different utility functions for manipulating Html based
29  * text. This includes functionality for removing and restoring
30  * special entities (such as &, <, >, ...) and functionality for
31  * removing html tags from the text.
32  *
33  * @author Karl Peder Olesen (karlpeder), 20030623
34  *
35  */

36 public final class HtmlParser {
37
38     /**
39      * Utility classes should not have a public constructor.
40      */

41     private HtmlParser() {
42     }
43
44     private static final Logger JavaDoc LOG = Logger.getLogger("org.columba.mail.parser.text");
45
46     private static final Pattern JavaDoc BREAK_TO_NL_PATTERN = Pattern.compile("</?br>",
47             Pattern.CASE_INSENSITIVE);
48     private static final Pattern JavaDoc P_TO_DOUBLE_NL_PATTERN = Pattern.compile("</p>",
49             Pattern.CASE_INSENSITIVE);
50     private static final Pattern JavaDoc DIV_TO_DOUBLE_NL_PATTERN = Pattern.compile("</div>",
51             Pattern.CASE_INSENSITIVE);
52     private static final Pattern JavaDoc H_TO_DOUBLE_NL_PATTERN = Pattern.compile("</h\\d>",
53             Pattern.CASE_INSENSITIVE);
54     private static final Pattern JavaDoc WHITE_SPACE_REMOVAL_PATTERN = Pattern.compile("\\s+",
55             Pattern.CASE_INSENSITIVE);
56     private static final Pattern JavaDoc TRIM_SPACE_PATTERN = Pattern.compile("\n\\s+",
57             Pattern.CASE_INSENSITIVE);
58     private static final Pattern JavaDoc HEADER_REMOVAL_PATTERN = Pattern.compile("<html[^<]*<body[^>]*>",
59             Pattern.CASE_INSENSITIVE);
60     private static final Pattern JavaDoc STRIP_TAGS_PATTERN = Pattern.compile("<[^>]*>",
61             Pattern.CASE_INSENSITIVE);
62     private static final Pattern JavaDoc COMMENTS_REMOVAL_PATTERN = Pattern.compile("<!--[^-]*-->",
63             Pattern.CASE_INSENSITIVE);
64     private static final String JavaDoc EMAIL_STR = "([a-zA-Z0-9_+\\.-]+@([a-zA-Z0-9]+([\\.-][a-zA-Z0-9]+)*)+\\.[a-zA-Z]{2,4})";
65 //do the bug [997599] "\\b([^\\s@]+@[^\\s]+)\\b";
66
private static final Pattern JavaDoc EMAIL_PATTERN = Pattern.compile(EMAIL_STR);
67     private static final Pattern JavaDoc EMAIL_PATTERN_INC_LINK = Pattern.compile(
68             "<a[\\s\\n]*href=(\\\")?(mailto:)" +
69             EMAIL_STR
70             + "[^<]*</a>",
71             Pattern.CASE_INSENSITIVE);
72
73     private static final String JavaDoc PROT = "(http|https|ftp)";
74     private static final String JavaDoc PUNC = ".,:;?!\\-";
75     private static final String JavaDoc ANY = "\\S";
76     private static final String JavaDoc URL_STR = "\\b" + "(" + "(\\w*(:\\S*)?@)?" + PROT
77         + "://" + "[" + ANY + "]+" + ")" + "\\b";
78
79     /*
80              \\b Start at word boundary
81          (
82 (\\w*(:\\S*)?@)? [user:[pass]]@ - Construct
83 prot + ":// protocol and ://
84        ["+any+"] match literaly anything...
85          )
86  (?=\\s|$) ...until we find whitespace or end of String
87 */

88     private static final Pattern JavaDoc URL_PATTERN = Pattern.compile(URL_STR,
89             Pattern.CASE_INSENSITIVE);
90     private static final String JavaDoc URL_REPAIR_STR = "(.*://.*?)" + "(" + "(&gt;).*|"
91         + "([" + PUNC + "]*)" + "(<br>)?" + ")$";
92
93     /*
94 (.*://.*?)" "something" with ://
95           (could be .*? but then the Pattern would match whitespace)
96              (
97       (&gt;).* a html-Encoded > followed by anything
98                                                   | or
99 (["+punc+"]*)" any Punctuation
100         (<br>)? 0 or 1 trailing <br>
101              )$ end of String
102 */

103     private static final Pattern JavaDoc URL_REPAIR_PATTERN = Pattern.compile(URL_REPAIR_STR);
104     private static final Pattern JavaDoc URL_PATTERN_INC_LINK = Pattern.compile(
105             "<a( |\\n)*?href=(\\\")?" + URL_STR + "(.|\\n)*?</a>",
106             Pattern.CASE_INSENSITIVE);
107
108     // TODO (@author fdietz): Add more special entities - e.g. accenture chars such as ?
109

110     /** Special entities recognized by restore special entities */
111     // The form of the entities must be a regexp!
112
private static final String JavaDoc[] SPECIAL_ENTITIES = {
113             "&quot;", "&amp;", "&lt;", "&gt;",
114             "&nbsp;","&iexcl;","&cent;","&pound;","&curren;","&yen;","&brvbar;","&sect;",
115             "&uml;","&copy;","&ordf;","&laquo;","&not;","&shy;","&reg;","&macr;",
116             "&deg;","&plusmn;","&sup2;","&sup3;","&acute;","&micro;","&para;","&middot;",
117             "&cedil;","&sup1;","&ordm;","&raquo;","&frac14;","&frac12;","&frac34;","&iquest;",
118             "&Agrave;","&Aacute;","&Acirc;","&Atilde;","&Auml;","&Aring;","&AElig;","&Ccedil;",
119             "&Egrave;","&Eacute;","&Ecirc;","&Euml;","&Igrave;","&Iacute;","&Icirc;","&Iuml;",
120             "&ETH;","&Ntilde;","&Ograve;","&Oacute;","&Ocirc;","&Otilde;","&Ouml;","&times;",
121             "&Oslash;","&Ugrave;","&Uacute;","&Ucirc;","&Uuml;","&Yacute;","&THORN;","&szlig;",
122             "&agrave;","&aacute;","&acirc;","&atilde;","&auml;","&aring;","&aelig;","&ccedil;",
123             "&egrave;","&eacute;","&ecirc;","&euml;","&igrave;","&iacute;","&icirc;","&iuml;",
124             "&eth;","&ntilde;","&ograve;","&oacute;","&ocirc;","&otilde;","&ouml;","&divide;",
125             "&oslash;","&ugrave;","&uacute;","&ucirc;","&uuml;","&yacute;","&thorn;","&yuml;" };
126
127     /** Normal chars corresponding to the defined special entities */
128     private static final String JavaDoc[] ENTITY_STRINGS = {
129             "\"", "&", "<", ">",
130             "\u00a0","\u00a1","\u00a2","\u00a3","\u00a4","\u00a5","\u00a6","\u00a7",
131             "\u00a8","\u00a9","\u00aa","\u00ab","\u00ac","\u00ad","\u00ae","\u00af",
132             "\u00b0","\u00b1","\u00b2","\u00b3","\u00b4","\u00b5","\u00b6","\u00b7",
133             "\u00b8","\u00b9","\u00ba","\u00bb","\u00bc","\u00bd","\u00be","\u00bf",
134             "\u00c0","\u00c1","\u00c2","\u00c3","\u00c4","\u00c5","\u00c6","\u00c7",
135             "\u00c8","\u00c9","\u00ca","\u00cb","\u00cc","\u00cd","\u00ce","\u00cf",
136             "\u00d0","\u00d1","\u00d2","\u00d3","\u00d4","\u00d5","\u00d6","\u00d7",
137             "\u00d8","\u00d9","\u00da","\u00db","\u00dc","\u00dd","\u00de","\u00df",
138             "\u00e0","\u00e1","\u00e2","\u00e3","\u00e4","\u00e5","\u00e6","\u00e7",
139             "\u00e8","\u00e9","\u00ea","\u00eb","\u00ec","\u00ed","\u00ee","\u00ef",
140             "\u00f0","\u00f1","\u00f2","\u00f3","\u00f4","\u00f5","\u00f6","\u00f7",
141             "\u00f8","\u00f9","\u00fa","\u00fb","\u00fc","\u00fd","\u00fe","\u00ff"
142             };
143
144     private static final Pattern JavaDoc SPECIAL_PATTERN = Pattern.compile("&#(\\d+);");
145
146     private static final Pattern JavaDoc CHARSET_PATTERN=Pattern.compile("\\bcharset=([\\w-_\\d]+)\\b");
147     
148     
149     
150     /**
151      * Strips html tags and removes extra spaces which occurs due
152      * to e.g. indentation of the html and the head section, which does
153      * not contain any textual information.
154      * <br>
155      * The conversion rutine does the following:<br>
156      * 1. Removes the header from the html file, i.e. everything from
157      * the html tag until and including the starting body tag.<br>
158      * 2. Replaces multiple consecutive whitespace characters with a single
159      * space (since extra whitespace should be ignored in html).<br>
160      * 3. Replaces ending br tags with a single newline character<br>
161      * 4. Replaces ending p, div and heading tags with two newlines characters;
162      * resulting in a single empty line btw. paragraphs.<br>
163      * 5. Strips remaining html tags.<br>
164      * <br>
165      * NB: The tag stripping is done using a very simple regular expression,
166      * which removes everything between &lt and &gt. Therefore too much text
167      * could in some (hopefully rare!?) cases be removed.
168      *
169      * @param s Input string
170      * @return Input stripped for html tags
171      * @author Karl Peder Olesen (karlpeder)
172      */

173     public static String JavaDoc stripHtmlTags(String JavaDoc s) {
174         // initial check of input:
175
if (s == null) {
176             return null;
177         }
178
179         // remove header
180
s = HEADER_REMOVAL_PATTERN.matcher(s).replaceAll("");
181
182         // remove extra whitespace
183
s = WHITE_SPACE_REMOVAL_PATTERN.matcher(s).replaceAll(" ");
184
185         // replace br, p and heading tags with newlines
186
s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
187         s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
188         s = DIV_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
189         s = H_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
190
191         // strip remaining tags
192
s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
193
194         // tag stripping can leave some double spaces at line beginnings
195
s = TRIM_SPACE_PATTERN.matcher(s).replaceAll("\n").trim();
196
197         return s;
198     }
199
200     /**
201      * Strips html tags. The method used is very simple:
202      * Everything between tag-start (&lt) and tag-end (&gt) is removed.
203      * Optionaly br tags are replaced by newline and ending p tags with
204      * double newline.
205      *
206      * @param s input string
207      * @param breakToNl if true, newlines are inserted for br and p tags
208      * @return output without html tags (null on error)
209      * @author karlpeder, 20030623
210      * (moved from org.columba.mail.gui.message.util.DocumentParser)
211      *
212      * @deprecated Please use the more advanced and correct
213      * @see stripHtmlTags(String) method
214      */

215     public static String JavaDoc stripHtmlTags(String JavaDoc s, boolean breakToNl) {
216         // initial check of input:
217
if (s == null) {
218             return null;
219         }
220
221         if (breakToNl) {
222             // replace <br> and </br> with newline
223
s = BREAK_TO_NL_PATTERN.matcher(s).replaceAll("\n");
224
225             // replace </p> with double newline
226
s = P_TO_DOUBLE_NL_PATTERN.matcher(s).replaceAll("\n\n");
227         }
228
229         // strip tags
230
s = STRIP_TAGS_PATTERN.matcher(s).replaceAll("");
231
232         return s;
233     }
234
235     /**
236      * Performs in large terms the reverse of
237      * substituteSpecialCharacters (though br tags are not
238      * converted to newlines, this should be handled separately).
239      * More preciesly it changes special entities like
240      * amp, nbsp etc. to their real counter parts: &, space etc.
241      * <br>
242      * This includes transformation of special (language specific) chars
243      * such as the Danish ? ? ? ? ? ?.
244      *
245      * @param s input string
246      * @return output with special entities replaced with their
247      * "real" counter parts (null on error)
248      * @author karlpeder, 20030623
249      * (moved from org.columba.mail.gui.message.util.DocumentParser)
250      */

251     public static String JavaDoc restoreSpecialCharacters(Charset JavaDoc charset, String JavaDoc s) {
252
253         //First replace all special entities
254
for( int i=0; i<SPECIAL_ENTITIES.length; i++) {
255             s = s.replaceAll(SPECIAL_ENTITIES[i],ENTITY_STRINGS[i]);
256         }
257
258         StringBuffer JavaDoc result = new StringBuffer JavaDoc(s.length());
259         
260         //replace the other entities
261
Matcher JavaDoc matcher = SPECIAL_PATTERN.matcher(s);
262         while( matcher.find()) {
263             matcher.appendReplacement(result, charset.decode( ByteBuffer.wrap(new byte[]{ (byte) Integer.parseInt(matcher.group(1))})).toString());
264         }
265         matcher.appendTail(result);
266         
267         //Convert 4 WS in a row to a tab
268
return result.toString().replaceAll(" ","\t");
269     }
270
271     public static Charset JavaDoc getHtmlCharset(String JavaDoc htmlSource) {
272         Matcher JavaDoc matcher = CHARSET_PATTERN.matcher(htmlSource);
273         if( matcher.find() ) {
274             try {
275                 return Charset.forName(matcher.group(1));
276             } catch (RuntimeException JavaDoc e) {
277             }
278         }
279         
280         return Charset.forName(System.getProperty("file.encoding"));
281     }
282     
283     /**
284      * Strips html tags. and replaces special entities with their
285      * "normal" counter parts, e.g. <code>&gt; => ></code>.<br>
286      * Calling this method is the same as calling first stripHtmlTags
287      * and then restoreSpecialCharacters.
288      *
289      * @param html input string
290      * @return output without html tags and special entities
291      * (null on error)
292      * @author karlpeder, 20030623
293      * (moved from org.columba.mail.parser.text.BodyTextParser)
294      */

295     public static String JavaDoc htmlToText(String JavaDoc html) {
296         // stripHtmlTags called with true ~ p & br => newlines
297
Charset JavaDoc charset = getHtmlCharset(html);
298         
299         String JavaDoc text = stripHtmlTags(html);
300
301         return restoreSpecialCharacters(charset, text);
302     }
303     /**
304      * Replaces special chars - <,>,&,\t,\n," - with the special
305      * entities used in html (amp, nbsp, ...). Then the complete
306      * text is surrounded with proper html tags: Starting- and
307      * ending html tag, header section and body section.
308      * The complete body section is sorround with p tags.
309      * <br>
310      * This is the same as first calling substituteSpecialCharacters
311      * and then add starting and ending html tags etc.
312      * <br>
313      * Further more urls and email adresses are converted into links
314      * Optionally a title and css definition is inserted in the
315      * html header.
316      * <br>
317      *
318      * TODO (@author fdietz): Add support for smilies and coloring of quoted text
319      *
320      * @param text Text to convert to html
321      * @param title Title to include in header, not used if null
322      * @param css Style sheet def. to include in header,
323      * not used if null.
324      * The input shall not include the style tag
325      * @return Text converted to html
326      * @author Karl Peder Olesen (karlpeder), 20030916
327      */

328     public static String JavaDoc textToHtml(String JavaDoc text, String JavaDoc title, String JavaDoc css, String JavaDoc charset) {
329         // convert special characters
330
String JavaDoc html = HtmlParser.substituteSpecialCharacters(text);
331
332         // parse for urls / email adresses and substite with HTML-code
333
// html = HtmlParser.substituteURL(html);
334
// html = HtmlParser.substituteEmailAddress(html);
335

336         // insert surrounding html tags
337
StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
338         buf.append("<html><head>");
339         buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=" + charset +"\">");
340         
341         if (title != null) {
342             buf.append("<title>");
343             buf.append(title);
344             buf.append("</title>");
345         }
346
347         if (css != null) {
348             buf.append("<style type=\"text/css\"><!-- ");
349             buf.append(css);
350             buf.append(" --></style>");
351         }
352
353         buf.append("</head><body><p>");
354         buf.append(html);
355         buf.append("</p></body></html>");
356
357         return buf.toString();
358     }
359
360     /**
361      * Substitute special characters like:
362      * <,>,&,\t,\n,"
363      * with special entities used in html (amp, nbsp, ...)
364      *
365      * @param s input string containing special characters
366      * @return output with special characters substituted
367      * (null on error)
368      */

369     public static String JavaDoc substituteSpecialCharacters(String JavaDoc s) {
370         StringBuffer JavaDoc sb = new StringBuffer JavaDoc(s.length());
371         StringReader JavaDoc sr = new StringReader JavaDoc(s);
372         BufferedReader JavaDoc br = new BufferedReader JavaDoc(sr);
373         String JavaDoc ss = null;
374
375         try {
376             while ((ss = br.readLine()) != null) {
377                 int i = 0;
378
379                 while (i < ss.length()) {
380                     switch (ss.charAt(i)) {
381                     case '<':
382                         sb.append("&lt;");
383                         i++;
384
385                         break;
386
387                     case '>':
388                         sb.append("&gt;");
389                         i++;
390
391                         break;
392
393                     case '&':
394                         sb.append("&amp;");
395                         i++;
396
397                         break;
398
399                     case '"':
400                         sb.append("&quot;");
401                         i++;
402
403                         break;
404
405                     case ' ':
406
407                         //sb.append("&nbsp;");
408
if (ss.substring(i).startsWith(" ")) {
409                             sb.append("&nbsp; ");
410                             i = i + 2;
411                         } else if (ss.substring(i).startsWith(" ")) {
412                             sb.append("&nbsp;&nbsp; ");
413                             i = i + 3;
414                         } else if (ss.substring(i).startsWith(" ")) {
415                             sb.append("&nbsp; ");
416                             i = i + 2;
417                         } else {
418                             sb.append(' ');
419                             i++;
420                         }
421
422                         break;
423
424                     case '\t':
425                         sb.append("&nbsp;&nbsp;&nbsp;&nbsp;");
426                         i++;
427
428                         break;
429
430                     case '\n':
431                         sb.append("<br>");
432                         i++;
433
434                         break;
435
436                     default:
437                         sb.append(ss.charAt(i));
438                         i++;
439
440                         break;
441                     }
442                 }
443
444                 sb.append("<br>\n");
445             }
446         } catch (Exception JavaDoc e) {
447             LOG.severe("Error substituting special characters: "
448                     + e.getMessage());
449
450             return null; // error
451
}
452
453         return sb.toString();
454     }
455
456     /**
457      *
458      * substitute special characters like:
459      * <,>,&,\t,\n
460      * with special entities used in html<br>
461      * This is the same as substituteSpecialCharacters, but
462      * here an extra newline character is not inserted.
463      *
464      * @param s input string containing special characters
465      * @return output with special characters substituted
466      * (null on error)
467      */

468     public static String JavaDoc substituteSpecialCharactersInHeaderfields(String JavaDoc s) {
469         StringBuffer JavaDoc sb = new StringBuffer JavaDoc(s.length());
470         StringReader JavaDoc sr = new StringReader JavaDoc(s);
471         BufferedReader JavaDoc br = new BufferedReader JavaDoc(sr);
472         String JavaDoc ss = null;
473
474         // TODO (@author karlpeder): Extend handling of special entities as in restoreSpecialCharacters
475

476         /*
477  * *20030623, karlpeder* " and space handled also
478  */

479         try {
480             while ((ss = br.readLine()) != null) {
481                 int i = 0;
482
483                 while (i < ss.length()) {
484                     switch (ss.charAt(i)) {
485                     case '<':
486                         sb.append("&lt;");
487                         i++;
488
489                         break;
490
491                     case '>':
492                         sb.append("&gt;");
493                         i++;
494
495                         break;
496
497                     case '&':
498                         sb.append("&amp;");
499                         i++;
500
501                         break;
502
503                     case '"':
504                         sb.append("&quot;");
505                         i++;
506
507                         break;
508
509                         /*
510                     case '\'':
511                         sb.append("&apos;");
512                         i++;
513
514                         break;*/

515
516                     case ' ':
517
518                         if (ss.substring(i).startsWith(" ")) {
519                             sb.append("&nbsp; ");
520                             i = i + 2;
521                         } else if (ss.substring(i).startsWith(" ")) {
522                             sb.append("&nbsp;&nbsp; ");
523                             i = i + 3;
524                         } else if (ss.substring(i).startsWith(" ")) {
525                             sb.append("&nbsp; ");
526                             i = i + 2;
527                         } else {
528                             sb.append(' ');
529                             i++;
530                         }
531
532                         break;
533
534                     case '\t':
535                         sb.append("&nbsp;&nbsp;&nbsp;&nbsp;");
536                         i++;
537
538                         break;
539
540                     case '\n':
541                         sb.append("<br>");
542                         i++;
543
544                         break;
545
546                     default:
547                         sb.append(ss.charAt(i));
548                         i++;
549
550                         break;
551                     }
552                 }
553             }
554         } catch (Exception JavaDoc e) {
555             LOG.severe("Error substituting special characters: "
556                     + e.getMessage());
557
558             return null; // error
559
}
560
561         return sb.toString();
562     }
563
564     /**
565      * Tries to fix broken html-strings by inserting
566      * html start- and end tags if missing, and by
567      * removing content after the html end tag.
568      *
569      * @param input html content to be validated
570      * @return content with extra tags inserted if necessary
571      */

572     public static String JavaDoc validateHTMLString(String JavaDoc input) {
573         StringBuffer JavaDoc output = new StringBuffer JavaDoc(input);
574         int index = 0;
575
576         String JavaDoc lowerCaseInput = input.toLowerCase();
577
578         // Check for missing <html> tag
579
if (lowerCaseInput.indexOf("<html>") == -1) {
580             if (lowerCaseInput.indexOf("<!doctype") != -1) {
581                 index = lowerCaseInput.indexOf("\n",
582                         lowerCaseInput.indexOf("<!doctype")) + 1;
583             }
584
585             output.insert(index, "<html>");
586         }
587
588         // Check for missing </html> tag
589
if (lowerCaseInput.indexOf("</html>") == -1) {
590             output.append("</html>");
591         }
592
593         // remove characters after </html> tag
594
index = lowerCaseInput.indexOf("</html>");
595
596         if (lowerCaseInput.length() >= (index + 7)) {
597             lowerCaseInput = lowerCaseInput.substring(0, index + 7);
598         }
599
600         return output.toString();
601     }
602
603     /**
604      * parse text and transform every email-address
605      * in a HTML-conform address
606      *
607      * @param s input text
608      * @return text with email-adresses transformed to links
609      * (null on error)
610      */

611     public static String JavaDoc substituteEmailAddress(String JavaDoc s) {
612         // due to bug CA-174 changed: return EMAIL_PATTERN.matcher(s).replaceAll("<A HREF=\"mailto:$1\">$1</A>");
613
return substituteEmailAddress(s, false);
614     }
615
616     /**
617      * Transforms email-addresses into HTML just as
618      * substituteEmailAddress(String), but tries to ignore email-addresses,
619      * which are already links, if the ignore links flag is set.
620      * <br>
621      * This extended functionality is necessary when parsing a text which
622      * is already (partly) html.
623      * <br>
624      * FIXME: Can this be done smarter, i.e. directly with reg. expr. without manual parsing??
625      *
626      * @param s input text
627      * @param ignoreLinks if true link tags are ignored. This gives a
628      * wrong result if some e-mail adresses are
629      * already links (but uses reg. expr. directly,
630      * and is therefore faster)
631      * @return text with email-adresses transformed to links
632      */

633     public static String JavaDoc substituteEmailAddress(String JavaDoc s, boolean ignoreLinks) {
634         if (ignoreLinks) {
635             // Do not take existing link tags into account
636
return substituteEmailAddress(s);
637         }
638
639         // initialisation
640
Matcher JavaDoc noLinkMatcher = EMAIL_PATTERN.matcher(s);
641         Matcher JavaDoc withLinkMatcher = EMAIL_PATTERN_INC_LINK.matcher(s);
642         int pos = 0; // current position in s
643
int length = s.length();
644         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
645
646         while (pos < length) {
647             if (noLinkMatcher.find(pos)) {
648                 // an email adress was found - check whether its already a link
649
int s1 = noLinkMatcher.start();
650                 int e1 = noLinkMatcher.end();
651                 boolean insertLink;
652
653                 if (withLinkMatcher.find(pos)) {
654                     // found an email address with links - is it the same?
655
int s2 = withLinkMatcher.start();
656                     int e2 = withLinkMatcher.end();
657                     
658                     if ((s2 < s1) && (e2 > e1)) {
659                         // same email adress - just append and continue
660
buf.append(s.substring(pos, e2));
661                         pos = e2;
662                         insertLink = false; // already handled
663
} else {
664                         // not the same
665
insertLink = true;
666                     }
667                 } else {
668                     // no match with mailto link tags
669

670                     insertLink = true;
671                     
672                     // can be an email address in a link BUG CA-174
673
// fix that with looking for an open link in the same line before
674
// on the way from left to the current position of the email at s1
675
// find the last open link <a
676
Matcher JavaDoc openLink = Pattern.compile("<a", Pattern.CASE_INSENSITIVE).matcher(s);
677                     Matcher JavaDoc closeLink = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE).matcher(s);
678                     int linkPos = 0;
679                     int savedLinkPos = -1;
680                     while (linkPos < s1) {
681                         savedLinkPos = linkPos;
682                         if (openLink.find(linkPos))
683                             linkPos = openLink.end();
684                         else
685                             break;
686                     }
687                     
688                     // found an open link
689
if (savedLinkPos > -1) {
690                         // check if it is closed
691
if (closeLink.find(savedLinkPos)) {
692                             // if the closing mark is after the s1 mark do not insert a link
693
if (closeLink.end() >= s1) {
694                                 buf.append(s.substring(pos, e1));
695                                 pos = e1;
696                                 insertLink = false; // already handled
697
}
698                         }
699                     }
700                 }
701
702                 // shall we insert a link?
703
if (insertLink) {
704                     String JavaDoc email = s.substring(s1, e1);
705                     String JavaDoc link = "<a HREF=\"mailto:" + email + "\">" + email
706                             + "</a>";
707                     buf.append(s.substring(pos, s1));
708                     buf.append(link);
709                     pos = e1;
710                 }
711             } else {
712                 // no more matches - append rest of string
713
buf.append(s.substring(pos));
714                 pos = length;
715             }
716         }
717
718         // return result
719
String JavaDoc result = buf.toString();
720         LOG.info("Result:\n" + result);
721
722         return result;
723     }
724
725     /**
726      * parse text and transform every url
727      * in a HTML-conform url
728      *
729      * @param s input text
730      * @return text with urls transformed to links
731      * (null on error)
732      */

733     public static String JavaDoc substituteURL(String JavaDoc s) {
734         String JavaDoc match;
735         Matcher JavaDoc m = URL_PATTERN.matcher(s);
736         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
737
738         int pos = 0;
739         while (m.find()) {
740             match = m.group();
741             
742             sb.append(s.substring(pos, m.start()));
743             String JavaDoc temp = "";
744             // Test if there is a trailing html tag
745
if( match.matches(".*<\\w+$") && s.length() > m.end() && s.charAt(m.end()) == '>') {
746                 temp = match.substring(match.lastIndexOf('<'));
747                 match = match.substring(0,match.lastIndexOf('<'));
748             }
749             sb.append("<A HREF=\"" + match + "\">"+ match + "</A>");
750             sb.append(temp);
751             pos = m.end();
752         }
753
754         sb.append(s.substring(pos));
755         
756         return sb.toString();
757     }
758
759     /**
760      * Transforms urls into HTML just as substituteURL(String),
761      * but tries to ignore urls, which are already links, if the ignore
762      * links flag is set.
763      * <br>
764      * This extended functionality is necessary when parsing a text which
765      * is already (partly) html.
766      * <br>
767      * FIXME: Can this be done smarter, i.e. directly with reg. expr. without manual parsing??
768      *
769      * @param s input text
770      * @param ignoreLinks if true link tags are ignored. This gives a
771      * wrong result if some urls are already links
772      * (but uses reg. expr. directly, and is
773      * therefore faster)
774      * @return text with urls
775      */

776     public static String JavaDoc substituteURL(String JavaDoc s, boolean ignoreLinks) {
777         if (ignoreLinks) {
778             // Do not take existing link tags into account
779
return substituteURL(s);
780         }
781
782         // initialisation
783
Matcher JavaDoc noLinkMatcher = URL_PATTERN.matcher(s);
784         Matcher JavaDoc withLinkMatcher = URL_PATTERN_INC_LINK.matcher(s);
785         int pos = 0; // current position in s
786
int length = s.length();
787         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
788
789         while (pos < length) {
790             if (noLinkMatcher.find(pos)) {
791                 // an url - check whether its already a link
792
int s1 = noLinkMatcher.start();
793                 int e1 = noLinkMatcher.end();
794                 boolean insertLink;
795
796                 if (withLinkMatcher.find(pos)) {
797                     // found an url with links - is it the same?
798
int s2 = withLinkMatcher.start();
799                     int e2 = withLinkMatcher.end();
800
801                     if ((s2 < s1) && (e2 > e1)) {
802                         // same url - just append and continue
803
buf.append(s.substring(pos, e2));
804                         pos = e2;
805                         insertLink = false; // already handled
806
} else {
807                         // not the same
808
insertLink = true;
809                     }
810                 } else {
811                     // no match with link tags
812
insertLink = true;
813                 }
814
815                 // shall we insert a link?
816
if (insertLink) {
817                     String JavaDoc url = s.substring(s1, e1);
818                     String JavaDoc link = "<a HREF=\"" + url + "\">" + url + "</a>";
819                     buf.append(s.substring(pos, s1));
820                     buf.append(link);
821                     pos = e1;
822                 }
823             } else {
824                 // no more matches - append rest of string
825
buf.append(s.substring(pos));
826                 pos = length;
827             }
828         }
829
830         // return result
831
String JavaDoc result = buf.toString();
832         LOG.info("Result:\n" + result);
833
834         return result;
835     }
836
837     /**
838      * Extracts the body of a html document, i.e. the html contents
839      * between (and not including) body start and end tags.
840      *
841      * @param html The html document to extract the body from
842      * @return The body of the html document
843      *
844      * @author Karl Peder Olesen (karlpeder)
845      */

846     public static String JavaDoc getHtmlBody(String JavaDoc html) {
847         // locate body start- and end tags
848
String JavaDoc lowerCaseContent = html.toLowerCase();
849         int tagStart = lowerCaseContent.indexOf("<body");
850
851         // search for closing bracket separately to account for attributes in tag
852
int tagStartClose = lowerCaseContent.indexOf(">", tagStart) + 1;
853         int tagEnd = lowerCaseContent.indexOf("</body>");
854
855         // correct limits if body tags where not found
856
if (tagStartClose < 0) {
857             tagStartClose = 0;
858         }
859
860         if ((tagEnd < 0) || (tagEnd > lowerCaseContent.length())) {
861             tagEnd = lowerCaseContent.length();
862         }
863
864         // return body
865
return html.substring(tagStartClose, tagEnd);
866     }
867
868     /**
869      * Parses a html documents and removes all html comments found.
870      *
871      * @param html The html document
872      * @return Html document without comments
873      *
874      * @author Karl Peder Olesen (karlpeder)
875      */

876     public static String JavaDoc removeComments(String JavaDoc html) {
877         // remove comments
878
return COMMENTS_REMOVAL_PATTERN.matcher(html).replaceAll("");
879     }
880 }
881
Popular Tags