HtmlEncoder


1   /*
2    * Helma License Notice
3    *
4    * The contents of this file are subject to the Helma License
5    * Version 2.0 (the "License"). You may not use this file except in
6    * compliance with the License. A copy of the License is available at
7    * http://adele.helma.org/download/helma/license.txt
8    *
9    * http://adele.helma.org/download/helma/license.txt:
10   *
11  
12   Copyright (c) 1999-2002 Helma Project. All rights reserved.
13  
14   Redistribution and use in source and binary forms, with or without
15   modification, are permitted provided that the following conditions
16   are met:
17  
18   1. Redistributions of source code must retain the above copyright
19      notice, this list of conditions and the following disclaimer.
20  
21   2. Redistributions in binary form must reproduce the above copyright
22      notice, this list of conditions and the following disclaimer in
23      the documentation and/or other materials provided with the
24      distribution.
25  
26   3. Products derived from this software may not be called "Helma"
27      or "Hop", nor may "Helma" or "Hop" appear in their name, without
28      prior written permission of the Helma Project Group. For written 
29      permission, please contact helma@helma.org.
30  
31  
32   THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
33   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
35   DISCLAIMED. IN NO EVENT SHALL THE HELMA PROJECT OR ITS
36   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
37   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
38   NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
39   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
41   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
42   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
43   OF THE POSSIBILITY OF SUCH DAMAGE.
44   */
45  
46  package freecs.util;
47  
48  import java.util.*;
49  
50  /**
51   * This is a utility class to encode special characters and do formatting
52   * for HTML output.
53   * @author Hannes Wallnoefer
54   */
55  public final class HtmlEncoder {
56  
57      // transformation table for characters 128 to 255. These actually fall into two
58      // groups, put together for efficiency: "Windows" characters 128-159 such as
59      // "smart quotes", which are encoded to valid Unicode entities, and
60      // valid ISO-8859 characters 160-255, which are encoded to the symbolic HTML
61      // entity. Everything >= 256 is encoded to a numeric entity.
62      //
63      // for more on HTML entities see http://www.pemberley.com/janeinfo/latin1.html  and
64      // ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
65      //
66      static final String  [] transform =  {
67          "&euro;",   // 128
68          "",           // empty string means character is undefined in unicode
69          "&#8218;",
70          "&#402;",
71          "&#8222;",
72          "&#8230;",
73          "&#8224;",
74          "&#8225;",
75          "&#710;",
76          "&#8240;",
77          "&#352;",
78          "&#8249;",
79          "&#338;",
80          "",
81          "&#381;",
82          "",
83          "",
84          "&#8216;",
85          "&#8217;",
86          "&#8220;",
87          "&#8221;",
88          "&#8226;",
89          "&#8211;",
90          "&#8212;",
91          "&#732;",
92          "&#8482;",
93          "&#353;",
94          "&#8250;",
95          "&#339;",
96          "",
97          "&#382;",
98          "&#376;",  // 159
99          "&nbsp;",    // 160
100         "&iexcl;",
101         "&cent;",
102         "&pound;",
103         "&curren;",
104         "&yen;",
105         "&brvbar;",
106         "&sect;",
107         "&uml;",
108         "&copy;",
109         "&ordf;",
110         "&laquo;",
111         "&not;",
112         "&shy;",
113         "&reg;",
114         "&macr;",
115         "&deg;",
116         "&plusmn;",
117         "&sup2;",
118         "&sup3;",
119         "&acute;",
120         "&micro;",
121         "&para;",
122         "&middot;",
123         "&cedil;",
124         "&sup1;",
125         "&ordm;",
126         "&raquo;",
127         "&frac14;",
128         "&frac12;",
129         "&frac34;",
130         "&iquest;",
131         "&Agrave;",
132         "&Aacute;",
133         "&Acirc;",
134         "&Atilde;",
135         "&Auml;",
136         "&Aring;",
137         "&AElig;",
138         "&Ccedil;",
139         "&Egrave;",
140         "&Eacute;",
141         "&Ecirc;",
142         "&Euml;",
143         "&Igrave;",
144         "&Iacute;",
145         "&Icirc;",
146         "&Iuml;",
147         "&ETH;",
148         "&Ntilde;",
149         "&Ograve;",
150         "&Oacute;",
151         "&Ocirc;",
152         "&Otilde;",
153         "&Ouml;",
154         "&times;",
155         "&Oslash;",
156         "&Ugrave;",
157         "&Uacute;",
158         "&Ucirc;",
159         "&Uuml;",
160         "&Yacute;",
161         "&THORN;",
162         "&szlig;",
163         "&agrave;",
164         "&aacute;",
165         "&acirc;",
166         "&atilde;",
167         "&auml;",
168         "&aring;",
169         "&aelig;",
170         "&ccedil;",
171         "&egrave;",
172         "&eacute;",
173         "&ecirc;",
174         "&euml;",
175         "&igrave;",
176         "&iacute;",
177         "&icirc;",
178         "&iuml;",
179         "&eth;",
180         "&ntilde;",
181         "&ograve;",
182         "&oacute;",
183         "&ocirc;",
184         "&otilde;",
185         "&ouml;",
186         "&divide;",
187         "&oslash;",
188         "&ugrave;",
189         "&uacute;",
190         "&ucirc;",
191         "&uuml;",
192         "&yacute;",
193         "&thorn;",
194         "&yuml;"    // 255
195     };
196 
197     static final HashSet allTags = new HashSet();
198 
199     static {
200         allTags.add("a");
201         allTags.add("abbr");
202         allTags.add("acronym");
203         allTags.add("address");
204         allTags.add("applet");
205         allTags.add("area");
206         allTags.add("b");
207         allTags.add("base");
208         allTags.add("basefont");
209         allTags.add("bdo");
210         allTags.add("bgsound");
211         allTags.add("big");
212         allTags.add("blink");
213         allTags.add("blockquote");
214         allTags.add("bq");
215         allTags.add("body");
216         allTags.add("br");
217         allTags.add("button");
218         allTags.add("caption");
219         allTags.add("center");
220         allTags.add("cite");
221         allTags.add("code");
222         allTags.add("col");
223         allTags.add("colgroup");
224         allTags.add("del");
225         allTags.add("dfn");
226         allTags.add("dir");
227         allTags.add("div");
228         allTags.add("dl");
229         allTags.add("dt");
230         allTags.add("dd");
231         allTags.add("em");
232         allTags.add("embed");
233         allTags.add("fieldset");
234         allTags.add("font");
235         allTags.add("form");
236         allTags.add("frame");
237         allTags.add("frameset");
238         allTags.add("h1");
239         allTags.add("h2");
240         allTags.add("h3");
241         allTags.add("h4");
242         allTags.add("h5");
243         allTags.add("h6");
244         allTags.add("head");
245         allTags.add("html");
246         allTags.add("hr");
247         allTags.add("i");
248         allTags.add("iframe");
249         allTags.add("img");
250         allTags.add("input");
251         allTags.add("ins");
252         allTags.add("isindex");
253         allTags.add("kbd");
254         allTags.add("label");
255         allTags.add("legend");
256         allTags.add("li");
257         allTags.add("link");
258         allTags.add("listing");
259         allTags.add("map");
260         allTags.add("marquee");
261         allTags.add("menu");
262         allTags.add("meta");
263         allTags.add("nobr");
264         allTags.add("noframes");
265         allTags.add("noscript");
266         allTags.add("object");
267         allTags.add("ol");
268         allTags.add("option");
269         allTags.add("optgroup");
270         allTags.add("p");
271         allTags.add("param");
272         allTags.add("plaintext");
273         allTags.add("pre");
274         allTags.add("q");
275         allTags.add("s");
276         allTags.add("samp");
277         allTags.add("script");
278         allTags.add("select");
279         allTags.add("small");
280         allTags.add("span");
281         allTags.add("strike");
282         allTags.add("strong");
283         allTags.add("style");
284         allTags.add("sub");
285         allTags.add("sup");
286         allTags.add("table");
287         allTags.add("tbody");
288         allTags.add("td");
289         allTags.add("textarea");
290         allTags.add("tfoot");
291         allTags.add("th");
292         allTags.add("thead");
293         allTags.add("title");
294         allTags.add("tr");
295         allTags.add("tt");
296         allTags.add("u");
297         allTags.add("ul");
298         allTags.add("var");
299         allTags.add("wbr");
300         allTags.add("xmp");
301     }
302 
303     // HTML block tags need to suppress automatic newline to <br>
304     // conversion around them to look good. However, they differ
305     // in how many newlines around them should be ignored. These sets
306     // help to treat each tag right in newline conversion.
307     static final HashSet internalTags = new HashSet();
308     static final HashSet blockTags = new HashSet();
309     static final HashSet semiBlockTags = new HashSet();
310 
311     static {
312         // actual block level elements
313         semiBlockTags.add("address");
314         semiBlockTags.add("dir");
315         semiBlockTags.add("div");
316         semiBlockTags.add("table");
317 
318         blockTags.add("blockquote");
319         blockTags.add("center");
320         blockTags.add("dl");
321         blockTags.add("fieldset");
322         blockTags.add("form");
323         blockTags.add("h1");
324         blockTags.add("h2");
325         blockTags.add("h3");
326         blockTags.add("h4");
327         blockTags.add("h5");
328         blockTags.add("h6");
329         blockTags.add("hr");
330         blockTags.add("isindex");
331         blockTags.add("ol");
332         blockTags.add("p");
333         blockTags.add("pre");
334         blockTags.add("ul");
335 
336         internalTags.add("menu");
337         internalTags.add("noframes");
338         internalTags.add("noscript");
339 
340         /// to be treated as block level elements
341         semiBlockTags.add("th");
342 
343         blockTags.add("br");
344         blockTags.add("dd");
345         blockTags.add("dt");
346         blockTags.add("frameset");
347         blockTags.add("li");
348         blockTags.add("td");
349 
350         internalTags.add("tbody");
351         internalTags.add("tfoot");
352         internalTags.add("thead");
353         internalTags.add("tr");
354     }
355 
356     // set of tags that are always empty
357     static final HashSet emptyTags = new HashSet();
358 
359     static {
360         emptyTags.add("area");
361         emptyTags.add("base");
362         emptyTags.add("basefont");
363         emptyTags.add("br");
364         emptyTags.add("col");
365         emptyTags.add("frame");
366         emptyTags.add("hr");
367         emptyTags.add("img");
368         emptyTags.add("input");
369         emptyTags.add("isindex");
370         emptyTags.add("link");
371         emptyTags.add("meta");
372         emptyTags.add("param");
373     }
374 
375     static final byte TAG_NAME = 0;
376     static final byte TAG_SPACE = 1;
377     static final byte TAG_ATT_NAME = 2;
378     static final byte TAG_ATT_VAL = 3;
379 
380     static final byte TEXT = 0;
381     static final byte SEMIBLOCK = 1;
382     static final byte BLOCK = 2;
383     static final byte INTERNAL = 3;
384 
385     static final String   newLine = System.getProperty("line.separator");
386 
387     /**
388      *  Do "smart" encoding on a string. This means that valid HTML entities and tags,
389      *  Helma macros and HTML comments are passed through unescaped, while
390      *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
391      */
392     public final static String   encode(String   str) {
393         if (str == null) {
394             return null;
395         }
396 
397         int l = str.length();
398 
399         if (l == 0) {
400             return "";
401         }
402 
403         // try to make stringbuffer large enough from the start
404         StringBuffer   ret = new StringBuffer  (Math.round(l * 1.4f));
405 
406         encode(str, ret, false, null);
407 
408         return ret.toString();
409     }
410 
411     /**
412      *  Do "smart" encoding on a string. This means that valid HTML entities and tags,
413      *  Helma macros and HTML comments are passed through unescaped, while
414      *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
415      */
416     public final static void encode(String   str, StringBuffer   ret) {
417         encode(str, ret, false, null);
418     }
419 
420     /**
421      *  Do "smart" encoding on a string. This means that valid HTML entities and tags,
422      *  Helma macros and HTML comments are passed through unescaped, while
423      *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
424      *
425      *  @param str the string to encode
426      *  @param ret the string buffer to encode to
427      *  @param paragraphs if true use p tags for paragraphs, otherwise just use br's
428      *  @param allowedTags a set containing the names of allowed tags as strings. All other
429      *                     tags will be escaped
430      */
431     public final static void encode(String   str, StringBuffer   ret,
432                                     boolean paragraphs, Set allowedTags) {
433         if (str == null) {
434             return;
435         }
436 
437         int l = str.length();
438 
439         // where to insert the <p> tag in case we want to create a paragraph later on
440         int paragraphStart = ret.length();
441 
442         // what kind of element/text are we leaving and entering?
443         // this is one of TEXT|SEMIBLOCK|BLOCK|INTERNAL
444         // depending on this information, we decide whether and how to insert
445         // paragraphs and line breaks. "entering" a tag means we're at the '<'
446         // and exiting means we're at the '>', not that it's a start or close tag.
447         byte entering = TEXT;
448         byte exiting = TEXT;
449 
450         Stack openTags = new Stack();
451 
452         // are we currently within a < and a > that consitute some kind of tag?
453         // we use tag balancing to know whether we are inside a tag (and should
454         // pass things through unchanged) or outside (and should encode stuff).
455         boolean insideTag = false;
456 
457         // are we inside an HTML tag?
458         boolean insideHtmlTag = false;
459         boolean insideCloseTag = false;
460         byte htmlTagMode = TAG_NAME;
461 
462         // if we are inside a <code> tag, we encode everything to make
463         // documentation work easier
464         boolean insideCodeTag = false;
465         boolean insidePreTag = false;
466 
467         // are we within a Helma <% macro %> tag? We treat macro tags and
468         // comments specially, since we can't rely on tag balancing
469         // to know when we leave a macro tag or comment.
470         boolean insideMacroTag = false;
471 
472         // are we inside an HTML comment?
473         boolean insideComment = false;
474 
475         // the quotation mark we are in within an HTML or Macro tag, if any
476         char htmlQuoteChar = '\u0000';
477         char macroQuoteChar = '\u0000';
478 
479         // number of newlines met since the last non-whitespace character
480         int linebreaks = 0;
481 
482         // did we meet a backslash escape?
483         boolean escape = false;
484 
485         boolean triggerBreak = false;
486 
487         for (int i = 0; i < l; i++) {
488             char c = str.charAt(i);
489 
490             // step one: check if this is the beginning of an HTML tag, comment or
491             // Helma macro.
492             if (c == '<') {
493                 if (i < (l - 2)) {
494                     if (!insideMacroTag && ('%' == str.charAt(i + 1))) {
495                         // this is the beginning of a Helma macro tag
496                         if (!insideCodeTag) {
497                             insideMacroTag = insideTag = true;
498                             macroQuoteChar = '\u0000';
499                         }
500                     } else if (('!' == str.charAt(i + 1)) && ('-' == str.charAt(i + 2))) {
501                         // the beginning of an HTML comment?
502                         if (!insideCodeTag) {
503                             insideComment = insideTag = ((i < (l - 3)) &&
504                                                         ('-' == str.charAt(i + 3)));
505                         }
506                     } else if (!insideTag) {
507                         // check if this is a HTML tag.
508                         insideCloseTag = ('/' == str.charAt(i + 1));
509                         int tagStart = insideCloseTag ? (i + 2) : (i + 1);
510                         int j = tagStart;
511 
512                         while ((j < l) && Character.isLetterOrDigit(str.charAt(j)))
513                             j++;
514 
515                         if ((j > tagStart) && (j < l)) {
516                             String   tagName = str.substring(tagStart, j).toLowerCase();
517 
518                             if ("code".equals(tagName) && insideCloseTag &&
519                                     insideCodeTag) {
520                                 insideCodeTag = false;
521                             }
522 
523                             if (((allowedTags == null) || allowedTags.contains(tagName)) &&
524                                     allTags.contains(tagName) && !insideCodeTag) {
525                                 insideHtmlTag = insideTag = true;
526                                 htmlQuoteChar = '\u0000';
527                                 htmlTagMode = TAG_NAME;
528 
529                                 exiting = entering;
530                                 entering = TEXT;
531 
532                                 if (internalTags.contains(tagName)) {
533                                     entering = INTERNAL;
534                                 } else if (blockTags.contains(tagName)) {
535                                     entering = BLOCK;
536                                 } else if (semiBlockTags.contains(tagName)) {
537                                     entering = paragraphs ? BLOCK : SEMIBLOCK;
538                                 }
539 
540                                 if (entering > 0) {
541                                     triggerBreak = !insidePreTag;
542                                 }
543 
544                                 if (insideCloseTag) {
545                                     int t = openTags.search(tagName);
546 
547                                     if (t == -1) {
548                                         i = j;
549                                         insideHtmlTag = insideTag = false;
550 
551                                         continue;
552                                     } else if (t > 1) {
553                                         for (int k = 1; k < t; k++) {
554                                             Object   tag = openTags.pop();
555                                             if (!emptyTags.contains(tag)) {
556                                                 ret.append("</");
557                                                 ret.append(tag);
558                                                 ret.append(">");
559                                             }
560                                         }
561                                     }
562 
563                                     openTags.pop();
564                                 } else {
565                                     openTags.push(tagName);
566                                 }
567 
568                                 if ("code".equals(tagName) && !insideCloseTag) {
569                                     insideCodeTag = true;
570                                 }
571 
572                                 if ("pre".equals(tagName)) {
573                                     insidePreTag = !insideCloseTag;
574                                 }
575                             }
576                         }
577                     }
578                 } // if (i < l-2)
579             }
580 
581             if ((triggerBreak || linebreaks > 0) && !Character.isWhitespace(c)) {
582 
583                 if (!insideTag) {
584                     exiting = entering;
585                     entering = TEXT;
586                     if (exiting >= SEMIBLOCK) {
587                         paragraphStart = ret.length();
588                     }
589                 }
590 
591                 if (entering != INTERNAL && exiting != INTERNAL) {
592                     int swallowBreaks = 0;
593                     if (paragraphs && 
594                           (entering != BLOCK || exiting != BLOCK) &&
595                           (exiting < BLOCK) &&
596                           (linebreaks > 1) &&
597                           paragraphStart < ret.length()) {
598                         ret.insert(paragraphStart, "<p>");
599                         ret.append("</p>");
600                         swallowBreaks = 2;
601                     }
602 
603                     // treat entering a SEMIBLOCK as entering a TEXT 
604                     int _entering = entering == SEMIBLOCK ? TEXT : entering;
605                     for (int k = linebreaks-1; k>=0; k--) {
606                         if (k >= swallowBreaks && k >= _entering && k >= exiting) {
607                             ret.append("<br />");
608                         }
609                         ret.append(newLine);
610                     }
611                     if (exiting >= SEMIBLOCK || linebreaks > 1) {
612                         paragraphStart = ret.length();
613                     }
614 
615                 }
616 
617                 linebreaks = 0;
618                 triggerBreak = false;
619             }
620 
621             switch (c) {
622                 case '<':
623 
624                     if (insideTag) {
625                         ret.append('<');
626                     } else {
627                         ret.append("&lt;");
628                     }
629 
630                     break;
631 
632                 case '&':
633 
634                     // check if this is an HTML entity already,
635                     // in which case we pass it though unchanged
636                     if ((i < (l - 3)) && !insideCodeTag) {
637                         // is this a numeric entity?
638                         if (str.charAt(i + 1) == '#') {
639                             int j = i + 2;
640 
641                             while ((j < l) && Character.isDigit(str.charAt(j)))
642                                 j++;
643 
644                             if ((j < l) && (str.charAt(j) == ';')) {
645                                 ret.append("&");
646 
647                                 break;
648                             }
649                         } else {
650                             int j = i + 1;
651 
652                             while ((j < l) && Character.isLetterOrDigit(str.charAt(j)))
653                                 j++;
654 
655                             if ((j < l) && (str.charAt(j) == ';')) {
656                                 ret.append("&");
657 
658                                 break;
659                             }
660                         }
661                     }
662 
663                     // we didn't reach a break, so encode the ampersand as HTML entity
664                     ret.append("&amp;");
665 
666                     break;
667 
668                 case '\\':
669                     ret.append(c);
670 
671                     if (insideTag && !insideComment) {
672                         escape = !escape;
673                     }
674 
675                     break;
676 
677                 case '"':
678                 case '\'':
679                     ret.append(c);
680 
681                     if (!insideComment) {
682                         // check if the quote is escaped
683                         if (insideMacroTag) {
684                             if (escape) {
685                                 escape = false;
686                             } else if (macroQuoteChar == c) {
687                                 macroQuoteChar = '\u0000';
688                             } else if (macroQuoteChar == '\u0000') {
689                                 macroQuoteChar = c;
690                             }
691                         } else if (insideHtmlTag) {
692                             if (escape) {
693                                 escape = false;
694                             } else if (htmlQuoteChar == c) {
695                                 htmlQuoteChar = '\u0000';
696                                 htmlTagMode = TAG_SPACE;
697                             } else if (htmlQuoteChar == '\u0000') {
698                                 htmlQuoteChar = c;
699                             }
700                         }
701                     }
702 
703                     break;
704 
705                 case '\n':
706                     if (insideTag || insidePreTag) {
707                         ret.append('\n');
708                     } else {
709                         linebreaks++;
710                     }
711 
712                     break;
713                 case '\r':
714                     if (insideTag || insidePreTag) {
715                         ret.append('\r');
716                     }
717                     break;
718 
719                 case '>':
720 
721                     // For Helma macro tags and comments, we overrule tag balancing,
722                     // i.e. we don't require that '<' and '>' be balanced within
723                     // macros and comments. Rather, we check for the matching closing tag.
724                     if (insideComment) {
725                         ret.append('>');
726                         insideComment = !((str.charAt(i - 2) == '-') &&
727                                         (str.charAt(i - 1) == '-'));
728                     } else if (insideMacroTag) {
729                         ret.append('>');
730                         insideMacroTag = !((str.charAt(i - 1) == '%') &&
731                                          (macroQuoteChar == '\u0000'));
732                     } else if (insideHtmlTag) {
733                         ret.append('>');
734 
735                         // only leave HTML tag if quotation marks are balanced
736                         // within that tag.
737                         insideHtmlTag = htmlQuoteChar != '\u0000';
738 
739                         // Check if this is an empty tag so we don't generate an
740                         // additional </close> tag.
741                         if (str.charAt(i - 1) == '/') {
742                             // this is to avoid misinterpreting tags like
743                             // <a HREF=http://foo/> as empty
744                             if (htmlTagMode != TAG_ATT_VAL && htmlTagMode != TAG_ATT_NAME) {
745                                 openTags.pop();
746                             }
747                         }
748 
749                         exiting = entering;
750                         if (exiting > 0) {
751                            triggerBreak = !insidePreTag;
752                         }
753 
754                     } else {
755                         ret.append("&gt;");
756                     }
757 
758                     // check if we still are inside any kind of tag
759                     insideTag = insideComment || insideMacroTag || insideHtmlTag;
760                     insideCloseTag = insideTag;
761 
762                     break;
763 
764                 default:
765 
766                     if (insideHtmlTag && !insideCloseTag) {
767                         switch(htmlTagMode) {
768                             case TAG_NAME:
769                                 if (!Character.isLetterOrDigit(c)) {
770                                     htmlTagMode = TAG_SPACE;
771                                 }
772                                 break;
773                             case TAG_SPACE:
774                                 if (Character.isLetterOrDigit(c)) {
775                                     htmlTagMode = TAG_ATT_NAME;
776                                 }
777                                 break;
778                             case TAG_ATT_NAME:
779                                 if (c == '=') {
780                                     htmlTagMode = TAG_ATT_VAL;
781                                 } else if (c == ' ') {
782                                     htmlTagMode = TAG_SPACE;
783                                 }
784                                 break;
785                             case TAG_ATT_VAL:
786                                 if (Character.isWhitespace(c) && htmlQuoteChar == '\u0000') {
787                                     htmlTagMode = TAG_SPACE;
788                                 }
789                                 break;
790                         }
791                     }
792                     if (c < 128) {
793                         ret.append(c);
794                     } else if ((c >= 128) && (c < 256)) {
795                         ret.append(transform[c - 128]);
796                     } else {
797                         ret.append("&#");
798                         ret.append((int) c);
799                         ret.append(";");
800                     }
801 
802                     escape = false;
803             }
804         }
805 
806         // if tags were opened but not closed, close them.
807         int o = openTags.size();
808 
809         if (o > 0) {
810             for (int k = 0; k < o; k++) {
811                 Object   tag = openTags.pop();
812                 if (!emptyTags.contains(tag)) {
813                     ret.append("</");
814                     ret.append(tag);
815                     ret.append(">");
816                 }
817             }
818         }
819 
820         // add remaining newlines we may have collected
821         int swallowBreaks = 0;
822         if (paragraphs && entering < BLOCK) {
823             ret.insert(paragraphStart, "<p>");
824             ret.append("</p>");
825             swallowBreaks = 2;
826         }
827 
828         if (linebreaks > 0) {
829             for (int i = linebreaks-1; i>=0; i--) {
830                 if (i >= swallowBreaks && i > exiting) {
831                     ret.append("<br />");
832                 }
833                 ret.append(newLine);
834             }
835         }
836     }
837 
838     /**
839      *
840      */
841     public final static String   encodeFormValue(String   str) {
842         if (str == null) {
843             return null;
844         }
845 
846         int l = str.length();
847 
848         if (l == 0) {
849             return "";
850         }
851 
852         StringBuffer   ret = new StringBuffer  (Math.round(l * 1.2f));
853 
854         encodeAll(str, ret, false);
855 
856         return ret.toString();
857     }
858 
859     /**
860      *
861      */
862     public final static void encodeFormValue(String   str, StringBuffer   ret) {
863         encodeAll(str, ret, false);
864     }
865 
866     /**
867      *
868      */
869     public final static String   encodeAll(String   str) {
870         if (str == null) {
871             return null;
872         }
873 
874         int l = str.length();
875 
876         if (l == 0) {
877             return "";
878         }
879 
880         StringBuffer   ret = new StringBuffer  (Math.round(l * 1.2f));
881 
882         encodeAll(str, ret, true);
883 
884         return ret.toString();
885     }
886 
887     /**
888      *
889      */
890     public final static void encodeAll(String   str, StringBuffer   ret) {
891         encodeAll(str, ret, true);
892     }
893 
894     /**
895      *
896      */
897     public final static void encodeAll(String   str, StringBuffer   ret, boolean encodeNewline) {
898         if (str == null) {
899             return;
900         }
901 
902         int l = str.length();
903 
904         for (int i = 0; i < l; i++) {
905             char c = str.charAt(i);
906 
907             switch (c) {
908                 case '<':
909                     ret.append("&lt;");
910 
911                     break;
912 
913                 case '>':
914                     ret.append("&gt;");
915 
916                     break;
917 
918                 case '&':
919                     ret.append("&amp;");
920 
921                     break;
922 
923                 case '"':
924                     ret.append("&quot;");
925 
926                     break;
927 
928                 case '\n':
929                     if (encodeNewline) {
930                         ret.append("<br />");
931                     }
932                     ret.append('\n');
933 
934                     break;
935 
936                 default:
937 
938                     // ret.append (c);
939                     if (c < 128) {
940                         ret.append(c);
941                     } else if ((c >= 128) && (c < 256)) {
942                         ret.append(transform[c - 128]);
943                     } else {
944                         ret.append("&#");
945                         ret.append((int) c);
946                         ret.append(";");
947                     }
948             }
949         }
950     }
951 
952     /**
953      *
954      *
955      * @param str ...
956      *
957      * @return ...
958      */
959     public final static String   encodeXml(String   str) {
960         if (str == null) {
961             return null;
962         }
963 
964         int l = str.length();
965 
966         if (l == 0) {
967             return "";
968         }
969 
970         StringBuffer   ret = new StringBuffer  (Math.round(l * 1.2f));
971 
972         encodeXml(str, ret);
973 
974         return ret.toString();
975     }
976 
977     /**
978      *
979      *
980      * @param str ...
981      * @param ret ...
982      */
983     public final static void encodeXml(String   str, StringBuffer   ret) {
984         if (str == null) {
985             return;
986         }
987 
988         int l = str.length();
989 
990         for (int i = 0; i < l; i++) {
991             char c = str.charAt(i);
992 
993             switch (c) {
994                 case '<':
995                     ret.append("&lt;");
996 
997                     break;
998 
999                 case '>':
1000                    ret.append("&gt;");
1001
1002                    break;
1003
1004                case '&':
1005                    ret.append("&amp;");
1006
1007                    break;
1008
1009                case '"':
1010                    ret.append("&quot;");
1011
1012                    break;
1013
1014                case '\'':
1015                    ret.append("&apos;");
1016
1017                    break;
1018
1019                default:
1020
1021                    if (c < 0x20) {
1022                        // sort out invalid XML characters below 0x20 - all but 0x9, 0xA and 0xD.
1023                        // The trick is an adaption of java.lang.Character.isSpace().
1024                        if (((((1L << 0x9) | (1L << 0xA) | (1L << 0xD)) >> c) & 1L) != 0) {
1025                            ret.append(c);
1026                        }
1027                    } else {
1028                        ret.append(c);
1029                    }
1030            }
1031        }
1032    }
1033
1034    // test method
1035    public static String   printCharRange(int from, int to) {
1036        StringBuffer   response = new StringBuffer  ();
1037
1038        for (int i = from; i < to; i++) {
1039            response.append(i);
1040            response.append("      ");
1041            response.append((char) i);
1042            response.append("      ");
1043
1044            if (i < 128) {
1045                response.append((char) i);
1046            } else if ((i >= 128) && (i < 256)) {
1047                response.append(transform[i - 128]);
1048            } else {
1049                response.append("&#");
1050                response.append(i);
1051                response.append(";");
1052            }
1053
1054            response.append("\r\n");
1055        }
1056
1057        return response.toString();
1058    }
1059
1060    // for testing...
1061    public static void main(String  [] args) {
1062        for (int i = 0; i < args.length; i++)
1063            System.err.println(encode(args[i]));
1064    }
1065}
1066 // end of class
1067
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags