CharacterReference


1   // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2   // Version 2.2
3   // Copyright (C) 2006 Martin Jericho
4   // http://sourceforge.net/projects/jerichohtml/
5   //
6   // This library is free software; you can redistribute it and/or
7   // modify it under the terms of the GNU Lesser General Public
8   // License as published by the Free Software Foundation; either
9   // version 2.1 of the License, or (at your option) any later version.
10  // http://www.gnu.org/copyleft/lesser.html
11  //
12  // This library is distributed in the hope that it will be useful,
13  // but WITHOUT ANY WARRANTY; without even the implied warranty of
14  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  // Lesser General Public License for more details.
16  //
17  // You should have received a copy of the GNU Lesser General Public
18  // License along with this library; if not, write to the Free Software
19  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  
21  package au.id.jericho.lib.html;
22  
23  import java.util.*;
24  
25  /**
26   * Represents an HTML <a target="_blank" HREF="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>,
27   * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
28   * <p>
29   * This class, together with its subclasses, contains static methods to perform most required operations
30   * without having to instantiate an object.
31   * <p>
32   * Instances of this class are useful when the positions of character references in a source document are required,
33   * or to replace the found character references with customised text.
34   * <p>
35   * <code>CharacterReference</code> instances are obtained using one of the following methods:
36   * <ul>
37   *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
38   *  <li>{@link Source#findNextCharacterReference(int pos)}
39   *  <li>{@link Source#findPreviousCharacterReference(int pos)}
40   *  <li>{@link Segment#findAllCharacterReferences()}
41   * </ul>
42   */
43  public abstract class CharacterReference extends Segment {
44      int codePoint;
45  
46      /**
47       * Represents an invalid unicode code point.
48       * <p>
49       * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference.
50       */
51      public static final int INVALID_CODE_POINT=-1;
52  
53      /**
54       * The maximum codepoint allowed by unicode, 0x10FFFF (decimal 1114111).
55       * This can be replaced by Character.MAX_CODE_POINT in java 1.5
56       */
57      static final int MAX_CODE_POINT=0x10FFFF;
58  
59      /** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
60      private static final int TAB_LENGTH=4;
61  
62      CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
63          super(source,begin,end);
64          this.codePoint=codePoint;
65      }
66  
67      /**
68       * Returns the <a target="_blank" HREF="http://www.unicode.org">unicode</a> code point represented by this character reference.
69       * @return the unicode code point represented by this character reference.
70       */
71      public int getCodePoint() {
72          return codePoint;
73      }
74  
75      /**
76       * Returns the character represented by this character reference.
77       * <p>
78       * If this character reference represents a unicode
79       * <a target="_blank" HREF="http://www.unicode.org/glossary/#supplementary_code_point">supplimentary code point</a>,
80       * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
81       *
82       * @return the character represented by this character reference.
83       */
84      public char getChar() {
85          return (char)codePoint;
86      }
87  
88      /**
89       * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
90       * <p>
91       * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does
92       * not end with a semicolon.
93       * <p>
94       * The SGML specification allows unterminated character references in some circumstances, and because the
95       * HTML 4.01 specification states simply that
96       * "<a target="_blank" HREF="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>",
97       * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
98       * <p>
99       * Unterminated character references are not allowed in <a target="_blank" HREF="http://www.w3.org/TR/xhtml1/">XHTML</a> documents.
100      *
101      * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
102      * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
103      */
104     public boolean isTerminated() {
105         return source.charAt(end-1)==';';
106     }
107 
108     /**
109      * Encodes the specified text, escaping special characters into character references.
110      * <p>
111      * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
112      * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
113      * code point is greater than U+007F.
114      * <p>
115      * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
116      * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
117      * is either left unencoded (default setting), or encoded as the numeric character reference "<code>&amp;#39;</code>".
118      * <p>
119      * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
120      * as this entity is not defined for use in HTML.  See the comments in the {@link CharacterEntityReference} class for more information.
121      * <p>
122      * To encode text using only numeric character references, use the<br />
123      * {@link NumericCharacterReference#encode(CharSequence)} method instead.
124      *
125      * @param unencodedText  the text to encode.
126      * @return the encoded string.
127      * @see #decode(CharSequence)
128      */
129     public static String   encode(final CharSequence   unencodedText) {
130         if (unencodedText==null) return null;
131         return appendEncode(new StringBuffer  (unencodedText.length()*2),unencodedText,false).toString();
132     }
133 
134     /**
135      * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
136      * <p>
137      * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
138      * <ul>
139      *  <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
140      *   are converted to "<code>&lt;br /&gt;</code>".  CR/LF pairs are treated as a single line break.
141      *  <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
142      *   while ensuring the last is always a normal space.
143      *  <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
144      * </ul>
145      * <p>
146      * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
147      * spaces to be rendered, but also allows the line to wrap in the middle of it.
148      * <p>
149      * Note that zero-width spaces (U+200B) are converted to the numeric character reference
150      * "<code>&amp;#x200B;</code>" through the normal encoding process, but IE6 does not render them properly
151      * either encoded or unencoded.
152      * <p>
153      * There is no method provided to reverse this encoding.
154      *
155      * @param unencodedText  the text to encode.
156      * @return the encoded string with whitespace formatting converted to markup.
157      * @see #encode(CharSequence)
158      */
159     public static String   encodeWithWhiteSpaceFormatting(final CharSequence   unencodedText) {
160         if (unencodedText==null) return null;
161         return appendEncode(new StringBuffer  (unencodedText.length()*2),unencodedText,true).toString();
162     }
163 
164     /**
165      * Decodes the specified HTML encoded text into normal text.
166      * <p>
167      * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
168      * are converted to their respective characters.
169      * <p>
170      * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
171      * <p>
172      * <a HREF="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
173      * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
174      * <p>
175      * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
176      * some browsers also recognise them in a case-insensitive way.
177      * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
178      *
179      * @param encodedText  the text to decode.
180      * @return the decoded string.
181      * @see #encode(CharSequence)
182      */
183     public static String   decode(final CharSequence   encodedText) {
184         return decode(encodedText,false);
185     }
186 
187     /**
188      * Decodes the specified HTML encoded text into normal text.
189      * <p>
190      * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
191      * are converted to their respective characters.
192      * <p>
193      * <a HREF="#Unterminated">Unterminated</a> character references are dealt with according to the
194      * value of the <code>insideAttributeValue</code> parameter and the
195      * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
196      * <p>
197      * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
198      * some browsers also recognise them in a case-insensitive way.
199      * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
200      *
201      * @param encodedText  the text to decode.
202      * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
203      * @return the decoded string.
204      * @see #decode(CharSequence)
205      * @see #encode(CharSequence)
206      */
207     public static String   decode(final CharSequence   encodedText, final boolean insideAttributeValue) {
208         if (encodedText==null) return null;
209         final String   encodedString=encodedText.toString();
210         final int pos=encodedString.indexOf('&');
211         if (pos==-1) return encodedString;
212         return appendDecode(new StringBuffer  (encodedString.length()),encodedString,pos,insideAttributeValue).toString();
213     }
214 
215     /**
216      * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
217      * <p>
218      * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
219      * <p>
220      * The result is how the text would normally be rendered by a
221      * <a target="_blank" HREF="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
222      * assuming it does not contain any tags.
223      *
224      * @param text  the source text
225      * @return the decoded text with collapsed white space.
226      * @see FormControl#getPredefinedValues()
227      */
228     public static String   decodeCollapseWhiteSpace(final CharSequence   text) {
229         return decode(appendCollapseWhiteSpace(new StringBuffer  (text.length()),text));
230     }
231 
232     /**
233      * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
234      * <p>
235      * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
236      * <p>
237      * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method
238      * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
239      * may be used in future.
240      *
241      * @param encodedText  the text to re-encode.
242      * @return the re-encoded string.
243      */
244     public static String   reencode(final CharSequence   encodedText) {
245         return encode(decode(encodedText,true));
246     }
247 
248     /**
249      * Returns the encoded form of this character reference.
250      * <p>
251      * The exact behaviour of this method depends on the class of this object.
252      * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
253      * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
254      * <p>
255      * <dl>
256      *  <dt>Examples:</dt>
257      *   <dd><code>CharacterReference.parse("&amp;GT;").getCharacterReferenceString()</code> returns "<code>&amp;gt;</code>"</dd>
258      *   <dd><code>CharacterReference.parse("&amp;#x3E;").getCharacterReferenceString()</code> returns "<code>&amp;#3e;</code>"</dd>
259      * </dl>
260      *
261      * @return the encoded form of this character reference.
262      * @see #getCharacterReferenceString(int codePoint)
263      * @see #getDecimalCharacterReferenceString()
264      */
265     public abstract String   getCharacterReferenceString();
266 
267     /**
268      * Returns the encoded form of the specified unicode code point.
269      * <p>
270      * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
271      * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
272      * <p>
273      * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
274      * which is encoded as the numeric character reference "<code>&amp;#39;</code>" instead of its character entity reference
275      * "<code>&amp;apos;</code>".
276      * <p>
277      * <dl>
278      *  <dt>Examples:</dt>
279      *   <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;gt;</code>"</dd>
280      *   <dd><code>CharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;gt;</code>"</dd>
281      *   <dd><code>CharacterReference.getCharacterReferenceString('&#9786;')</code> returns "<code>&amp;#9786;</code>"</dd>
282      * </dl>
283      *
284      * @param codePoint  the unicode code point to encode.
285      * @return the encoded form of the specified unicode code point.
286      * @see #getHexadecimalCharacterReferenceString(int codePoint)
287      */
288     public static String   getCharacterReferenceString(final int codePoint) {
289         String   characterReferenceString=null;
290         if (codePoint!=CharacterEntityReference._apos) characterReferenceString=CharacterEntityReference.getCharacterReferenceString(codePoint);
291         if (characterReferenceString==null) characterReferenceString=NumericCharacterReference.getCharacterReferenceString(codePoint);
292         return characterReferenceString;
293     }
294 
295     /**
296      * Returns the <a HREF="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
297      * <p>
298      * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
299      * <p>
300      * <dl>
301      *  <dt>Example:</dt>
302      *  <dd><code>CharacterReference.parse("&amp;gt;").getDecimalCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
303      * </dl>
304      *
305      * @return the decimal encoded form of this character reference.
306      * @see #getCharacterReferenceString()
307      * @see #getHexadecimalCharacterReferenceString()
308      */
309     public String   getDecimalCharacterReferenceString() {
310         return getDecimalCharacterReferenceString(codePoint);
311     }
312 
313     /**
314      * Returns the <a HREF="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
315      * <p>
316      * <dl>
317      *  <dt>Example:</dt>
318      *  <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
319      * </dl>
320      *
321      * @param codePoint  the unicode code point to encode.
322      * @return the decimal encoded form of the specified unicode code point.
323      * @see #getCharacterReferenceString(int codePoint)
324      * @see #getHexadecimalCharacterReferenceString(int codePoint)
325      */
326     public static String   getDecimalCharacterReferenceString(final int codePoint) {
327         return appendDecimalCharacterReferenceString(new StringBuffer  (),codePoint).toString();
328     }
329 
330     /**
331      * Returns the <a HREF="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
332      * <p>
333      * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
334      * <p>
335      * <dl>
336      *  <dt>Example:</dt>
337      *  <dd><code>CharacterReference.parse("&amp;gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
338      * </dl>
339      *
340      * @return the hexadecimal encoded form of this character reference.
341      * @see #getCharacterReferenceString()
342      * @see #getDecimalCharacterReferenceString()
343      */
344     public String   getHexadecimalCharacterReferenceString() {
345         return getHexadecimalCharacterReferenceString(codePoint);
346     }
347 
348     /**
349      * Returns the <a HREF="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
350      * <p>
351      * <dl>
352      *  <dt>Example:</dt>
353      *  <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#x3e;</code>"</dd>
354      * </dl>
355      *
356      * @param codePoint  the unicode code point to encode.
357      * @return the hexadecimal encoded form of the specified unicode code point.
358      * @see #getCharacterReferenceString(int codePoint)
359      * @see #getDecimalCharacterReferenceString(int codePoint)
360      */
361     public static String   getHexadecimalCharacterReferenceString(final int codePoint) {
362         return appendHexadecimalCharacterReferenceString(new StringBuffer  (),codePoint).toString();
363     }
364 
365     /**
366      * Returns the unicode code point of this character reference in <a target="_blank" HREF="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
367      * <p>
368      * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
369      * <p>
370      * <dl>
371      *  <dt>Example:</dt>
372      *  <dd><code>CharacterReference.parse("&amp;gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
373      * </dl>
374      *
375      * @return the unicode code point of this character reference in U+ notation.
376      * @see #getUnicodeText(int codePoint)
377      */
378     public String   getUnicodeText() {
379         return getUnicodeText(codePoint);
380     }
381 
382     /**
383      * Returns the specified unicode code point in <a target="_blank" HREF="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
384      * <p>
385      * <dl>
386      *  <dt>Example:</dt>
387      *  <dd><code>CharacterReference.getUnicodeText('&gt;')</code> returns "<code>U+003E</code>"</dd>
388      * </dl>
389      *
390      * @param codePoint  the unicode code point.
391      * @return the specified unicode code point in U+ notation.
392      */
393     public static String   getUnicodeText(final int codePoint) {
394         return appendUnicodeText(new StringBuffer  (),codePoint).toString();
395     }
396 
397     static final StringBuffer   appendUnicodeText(final StringBuffer   sb, final int codePoint) {
398         sb.append("U+");
399         final String   hex=Integer.toString(codePoint,16).toUpperCase();
400         for (int i=4-hex.length(); i>0; i--) sb.append('0');
401         sb.append(hex);
402         return sb;
403     }
404 
405     /**
406      * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
407      * <p>
408      * The character reference must be at the start of the given text, but may contain other characters at the end.
409      * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
410      * <p>
411      * If the text does not represent a valid character reference, this method returns <code>null</code>.
412      * <p>
413      * <a HREF="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
414      * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
415      * <p>
416      * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
417      * <p>
418      * <dl>
419      *  <dt>Example:</dt>
420      *  <dd><code>CharacterReference.parse("&amp;gt;").getChar()</code> returns '<code>&gt;</code>'</dd>
421      * </dl>
422      *
423      * @param characterReferenceText  the text containing a single encoded character reference.
424      * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
425      * @see #decode(CharSequence)
426      */
427     public static CharacterReference parse(final CharSequence   characterReferenceText) {
428         return construct(new Source(characterReferenceText.toString()),0,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
429     }
430 
431     /**
432      * Parses a single encoded character reference text into a unicode code point.
433      * <p>
434      * The character reference must be at the start of the given text, but may contain other characters at the end.
435      * <p>
436      * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
437      * <p>
438      * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
439      * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
440      * <code>NullPointerException</code>.
441      * <p>
442      * <dl>
443      *  <dt>Example:</dt>
444      *  <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&amp;gt;")</code> returns <code>38</code></dd>
445      * </dl>
446      *
447      * @param characterReferenceText  the text containing a single encoded character reference.
448      * @return the unicode code point representing representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
449      */
450     public static int getCodePointFromCharacterReferenceString(final CharSequence   characterReferenceText) {
451         final CharacterReference characterReference=parse(characterReferenceText);
452         return (characterReference!=null) ? characterReference.getCodePoint() : INVALID_CODE_POINT;
453     }
454 
455     /**
456      * Indicates whether the specified character would need to be encoded in HTML text.
457      * <p>
458      * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the unicode code point is greater than U+007F.
459      * <p>
460      * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
461      * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property
462      * is currently set to <code>true</code>.
463      *
464      * @param ch  the character to test.
465      * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
466      */
467     public static final boolean requiresEncoding(final char ch) {
468         return ch>127 || (CharacterEntityReference.getName(ch)!=null && (ch!='\'' || Config.IsApostropheEncoded));
469     }
470 
471     static StringBuffer   appendEncode(final StringBuffer   sb, CharSequence   unencodedText, final boolean whiteSpaceFormatting) {
472         if (unencodedText==null) return sb;
473         int beginPos=0;
474         int endPos=unencodedText.length();
475         if (unencodedText instanceof Segment) {
476             // this might improve performance slightly
477             final Segment segment=(Segment)unencodedText;
478             final int segmentOffset=segment.getBegin();
479             beginPos=segmentOffset;
480             endPos+=segmentOffset;
481             unencodedText=segment.source.string;
482         }
483         final boolean isApostropheEncoded=Config.IsApostropheEncoded;
484         for (int i=beginPos; i<endPos; i++) {
485             char ch=unencodedText.charAt(i);
486             final String   characterEntityReferenceName=CharacterEntityReference.getName(ch);
487             if (characterEntityReferenceName!=null) {
488                 if (ch=='\'') {
489                     if (isApostropheEncoded)
490                         sb.append("&#39;");
491                     else
492                         sb.append(ch);
493                 } else {
494                     CharacterEntityReference.appendCharacterReferenceString(sb,characterEntityReferenceName);
495                 }
496             } else if (ch>127) {
497                 appendDecimalCharacterReferenceString(sb,ch);
498             } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) {
499                 sb.append(ch);
500             } else {
501                 // whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup
502                 int spaceCount;
503                 int nexti=i+1;
504                 if (ch!=' ') {
505                     if (ch!='\t') {
506                         // must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string
507                         if (ch=='\r' && nexti<endPos && unencodedText.charAt(nexti)=='\n') i++; // process cr/lf pair as one line break
508                         sb.append("<br />"); // add line break
509                         continue;
510                     } else {
511                         spaceCount=TAB_LENGTH;
512                     }
513                 } else {
514                     spaceCount=1;
515                 }
516                 while (nexti<endPos) {
517                     ch=unencodedText.charAt(nexti);
518                     if (ch==' ')
519                         spaceCount+=1;
520                     else if (ch=='\t')
521                         spaceCount+=TAB_LENGTH;
522                     else
523                         break;
524                     nexti++;
525                 }
526                 if (spaceCount==1) {
527                     // handle the very common case of a single character to improve efficiency slightly
528                     sb.append(' ');
529                     continue;
530                 }
531                 if (spaceCount%2==1) sb.append(' '); // fist character is a space if we have an odd number of spaces
532                 while (spaceCount>=2) {
533                     sb.append("&nbsp; "); // use alternating &nbsp; and spaces to keep original number of spaces
534                     spaceCount-=2;
535                 }
536                 // note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
537                 i=nexti-1; // minus 1 because top level for loop will add it again
538             }
539         }
540         return sb;
541     }
542 
543     static CharacterReference findPreviousOrNext(final Source source, final int pos, final boolean previous) {
544         return findPreviousOrNext(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL,previous);
545     }
546 
547     private static CharacterReference findPreviousOrNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings, final boolean previous) {
548         final ParseText parseText=source.getParseText();
549         pos=previous ? parseText.lastIndexOf('&',pos) : parseText.indexOf('&',pos);
550         while (pos!=-1) {
551             final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings);
552             if (characterReference!=null) return characterReference;
553             pos=previous ? parseText.lastIndexOf('&',pos-1) : parseText.indexOf('&',pos+1);
554         }
555         return null;
556     }
557 
558     static final StringBuffer   appendHexadecimalCharacterReferenceString(final StringBuffer   sb, final int codePoint) {
559         return sb.append("&#x").append(Integer.toString(codePoint,16)).append(';');
560     }
561 
562     static final StringBuffer   appendDecimalCharacterReferenceString(final StringBuffer   sb, final int codePoint) {
563         return sb.append("&#").append(codePoint).append(';');
564     }
565 
566     private static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
567         try {
568             if (source.getParseText().charAt(begin)!='&') return null;
569             return (source.getParseText().charAt(begin+1)=='#')
570                 ? NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings)
571                 : CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
572         } catch (IndexOutOfBoundsException   ex) {
573             return null;
574         }
575     }
576 
577     private static StringBuffer   appendDecode(final StringBuffer   sb, final String   encodedString, int pos, final boolean insideAttributeValue) {
578         final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
579         int lastEnd=0;
580         final Source source=new Source(encodedString);
581         while (true) {
582             final CharacterReference characterReference=findPreviousOrNext(source,pos,unterminatedCharacterReferenceSettings,false);
583             if (characterReference==null) break;
584             if (lastEnd!=characterReference.getBegin()) Util.appendTo(sb,encodedString,lastEnd,characterReference.getBegin());
585             sb.append((char)characterReference.codePoint);
586             pos=lastEnd=characterReference.getEnd();
587         }
588         if (lastEnd!=encodedString.length()) Util.appendTo(sb,encodedString,lastEnd,encodedString.length());
589         return sb;
590     }
591 }
592
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags