KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > au > id > jericho > lib > html > CharacterReference


1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2
// Version 2.2
3
// Copyright (C) 2006 Martin Jericho
4
// http://sourceforge.net/projects/jerichohtml/
5
//
6
// This library is free software; you can redistribute it and/or
7
// modify it under the terms of the GNU Lesser General Public
8
// License as published by the Free Software Foundation; either
9
// version 2.1 of the License, or (at your option) any later version.
10
// http://www.gnu.org/copyleft/lesser.html
11
//
12
// This library is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
// Lesser General Public License for more details.
16
//
17
// You should have received a copy of the GNU Lesser General Public
18
// License along with this library; if not, write to the Free Software
19
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20

21 package au.id.jericho.lib.html;
22
23 import java.util.*;
24
25 /**
26  * Represents an HTML <a target="_blank" HREF="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>,
27  * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
28  * <p>
29  * This class, together with its subclasses, contains static methods to perform most required operations
30  * without having to instantiate an object.
31  * <p>
32  * Instances of this class are useful when the positions of character references in a source document are required,
33  * or to replace the found character references with customised text.
34  * <p>
35  * <code>CharacterReference</code> instances are obtained using one of the following methods:
36  * <ul>
37  * <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
38  * <li>{@link Source#findNextCharacterReference(int pos)}
39  * <li>{@link Source#findPreviousCharacterReference(int pos)}
40  * <li>{@link Segment#findAllCharacterReferences()}
41  * </ul>
42  */

43 public abstract class CharacterReference extends Segment {
44     int codePoint;
45
46     /**
47      * Represents an invalid unicode code point.
48      * <p>
49      * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference.
50      */

51     public static final int INVALID_CODE_POINT=-1;
52
53     /**
54      * The maximum codepoint allowed by unicode, 0x10FFFF (decimal 1114111).
55      * This can be replaced by Character.MAX_CODE_POINT in java 1.5
56      */

57     static final int MAX_CODE_POINT=0x10FFFF;
58
59     /** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
60     private static final int TAB_LENGTH=4;
61
62     CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
63         super(source,begin,end);
64         this.codePoint=codePoint;
65     }
66
67     /**
68      * Returns the <a target="_blank" HREF="http://www.unicode.org">unicode</a> code point represented by this character reference.
69      * @return the unicode code point represented by this character reference.
70      */

71     public int getCodePoint() {
72         return codePoint;
73     }
74
75     /**
76      * Returns the character represented by this character reference.
77      * <p>
78      * If this character reference represents a unicode
79      * <a target="_blank" HREF="http://www.unicode.org/glossary/#supplementary_code_point">supplimentary code point</a>,
80      * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
81      *
82      * @return the character represented by this character reference.
83      */

84     public char getChar() {
85         return (char)codePoint;
86     }
87
88     /**
89      * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
90      * <p>
91      * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does
92      * not end with a semicolon.
93      * <p>
94      * The SGML specification allows unterminated character references in some circumstances, and because the
95      * HTML 4.01 specification states simply that
96      * "<a target="_blank" HREF="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>",
97      * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
98      * <p>
99      * Unterminated character references are not allowed in <a target="_blank" HREF="http://www.w3.org/TR/xhtml1/">XHTML</a> documents.
100      *
101      * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
102      * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
103      */

104     public boolean isTerminated() {
105         return source.charAt(end-1)==';';
106     }
107
108     /**
109      * Encodes the specified text, escaping special characters into character references.
110      * <p>
111      * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
112      * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
113      * code point is greater than U+007F.
114      * <p>
115      * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
116      * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
117      * is either left unencoded (default setting), or encoded as the numeric character reference "<code>&amp;#39;</code>".
118      * <p>
119      * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
120      * as this entity is not defined for use in HTML. See the comments in the {@link CharacterEntityReference} class for more information.
121      * <p>
122      * To encode text using only numeric character references, use the<br />
123      * {@link NumericCharacterReference#encode(CharSequence)} method instead.
124      *
125      * @param unencodedText the text to encode.
126      * @return the encoded string.
127      * @see #decode(CharSequence)
128      */

129     public static String JavaDoc encode(final CharSequence JavaDoc unencodedText) {
130         if (unencodedText==null) return null;
131         return appendEncode(new StringBuffer JavaDoc(unencodedText.length()*2),unencodedText,false).toString();
132     }
133
134     /**
135      * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
136      * <p>
137      * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
138      * <ul>
139      * <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
140      * are converted to "<code>&lt;br /&gt;</code>". CR/LF pairs are treated as a single line break.
141      * <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
142      * while ensuring the last is always a normal space.
143      * <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
144      * </ul>
145      * <p>
146      * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
147      * spaces to be rendered, but also allows the line to wrap in the middle of it.
148      * <p>
149      * Note that zero-width spaces (U+200B) are converted to the numeric character reference
150      * "<code>&amp;#x200B;</code>" through the normal encoding process, but IE6 does not render them properly
151      * either encoded or unencoded.
152      * <p>
153      * There is no method provided to reverse this encoding.
154      *
155      * @param unencodedText the text to encode.
156      * @return the encoded string with whitespace formatting converted to markup.
157      * @see #encode(CharSequence)
158      */

159     public static String JavaDoc encodeWithWhiteSpaceFormatting(final CharSequence JavaDoc unencodedText) {
160         if (unencodedText==null) return null;
161         return appendEncode(new StringBuffer JavaDoc(unencodedText.length()*2),unencodedText,true).toString();
162     }
163
164     /**
165      * Decodes the specified HTML encoded text into normal text.
166      * <p>
167      * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
168      * are converted to their respective characters.
169      * <p>
170      * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
171      * <p>
172      * <a HREF="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
173      * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
174      * <p>
175      * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
176      * some browsers also recognise them in a case-insensitive way.
177      * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
178      *
179      * @param encodedText the text to decode.
180      * @return the decoded string.
181      * @see #encode(CharSequence)
182      */

183     public static String JavaDoc decode(final CharSequence JavaDoc encodedText) {
184         return decode(encodedText,false);
185     }
186
187     /**
188      * Decodes the specified HTML encoded text into normal text.
189      * <p>
190      * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
191      * are converted to their respective characters.
192      * <p>
193      * <a HREF="#Unterminated">Unterminated</a> character references are dealt with according to the
194      * value of the <code>insideAttributeValue</code> parameter and the
195      * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
196      * <p>
197      * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
198      * some browsers also recognise them in a case-insensitive way.
199      * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
200      *
201      * @param encodedText the text to decode.
202      * @param insideAttributeValue specifies whether the encoded text is inside an attribute value.
203      * @return the decoded string.
204      * @see #decode(CharSequence)
205      * @see #encode(CharSequence)
206      */

207     public static String JavaDoc decode(final CharSequence JavaDoc encodedText, final boolean insideAttributeValue) {
208         if (encodedText==null) return null;
209         final String JavaDoc encodedString=encodedText.toString();
210         final int pos=encodedString.indexOf('&');
211         if (pos==-1) return encodedString;
212         return appendDecode(new StringBuffer JavaDoc(encodedString.length()),encodedString,pos,insideAttributeValue).toString();
213     }
214
215     /**
216      * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
217      * <p>
218      * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
219      * <p>
220      * The result is how the text would normally be rendered by a
221      * <a target="_blank" HREF="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
222      * assuming it does not contain any tags.
223      *
224      * @param text the source text
225      * @return the decoded text with collapsed white space.
226      * @see FormControl#getPredefinedValues()
227      */

228     public static String JavaDoc decodeCollapseWhiteSpace(final CharSequence JavaDoc text) {
229         return decode(appendCollapseWhiteSpace(new StringBuffer JavaDoc(text.length()),text));
230     }
231
232     /**
233      * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
234      * <p>
235      * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
236      * <p>
237      * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method
238      * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
239      * may be used in future.
240      *
241      * @param encodedText the text to re-encode.
242      * @return the re-encoded string.
243      */

244     public static String JavaDoc reencode(final CharSequence JavaDoc encodedText) {
245         return encode(decode(encodedText,true));
246     }
247
248     /**
249      * Returns the encoded form of this character reference.
250      * <p>
251      * The exact behaviour of this method depends on the class of this object.
252      * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
253      * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
254      * <p>
255      * <dl>
256      * <dt>Examples:</dt>
257      * <dd><code>CharacterReference.parse("&amp;GT;").getCharacterReferenceString()</code> returns "<code>&amp;gt;</code>"</dd>
258      * <dd><code>CharacterReference.parse("&amp;#x3E;").getCharacterReferenceString()</code> returns "<code>&amp;#3e;</code>"</dd>
259      * </dl>
260      *
261      * @return the encoded form of this character reference.
262      * @see #getCharacterReferenceString(int codePoint)
263      * @see #getDecimalCharacterReferenceString()
264      */

265     public abstract String JavaDoc getCharacterReferenceString();
266
267     /**
268      * Returns the encoded form of the specified unicode code point.
269      * <p>
270      * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
271      * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
272      * <p>
273      * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
274      * which is encoded as the numeric character reference "<code>&amp;#39;</code>" instead of its character entity reference
275      * "<code>&amp;apos;</code>".
276      * <p>
277      * <dl>
278      * <dt>Examples:</dt>
279      * <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;gt;</code>"</dd>
280      * <dd><code>CharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;gt;</code>"</dd>
281      * <dd><code>CharacterReference.getCharacterReferenceString('&#9786;')</code> returns "<code>&amp;#9786;</code>"</dd>
282      * </dl>
283      *
284      * @param codePoint the unicode code point to encode.
285      * @return the encoded form of the specified unicode code point.
286      * @see #getHexadecimalCharacterReferenceString(int codePoint)
287      */

288     public static String JavaDoc getCharacterReferenceString(final int codePoint) {
289         String JavaDoc characterReferenceString=null;
290         if (codePoint!=CharacterEntityReference._apos) characterReferenceString=CharacterEntityReference.getCharacterReferenceString(codePoint);
291         if (characterReferenceString==null) characterReferenceString=NumericCharacterReference.getCharacterReferenceString(codePoint);
292         return characterReferenceString;
293     }
294
295     /**
296      * Returns the <a HREF="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
297      * <p>
298      * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
299      * <p>
300      * <dl>
301      * <dt>Example:</dt>
302      * <dd><code>CharacterReference.parse("&amp;gt;").getDecimalCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
303      * </dl>
304      *
305      * @return the decimal encoded form of this character reference.
306      * @see #getCharacterReferenceString()
307      * @see #getHexadecimalCharacterReferenceString()
308      */

309     public String JavaDoc getDecimalCharacterReferenceString() {
310         return getDecimalCharacterReferenceString(codePoint);
311     }
312
313     /**
314      * Returns the <a HREF="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
315      * <p>
316      * <dl>
317      * <dt>Example:</dt>
318      * <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
319      * </dl>
320      *
321      * @param codePoint the unicode code point to encode.
322      * @return the decimal encoded form of the specified unicode code point.
323      * @see #getCharacterReferenceString(int codePoint)
324      * @see #getHexadecimalCharacterReferenceString(int codePoint)
325      */

326     public static String JavaDoc getDecimalCharacterReferenceString(final int codePoint) {
327         return appendDecimalCharacterReferenceString(new StringBuffer JavaDoc(),codePoint).toString();
328     }
329
330     /**
331      * Returns the <a HREF="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
332      * <p>
333      * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
334      * <p>
335      * <dl>
336      * <dt>Example:</dt>
337      * <dd><code>CharacterReference.parse("&amp;gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
338      * </dl>
339      *
340      * @return the hexadecimal encoded form of this character reference.
341      * @see #getCharacterReferenceString()
342      * @see #getDecimalCharacterReferenceString()
343      */

344     public String JavaDoc getHexadecimalCharacterReferenceString() {
345         return getHexadecimalCharacterReferenceString(codePoint);
346     }
347
348     /**
349      * Returns the <a HREF="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
350      * <p>
351      * <dl>
352      * <dt>Example:</dt>
353      * <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#x3e;</code>"</dd>
354      * </dl>
355      *
356      * @param codePoint the unicode code point to encode.
357      * @return the hexadecimal encoded form of the specified unicode code point.
358      * @see #getCharacterReferenceString(int codePoint)
359      * @see #getDecimalCharacterReferenceString(int codePoint)
360      */

361     public static String JavaDoc getHexadecimalCharacterReferenceString(final int codePoint) {
362         return appendHexadecimalCharacterReferenceString(new StringBuffer JavaDoc(),codePoint).toString();
363     }
364
365     /**
366      * Returns the unicode code point of this character reference in <a target="_blank" HREF="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
367      * <p>
368      * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
369      * <p>
370      * <dl>
371      * <dt>Example:</dt>
372      * <dd><code>CharacterReference.parse("&amp;gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
373      * </dl>
374      *
375      * @return the unicode code point of this character reference in U+ notation.
376      * @see #getUnicodeText(int codePoint)
377      */

378     public String JavaDoc getUnicodeText() {
379         return getUnicodeText(codePoint);
380     }
381
382     /**
383      * Returns the specified unicode code point in <a target="_blank" HREF="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
384      * <p>
385      * <dl>
386      * <dt>Example:</dt>
387      * <dd><code>CharacterReference.getUnicodeText('&gt;')</code> returns "<code>U+003E</code>"</dd>
388      * </dl>
389      *
390      * @param codePoint the unicode code point.
391      * @return the specified unicode code point in U+ notation.
392      */

393     public static String JavaDoc getUnicodeText(final int codePoint) {
394         return appendUnicodeText(new StringBuffer JavaDoc(),codePoint).toString();
395     }
396
397     static final StringBuffer JavaDoc appendUnicodeText(final StringBuffer JavaDoc sb, final int codePoint) {
398         sb.append("U+");
399         final String JavaDoc hex=Integer.toString(codePoint,16).toUpperCase();
400         for (int i=4-hex.length(); i>0; i--) sb.append('0');
401         sb.append(hex);
402         return sb;
403     }
404
405     /**
406      * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
407      * <p>
408      * The character reference must be at the start of the given text, but may contain other characters at the end.
409      * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
410      * <p>
411      * If the text does not represent a valid character reference, this method returns <code>null</code>.
412      * <p>
413      * <a HREF="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
414      * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
415      * <p>
416      * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
417      * <p>
418      * <dl>
419      * <dt>Example:</dt>
420      * <dd><code>CharacterReference.parse("&amp;gt;").getChar()</code> returns '<code>&gt;</code>'</dd>
421      * </dl>
422      *
423      * @param characterReferenceText the text containing a single encoded character reference.
424      * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
425      * @see #decode(CharSequence)
426      */

427     public static CharacterReference parse(final CharSequence JavaDoc characterReferenceText) {
428         return construct(new Source(characterReferenceText.toString()),0,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
429     }
430
431     /**
432      * Parses a single encoded character reference text into a unicode code point.
433      * <p>
434      * The character reference must be at the start of the given text, but may contain other characters at the end.
435      * <p>
436      * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
437      * <p>
438      * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
439      * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
440      * <code>NullPointerException</code>.
441      * <p>
442      * <dl>
443      * <dt>Example:</dt>
444      * <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&amp;gt;")</code> returns <code>38</code></dd>
445      * </dl>
446      *
447      * @param characterReferenceText the text containing a single encoded character reference.
448      * @return the unicode code point representing representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
449      */

450     public static int getCodePointFromCharacterReferenceString(final CharSequence JavaDoc characterReferenceText) {
451         final CharacterReference characterReference=parse(characterReferenceText);
452         return (characterReference!=null) ? characterReference.getCodePoint() : INVALID_CODE_POINT;
453     }
454
455     /**
456      * Indicates whether the specified character would need to be encoded in HTML text.
457      * <p>
458      * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the unicode code point is greater than U+007F.
459      * <p>
460      * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
461      * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property
462      * is currently set to <code>true</code>.
463      *
464      * @param ch the character to test.
465      * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
466      */

467     public static final boolean requiresEncoding(final char ch) {
468         return ch>127 || (CharacterEntityReference.getName(ch)!=null && (ch!='\'' || Config.IsApostropheEncoded));
469     }
470
471     static StringBuffer JavaDoc appendEncode(final StringBuffer JavaDoc sb, CharSequence JavaDoc unencodedText, final boolean whiteSpaceFormatting) {
472         if (unencodedText==null) return sb;
473         int beginPos=0;
474         int endPos=unencodedText.length();
475         if (unencodedText instanceof Segment) {
476             // this might improve performance slightly
477
final Segment segment=(Segment)unencodedText;
478             final int segmentOffset=segment.getBegin();
479             beginPos=segmentOffset;
480             endPos+=segmentOffset;
481             unencodedText=segment.source.string;
482         }
483         final boolean isApostropheEncoded=Config.IsApostropheEncoded;
484         for (int i=beginPos; i<endPos; i++) {
485             char ch=unencodedText.charAt(i);
486             final String JavaDoc characterEntityReferenceName=CharacterEntityReference.getName(ch);
487             if (characterEntityReferenceName!=null) {
488                 if (ch=='\'') {
489                     if (isApostropheEncoded)
490                         sb.append("&#39;");
491                     else
492                         sb.append(ch);
493                 } else {
494                     CharacterEntityReference.appendCharacterReferenceString(sb,characterEntityReferenceName);
495                 }
496             } else if (ch>127) {
497                 appendDecimalCharacterReferenceString(sb,ch);
498             } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) {
499                 sb.append(ch);
500             } else {
501                 // whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup
502
int spaceCount;
503                 int nexti=i+1;
504                 if (ch!=' ') {
505                     if (ch!='\t') {
506                         // must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string
507
if (ch=='\r' && nexti<endPos && unencodedText.charAt(nexti)=='\n') i++; // process cr/lf pair as one line break
508
sb.append("<br />"); // add line break
509
continue;
510                     } else {
511                         spaceCount=TAB_LENGTH;
512                     }
513                 } else {
514                     spaceCount=1;
515                 }
516                 while (nexti<endPos) {
517                     ch=unencodedText.charAt(nexti);
518                     if (ch==' ')
519                         spaceCount+=1;
520                     else if (ch=='\t')
521                         spaceCount+=TAB_LENGTH;
522                     else
523                         break;
524                     nexti++;
525                 }
526                 if (spaceCount==1) {
527                     // handle the very common case of a single character to improve efficiency slightly
528
sb.append(' ');
529                     continue;
530                 }
531                 if (spaceCount%2==1) sb.append(' '); // fist character is a space if we have an odd number of spaces
532
while (spaceCount>=2) {
533                     sb.append("&nbsp; "); // use alternating &nbsp; and spaces to keep original number of spaces
534
spaceCount-=2;
535                 }
536                 // note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
537
i=nexti-1; // minus 1 because top level for loop will add it again
538
}
539         }
540         return sb;
541     }
542
543     static CharacterReference findPreviousOrNext(final Source source, final int pos, final boolean previous) {
544         return findPreviousOrNext(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL,previous);
545     }
546
547     private static CharacterReference findPreviousOrNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings, final boolean previous) {
548         final ParseText parseText=source.getParseText();
549         pos=previous ? parseText.lastIndexOf('&',pos) : parseText.indexOf('&',pos);
550         while (pos!=-1) {
551             final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings);
552             if (characterReference!=null) return characterReference;
553             pos=previous ? parseText.lastIndexOf('&',pos-1) : parseText.indexOf('&',pos+1);
554         }
555         return null;
556     }
557
558     static final StringBuffer JavaDoc appendHexadecimalCharacterReferenceString(final StringBuffer JavaDoc sb, final int codePoint) {
559         return sb.append("&#x").append(Integer.toString(codePoint,16)).append(';');
560     }
561
562     static final StringBuffer JavaDoc appendDecimalCharacterReferenceString(final StringBuffer JavaDoc sb, final int codePoint) {
563         return sb.append("&#").append(codePoint).append(';');
564     }
565
566     private static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
567         try {
568             if (source.getParseText().charAt(begin)!='&') return null;
569             return (source.getParseText().charAt(begin+1)=='#')
570                 ? NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings)
571                 : CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
572         } catch (IndexOutOfBoundsException JavaDoc ex) {
573             return null;
574         }
575     }
576
577     private static StringBuffer JavaDoc appendDecode(final StringBuffer JavaDoc sb, final String JavaDoc encodedString, int pos, final boolean insideAttributeValue) {
578         final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
579         int lastEnd=0;
580         final Source source=new Source(encodedString);
581         while (true) {
582             final CharacterReference characterReference=findPreviousOrNext(source,pos,unterminatedCharacterReferenceSettings,false);
583             if (characterReference==null) break;
584             if (lastEnd!=characterReference.getBegin()) Util.appendTo(sb,encodedString,lastEnd,characterReference.getBegin());
585             sb.append((char)characterReference.codePoint);
586             pos=lastEnd=characterReference.getEnd();
587         }
588         if (lastEnd!=encodedString.length()) Util.appendTo(sb,encodedString,lastEnd,encodedString.length());
589         return sb;
590     }
591 }
592
Popular Tags