KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > au > id > jericho > lib > html > NumericCharacterReference


1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2
// Version 2.2
3
// Copyright (C) 2006 Martin Jericho
4
// http://sourceforge.net/projects/jerichohtml/
5
//
6
// This library is free software; you can redistribute it and/or
7
// modify it under the terms of the GNU Lesser General Public
8
// License as published by the Free Software Foundation; either
9
// version 2.1 of the License, or (at your option) any later version.
10
// http://www.gnu.org/copyleft/lesser.html
11
//
12
// This library is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
// Lesser General Public License for more details.
16
//
17
// You should have received a copy of the GNU Lesser General Public
18
// License along with this library; if not, write to the Free Software
19
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20

21 package au.id.jericho.lib.html;
22
23 import java.util.*;
24
25 /**
26  * Represents an HTML <a target="_blank" HREF="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1">Numeric Character Reference</a>.
27  * <p>
28  * A numeric character reference can be one of two types:
29  * <dl>
30  * <dt><a name="DecimalCharacterReference">Decimal Character Reference</a>
31  * <dd>A numeric character reference specifying the unicode code point in decimal notation.<br />
32  * This is signified by the absence of an '<code>x</code>' character after the '<code>#</code>', (eg "<code>&amp;#62;</code>").
33  * <dt><a name="HexadecimalCharacterReference">Hexadecimal Character Reference</a>
34  * <dd>A numeric character reference specifying the unicode code point in hexadecimal notation.<br />
35  * This is signified by the presence of an '<code>x</code>' character after the '<code>#</code>', (eg "<code>&amp;#x3e;</code>").
36  * </dl>
37  * <p>
38  * Static methods to {@linkplain #encode(CharSequence) encode} and {@linkplain #decode(CharSequence) decode} strings
39  * and single characters can be found in the {@link CharacterReference} superclass.
40  * <p>
41  * <code>NumericCharacterReference</code> instances are obtained using one of the following methods:
42  * <ul>
43  * <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
44  * <li>{@link Source#findNextCharacterReference(int pos)}
45  * <li>{@link Source#findPreviousCharacterReference(int pos)}
46  * <li>{@link Segment#findAllCharacterReferences()}
47  * </ul>
48  *
49  * @see CharacterReference
50  * @see CharacterEntityReference
51  */

52 public class NumericCharacterReference extends CharacterReference {
53     private boolean hex;
54
55     private NumericCharacterReference(final Source source, final int begin, final int end, final int codePoint, final boolean hex) {
56         super(source,begin,end,codePoint);
57         this.hex=hex;
58     }
59
60     /**
61      * Indicates whether this numeric character reference specifies the unicode code point in decimal format.
62      * <p>
63      * A numeric character reference in decimal format is referred to in this library as a
64      * <a HREF="#DecimalCharacterReference">decimal character reference</a>.
65      *
66      * @return <code>true</code> if this numeric character reference specifies the unicode code point in decimal format, otherwise <code>false</code>.
67      * @see #isHexadecimal()
68      */

69     public boolean isDecimal() {
70         return !hex;
71     }
72
73     /**
74      * Indicates whether this numeric character reference specifies the unicode code point in hexadecimal format.
75      * <p>
76      * A numeric character reference in hexadecimal format is referred to in this library as a
77      * <a HREF="#HexadecimalCharacterReference">hexadecimal character reference</a>.
78      *
79      * @return <code>true</code> if this numeric character reference specifies the unicode code point in hexadecimal format, otherwise <code>false</code>.
80      * @see #isDecimal()
81      */

82     public boolean isHexadecimal() {
83         return hex;
84     }
85
86     /**
87      * Encodes the specified text, escaping special characters into numeric character references.
88      * <p>
89      * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
90      * <p>
91      * This method encodes all character references in <a HREF="#DecimalCharacterReference">decimal format</a>, and is exactly the same as calling
92      * {@link #encodeDecimal(CharSequence)}.
93      * <p>
94      * To encode text using both character entity references and numeric character references, use the<br />
95      * {@link CharacterReference#encode(CharSequence)} method instead.
96      * <p>
97      * To encode text using <a HREF="#HexadecimalCharacterReference">hexadecimal character references</a> only,
98      * use the {@link #encodeHexadecimal(CharSequence)} method instead.
99      *
100      * @param unencodedText the text to encode.
101      * @return the encoded string.
102      * @see #decode(CharSequence)
103      */

104     public static String JavaDoc encode(final CharSequence JavaDoc unencodedText) {
105         if (unencodedText==null) return null;
106         final StringBuffer JavaDoc sb=new StringBuffer JavaDoc(unencodedText.length()*2);
107         for (int i=0; i<unencodedText.length(); i++) {
108             final char ch=unencodedText.charAt(i);
109             if (requiresEncoding(ch)) {
110                 appendDecimalCharacterReferenceString(sb,ch);
111             } else {
112                 sb.append(ch);
113             }
114         }
115         return sb.toString();
116     }
117
118     /**
119      * Encodes the specified text, escaping special characters into <a HREF="#DecimalCharacterReference">decimal character references</a>.
120      * <p>
121      * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
122      * <p>
123      * To encode text using both character entity references and numeric character references, use the<br />
124      * {@link CharacterReference#encode(CharSequence)} method instead.
125      * <p>
126      * To encode text using <a HREF="#HexadecimalCharacterReference">hexadecimal character references</a> only,
127      * use the {@link #encodeHexadecimal(CharSequence)} method instead.
128      *
129      * @param unencodedText the text to encode.
130      * @return the encoded string.
131      * @see #decode(CharSequence)
132      */

133     public static String JavaDoc encodeDecimal(final CharSequence JavaDoc unencodedText) {
134         return encode(unencodedText);
135     }
136
137     /**
138      * Encodes the specified text, escaping special characters into <a HREF="#HexadecimalCharacterReference">hexadecimal character references</a>.
139      * <p>
140      * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
141      * <p>
142      * To encode text using both character entity references and numeric character references, use the<br />
143      * {@link CharacterReference#encode(CharSequence)} method instead.
144      * <p>
145      * To encode text using <a HREF="#DecimalCharacterReference">decimal character references</a> only,
146      * use the {@link #encodeDecimal(CharSequence)} method instead.
147      *
148      * @param unencodedText the text to encode.
149      * @return the encoded string.
150      * @see #decode(CharSequence)
151      */

152     public static String JavaDoc encodeHexadecimal(final CharSequence JavaDoc unencodedText) {
153         if (unencodedText==null) return null;
154         final StringBuffer JavaDoc sb=new StringBuffer JavaDoc(unencodedText.length()*2);
155         for (int i=0; i<unencodedText.length(); i++) {
156             final char ch=unencodedText.charAt(i);
157             if (requiresEncoding(ch)) {
158                 appendHexadecimalCharacterReferenceString(sb,ch);
159             } else {
160                 sb.append(ch);
161             }
162         }
163         return sb.toString();
164     }
165
166     /**
167      * Returns the correct encoded form of this numeric character reference.
168      * <p>
169      * The returned string uses the same radix as the original character reference in the source document,
170      * i.e. decimal format if {@link #isDecimal()} is <code>true</code>, and hexadecimal format if {@link #isHexadecimal()} is <code>true</code>.
171      * <p>
172      * Note that the returned string is not necessarily the same as the original source text used to create this object.
173      * This library recognises certain invalid forms of character references,
174      * as detailed in the {@link #decode(CharSequence) decode(CharSequence)} method.
175      * <p>
176      * To retrieve the original source text, use the {@link #toString() toString()} method instead.
177      * <p>
178      * <dl>
179      * <dt>Example:</dt>
180      * <dd><code>CharacterReference.parse("&amp;#62").getCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
181      * </dl>
182      *
183      * @return the correct encoded form of this numeric character reference.
184      * @see CharacterReference#getCharacterReferenceString(int codePoint)
185      */

186     public String JavaDoc getCharacterReferenceString() {
187         return hex ? getHexadecimalCharacterReferenceString(codePoint) : getDecimalCharacterReferenceString(codePoint);
188     }
189
190     /**
191      * Returns the numeric character reference encoded form of the specified unicode code point.
192      * <p>
193      * This method returns the character reference in decimal format, and is exactly the same as calling
194      * {@link #getDecimalCharacterReferenceString(int codePoint)}.
195      * <p>
196      * To get either the character entity reference or numeric character reference, use the<br />
197      * {@link CharacterReference#getCharacterReferenceString(int codePoint)} method instead.
198      * <p>
199      * To get the character reference in hexadecimal format, use the {@link #getHexadecimalCharacterReferenceString(int codePoint)} method instead.
200      * <p>
201      * <dl>
202      * <dt>Examples:</dt>
203      * <dd><code>NumericCharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;#62;</code>"</dd>
204      * <dd><code>NumericCharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
205      * </dl>
206      *
207      * @return the numeric character reference encoded form of the specified unicode code point.
208      * @see CharacterReference#getCharacterReferenceString(int codePoint)
209      */

210     public static String JavaDoc getCharacterReferenceString(final int codePoint) {
211         return getDecimalCharacterReferenceString(codePoint);
212     }
213
214     static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
215         // only called from CharacterReference.construct(), so we can assume that first characters are "&#"
216
final ParseText parseText=source.getParseText();
217         int codePointStringBegin=begin+2;
218         boolean hex;
219         if (hex=(parseText.charAt(codePointStringBegin)=='x')) codePointStringBegin++;
220         final int unterminatedMaxCodePoint=hex ? unterminatedCharacterReferenceSettings.hexadecimalCharacterReferenceMaxCodePoint : unterminatedCharacterReferenceSettings.decimalCharacterReferenceMaxCodePoint;
221         final int maxSourcePos=parseText.length()-1;
222         String JavaDoc codePointString;
223         int end;
224         int x=codePointStringBegin;
225         boolean unterminated=false;
226         while (true) {
227             final char ch=parseText.charAt(x);
228             if (ch==';') {
229                 end=x+1;
230                 codePointString=parseText.substring(codePointStringBegin,x);
231                 break;
232             }
233             if ((ch<'0' || ch>'9') && (!hex || ch<'a' || ch>'f')) {
234                 // At this point we were either expecting a decimal digit (if hex is false), or a hexadecimal digit (if hex is true),
235
// but have found something else, meaning the character reference is unterminated.
236
unterminated=true;
237             } else if (x==maxSourcePos) {
238                 // At this point, we have a valid digit but are at the last position in the source text without the terminating semicolon.
239
unterminated=true;
240                 x++; // include this digit
241
}
242             if (unterminated) {
243                 // Different browsers react differently to unterminated numeric character references.
244
// The behaviour of this method is determined by the settings in the unterminatedCharacterReferenceSettings parameter.
245
if (unterminatedMaxCodePoint==INVALID_CODE_POINT) {
246                     // reject:
247
return null;
248                 } else {
249                     // accept:
250
end=x;
251                     codePointString=parseText.substring(codePointStringBegin,x);
252                     break;
253                 }
254             }
255             x++;
256         }
257         if (codePointString.length()==0) return null;
258         int codePoint=INVALID_CODE_POINT;
259         try {
260             codePoint=Integer.parseInt(codePointString,hex?16:10);
261             if (unterminated && codePoint>unterminatedMaxCodePoint) return null;
262             if (codePoint>MAX_CODE_POINT) codePoint=INVALID_CODE_POINT;
263         } catch (NumberFormatException JavaDoc ex) {
264             // This should only happen if number is larger than Integer.MAX_VALUE.
265
if (unterminated) return null;
266             // If it is a terminated reference just ignore the exception as codePoint will remain with its value of INVALID_CODE_POINT.
267
}
268         return new NumericCharacterReference(source,begin,end,codePoint,hex);
269     }
270
271     public String JavaDoc getDebugInfo() {
272         final StringBuffer JavaDoc sb=new StringBuffer JavaDoc();
273         sb.append('"');
274         if (hex)
275             appendHexadecimalCharacterReferenceString(sb,codePoint);
276         else
277             appendDecimalCharacterReferenceString(sb,codePoint);
278         sb.append("\" ");
279         appendUnicodeText(sb,codePoint);
280         sb.append(' ').append(super.getDebugInfo());
281         return sb.toString();
282     }
283 }
284
285
Popular Tags