KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > au > id > jericho > lib > html > Source


1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2
// Version 2.2
3
// Copyright (C) 2006 Martin Jericho
4
// http://sourceforge.net/projects/jerichohtml/
5
//
6
// This library is free software; you can redistribute it and/or
7
// modify it under the terms of the GNU Lesser General Public
8
// License as published by the Free Software Foundation; either
9
// version 2.1 of the License, or (at your option) any later version.
10
// http://www.gnu.org/copyleft/lesser.html
11
//
12
// This library is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
// Lesser General Public License for more details.
16
//
17
// You should have received a copy of the GNU Lesser General Public
18
// License along with this library; if not, write to the Free Software
19
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20

21 package au.id.jericho.lib.html;
22
23 import java.util.*;
24 import java.io.*;
25 import java.net.*;
26
27 /**
28  * Represents a source HTML document.
29  * <p>
30  * The first step in parsing an HTML document is always to construct a <code>Source</code> object from the source data, which can be a
31  * <code>String</code>, <code>Reader</code>, <code>InputStream</code> or <code>URL</code>.
32  * Each constructor uses all the evidence available to determine the original {@linkplain #getEncoding() character encoding} of the data.
33  * <p>
34  * Once the <code>Source</code> object has been created, you can immediately start searching for {@linkplain Tag tags} or {@linkplain Element elements} within the document
35  * using the <a HREF="Tag.html#TagSearchMethods">tag search methods</a>.
36  * It is strongly advised however to first think about how many of the document's tags you will need to parse.
37  * If you will be searching for all or most of the tags, performance can be greatly improved by first calling the {@link #fullSequentialParse()} method.
38  * If you only need to parse a few tags, performance will probably be better if you use the default <a HREF="#ParseOnDemand">parse on demand</a> mode.
39  * <p>
40  * It can also be useful to {@linkplain #setLogWriter(Writer) set the location of the log writer} before calling any tag search methods
41  * so that important log messages can be traced while the document is being parsed.
42  * <p>
43  * Note that many of the useful functions which can be performed on the source document are
44  * defined in its superclass, {@link Segment}.
45  * The source object is itself a segment which spans the entire document.
46  * <p>
47  * Most of the methods defined in this class are useful for determining the elements and tags
48  * surrounding or neighbouring a particular character position in the document.
49  * <p>
50  * For information on how to create a modified version of this source document, see the {@link OutputDocument} class.
51  *
52  * @see Segment
53  */

54 public class Source extends Segment {
55     final String JavaDoc string;
56     String JavaDoc documentSpecifiedEncoding=UNINITIALISED;
57     String JavaDoc encoding=UNINITIALISED;
58     String JavaDoc encodingSpecificationInfo;
59     private ParseText parseText=null;
60     private OutputDocument parseTextOutputDocument=null;
61     private Writer logWriter=null;
62     private RowColumnVector[] rowColumnVectorCacheArray=null;
63     final Cache cache=new Cache(this);
64     boolean useAllTypesCache=true;
65     boolean useSpecialTypesCache=true;
66     int endOfLastTagIgnoringEnclosedMarkup=-1; // Always has a value of -1 unless doing full sequential parse. Used in TagType.isValidPosition() method.
67
// cached result lists:
68
Tag[] allTagsArray=null; // non-null iff fullSequentialParse was called
69
List allTags=null;
70     List allStartTags=null;
71     private List allElements=null;
72
73     private static final String JavaDoc UNINITIALISED="";
74
75     /**
76      * Constructs a new <code>Source</code> object from the specified text.
77      * @param text the source text.
78      * @see #setLogWriter(Writer)
79      */

80     public Source(final CharSequence JavaDoc text) {
81         super(text.length());
82         string=text.toString();
83     }
84
85     private Source(final EncodedSource encodedSource) throws IOException {
86         this(Util.getString(encodedSource.Reader));
87         encoding=encodedSource.Encoding;
88         encodingSpecificationInfo=encodedSource.EncodingSpecificationInfo;
89         // if (encodedSource.HttpURLConnection!=null) encodedSource.HttpURLConnection.disconnect();
90
}
91
92     private Source(final Reader reader, final String JavaDoc inputStreamReaderEncoding) throws IOException {
93         this(Util.getString(reader));
94         if (inputStreamReaderEncoding!=null) {
95             encoding=inputStreamReaderEncoding;
96             encodingSpecificationInfo="InputStreamReader.getEncoding() of constructor argument";
97         }
98     }
99
100     /**
101      * Constructs a new <code>Source</code> object by loading the content from the specified <code>Reader</code>.
102      * <p>
103      * If the specified reader is an instance of <code>InputStreamReader</code>, the {@link #getEncoding()} method of the
104      * created source object returns the encoding from <code>InputStreamReader.getEncoding()</code>.
105      *
106      * @param reader the <code>java.io.Reader</code> from which to load the source text.
107      * @throws java.io.IOException if an I/O error occurs.
108      * @see #setLogWriter(Writer)
109      */

110     public Source(final Reader reader) throws IOException {
111         this(reader,(reader instanceof InputStreamReader) ? ((InputStreamReader)reader).getEncoding() : null);
112     }
113
114     /**
115      * Constructs a new <code>Source</code> object by loading the content from the specified <code>InputStream</code>.
116      * <p>
117      * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document from the raw bytes
118      * of the specified input stream is the same as that for the {@link #Source(URL)} constructor with the following exceptions:
119      * <ul class="HalfSeparated">
120      * <li>Step 1 is not possible as there is no <code>Content-Type</code> header to check.
121      * <li>Step 6 is not performed as it is not possible to know whether the input stream was acquired from an HTTP connection.
122      * </ul>
123      *
124      * @param inputStream the <code>java.io.InputStream</code> from which to load the source text.
125      * @throws java.io.IOException if an I/O error occurs.
126      * @see #getEncoding()
127      * @see #setLogWriter(Writer)
128      */

129     public Source(final InputStream inputStream) throws IOException {
130         this(EncodedSource.construct(inputStream,null));
131     }
132
133     /**
134      * Constructs a new <code>Source</code> object by loading the content from the specified URL.
135      * <p>
136      * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document is as follows:
137      * <ol class="HalfSeparated">
138      * <li>If the <code>URLConnection.getContentType()</code> specifies an encoding
139      * (where a <code>charset</code> parameter is included in the value of the stream's
140      * <a target="_blank" HREF="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header),
141      * then this is used to decode the input stream and is returned verbatim by the {@link #getEncoding()}
142      * method of the created source object. Otherwise:
143      * <li>Get the content input stream via the <code>URLConnection.getInputStream()</code> method.
144      * <li>If the input stream is empty, the created source document has zero length and its {@link #getEncoding()} method
145      * returns <code>null</code>. Otherwise:
146      * <li>Determine a <i>preliminary encoding</i> by examining the first 4 bytes of the input stream:
147      * <ul class="Unseparated">
148      * <li>If the first two bytes match the byte order mark (U+FEFF) in either big or little endian order:
149      * <ul>
150      * <li>If the third byte is 00, assume a 32-bit encoding (UTF-32).
151      * <li>Otherwise, assume a 16-bit encoding (UTF-16).
152      * </ul>
153      * <li>If the first byte is 00:
154      * <ul>
155      * <li>If the second or fourth byte is 00, assume a 32-bit encoding (UTF-32).
156      * <li>Otherwise, assume a big endian 16-bit encoding without byte order mark (UTF-16BE).
157      * </ul>
158      * <li>If the second byte is 00:
159      * <ul>
160      * <li>If the third byte is 00, assume a 32-bit encoding (UTF-32).
161      * <li>Otherwise, assume a little endian 16-bit encoding without byte order mark (UTF-16LE).
162      * </ul>
163      * <li>If the first four bytes match the EBCDIC encoding of "<code>&lt;?xm</code>", the preliminary encoding is Cp037.
164      * <li>Otherwise, assume an 8-bit encoding (UTF-8).
165      * </ul>
166      * <li>Preview the first 2048 characters of the source document (hereafter referred to as the <i>preview segment</i>)
167      * using the preliminary encoding. If the preview segment contains an <a HREF="#EncodingSpecification">encoding specification</a>
168      * (which is always at or near the top of the document),
169      * the specified encoding is used to decode the input stream and is returned verbatim
170      * by the {@link #getEncoding()} method of the created source object. Otherwise:
171      * <li>If the preview segment does not contain an encoding specification, and the <code>URLConnection</code> is an instance of
172      * <code>HttpURLConnection</code>, then the
173      * <a target="_blank" HREF="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1">HTTP protocol section 3.7.1</a>
174      * specifies that an encoding of ISO-8859-1 can be assumed.
175      * An XML document should not assume this as it would require an XML declaration to specify this encoding, which would have been
176      * detected in one of the previous steps. So if the preview segment {@linkplain #isXML() is not determined to be XML},
177      * and the preliminary encoding is 8-bit, then the encoding ISO-8859-1 is used to decode the input stream
178      * and is returned by the {@link #getEncoding()} method of the created source object.
179      * <li>Otherwise, the preliminary encoding is used to decode the input stream
180      * and is returned by the {@link #getEncoding()} method of the created source object.
181      * </ol>
182      *
183      * @param url the URL from which to load the source text.
184      * @throws java.io.IOException if an I/O error occurs.
185      * @see #getEncoding()
186      * @see #setLogWriter(Writer)
187      */

188     public Source(final URL url) throws IOException {
189         this(EncodedSource.construct(url));
190     }
191
192     private String JavaDoc setEncoding(final String JavaDoc encoding, final String JavaDoc encodingSpecificationInfo) {
193         if (this.encoding==UNINITIALISED) {
194             this.encoding=encoding;
195             this.encodingSpecificationInfo=encodingSpecificationInfo;
196         }
197         return encoding;
198     }
199
200     /**
201      * Returns the document {@linkplain #getEncoding() encoding} specified within the text of the document.
202      * <p>
203      * The document encoding can be specified within the document text in two ways.
204      * They are referred to generically in this library as an <i><a name="EncodingSpecification">encoding specification</a></i>,
205      * and are listed below in order of precedence:
206      * <ol class="HalfSeparated">
207      * <li>
208      * An <a target="_blank" HREF="http://www.w3.org/TR/REC-xml/#sec-TextDecl">XML text declaration</a> at the start of the document,
209      * which is essentially an {@linkplain StartTagType#XML_DECLARATION XML declaration} with an <code>encoding</code> attribute.
210      * This is only used in XML documents, and must be present if an XML document has an encoding other than UTF-8 or UTF-16.
211      * <pre>&lt;?xml version="1.0" encoding="ISO-8859-1" ?&gt;</pre>
212      * <li>
213      * A <a target="_blank" HREF="http://www.w3.org/TR/html401/charset.html#spec-char-encoding">META declaration</a>,
214      * which is in the form of a {@link HTMLElementName#META META} tag with attribute <code>http-equiv="Content-Type"</code>.
215      * The encoding is specified in the <code>charset</code> parameter of a
216      * <code><a target="_blank" HREF="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a></code>
217      * HTTP header value, which is placed in the value of the meta tag's <code>content</code> attribute.
218      * This META declaration should appear as early as possible in the {@link HTMLElementName#HEAD HEAD} element.
219      * <pre>&lt;META http-equiv=Content-Type content="text/html; charset=iso-8859-1"&gt;</pre>
220      * </ol>
221      * <p>
222      * Both of these tags must only use unicode characters in the range U+0000 to U+007F, and in the case of the META declaration
223      * must use ASCII encoding. This, along with the fact that they must occur at or near the beginning of the document,
224      * assists in their detection and decoding without the need to know the exact encoding of the full text.
225      *
226      * @return the document {@linkplain #getEncoding() encoding} specified within the text of the document.
227      * @see #getEncoding()
228      */

229     public String JavaDoc getDocumentSpecifiedEncoding() {
230         if (documentSpecifiedEncoding!=UNINITIALISED) return documentSpecifiedEncoding;
231         final Tag xmlDeclarationTag=getTagAt(0);
232         if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) {
233             documentSpecifiedEncoding=((StartTag)xmlDeclarationTag).getAttributeValue("encoding");
234             if (documentSpecifiedEncoding!=null) return setEncoding(documentSpecifiedEncoding,xmlDeclarationTag.toString());
235         }
236         // Check for Content-Type http-equiv meta tag:
237
final StartTag contentTypeMetaTag=findNextStartTag(0,"http-equiv","Content-Type",false);
238         if (contentTypeMetaTag!=null) {
239             final String JavaDoc contentValue=contentTypeMetaTag.getAttributeValue("content");
240             if (contentValue!=null) {
241                 documentSpecifiedEncoding=getCharsetParameterFromHttpHeaderValue(contentValue);
242                 if (documentSpecifiedEncoding!=null) return setEncoding(documentSpecifiedEncoding,contentTypeMetaTag.toString());
243             }
244         }
245         return setEncoding(null,"no encoding specified in document");
246     }
247
248     /**
249      * Returns the original encoding of the source document.
250      * <p>
251      * The encoding of a document defines how the original byte stream was encoded into characters.
252      * The <a target="_blank" HREF="http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.4">HTTP specification section 3.4</a>
253      * defines the term "character set" to refer to the encoding, and the term "charset" is similarly used in Java
254      * (see the class <code>java.nio.charset.Charset</code>). This is an unfortunate convention that often causes confusion,
255      * as a character set is not the same thing as a character encoding.
256      * For example, the <a target="_blank" HREF="http://www.unicode.org/">Unicode</a> character set has several encodings, such as
257      * <a target="_blank" HREF="http://www.unicode.org/faq/utf_bom.html">UTF-8, UTF-16, and UTF-32</a>.
258      * <p>
259      * This method makes the best possible effort to return the name of the encoding used to decode the original source text byte stream
260      * into character data. This decoding takes place in the constructor when a parameter based on a byte stream such as an
261      * <code>InputStream</code> or <code>URL</code> is used to specify the source text.
262      * The documentation of the {@link #Source(InputStream)} and {@link #Source(URL)} constructors describe how the return value of this
263      * method is determined in these cases.
264      * It is also possible in some circumstances for the encoding to be determined in the {@link #Source(Reader)} constructor.
265      * <p>
266      * If a constructor was used that specifies the source text directly in character form (not requiring the decoding of a byte sequence)
267      * then the document itself is searched for an <a HREF="#EncodingSpecification">encoding specification</a>. In this case, this
268      * method returns the same value as the {@link #getDocumentSpecifiedEncoding()} method.
269      * <p>
270      * The {@link #getEncodingSpecificationInfo()} method returns a simple description of how the value of this method was determined.
271      *
272      * @return the original encoding of the source document.
273      * @see #getEncodingSpecificationInfo()
274      */

275     public String JavaDoc getEncoding() {
276         if (encoding==UNINITIALISED) getDocumentSpecifiedEncoding();
277         return encoding;
278     }
279
280     /**
281      * Returns a simple description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
282      * <p>
283      * The description is intended for informational purposes only.
284      * It is not guaranteed to have any particular format and can not be reliably parsed.
285      *
286      * @return a simple description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
287      * @see #getEncoding()
288      */

289     public String JavaDoc getEncodingSpecificationInfo() {
290         if (encoding==UNINITIALISED) getDocumentSpecifiedEncoding();
291         return encodingSpecificationInfo;
292     }
293
294     /**
295      * Indicates whether the source document is likely to be <a target="_blank" HREF="http://www.w3.org/TR/REC-xml/">XML</a>.
296      * <p>
297      * The algorithm used to determine this is designed to be relatively inexpensive and to provide an accurate result in
298      * most normal situations.
299      * An exact determination of whether the source document is XML would require a much more complex analysis of the text.
300      * <p>
301      * The algorithm is as follows:
302      * <p>
303      * <ol>
304      * <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document.
305      * <li>If the document contains a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text
306      * "<code>xhtml</code>", it is an <a target="_blank" HREF="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence
307      * also an XML document.
308      * <li>If the document does NOT have an {@link HTMLElementName#HTML HTML} element, assume it is XML.
309      * This assumption is based on the premise that the library is used to parse HTML or XML documents only.
310      * <li>If none of the above conditions are met, assume the document is normal HTML, and therefore not an XML document.
311      * </ol>
312      *
313      * @return <code>true</code> if the source document is likely to be <a target="_blank" HREF="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>.
314      */

315     public boolean isXML() {
316         final Tag xmlDeclarationTag=getTagAt(0);
317         if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) return true;
318         final Tag doctypeTag=findNextTag(0,StartTagType.DOCTYPE_DECLARATION);
319         // if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document:
320
if (doctypeTag!=null && getParseText().indexOf("xhtml",doctypeTag.begin,doctypeTag.end)!=-1) return true;
321         // if document doesn't have an HTML element, it is also most likely an XML document, otherwise assume it is normal HTML:
322
return findNextStartTag(0,HTMLElementName.HTML)==null;
323     }
324
325     /**
326      * Returns the row number of the specified character position in the source document.
327      * @param pos the position in the source document.
328      * @return the row number of the specified character position in the source document.
329      * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
330      * @see #getColumn(int pos)
331      * @see #getRowColumnVector(int pos)
332      */

333     public int getRow(final int pos) {
334         return getRowColumnVector(pos).getRow();
335     }
336
337     /**
338      * Returns the column number of the specified character position in the source document.
339      * @param pos the position in the source document.
340      * @return the column number of the specified character position in the source document.
341      * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
342      * @see #getRow(int pos)
343      * @see #getRowColumnVector(int pos)
344      */

345     public int getColumn(final int pos) {
346         return getRowColumnVector(pos).getColumn();
347     }
348
349     /**
350      * Returns a {@link RowColumnVector} object representing the row and column number of the specified character position in the source document.
351      * @param pos the position in the source document.
352      * @return a {@link RowColumnVector} object representing the row and column number of the specified character position in the source document.
353      * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
354      * @see #getRow(int pos)
355      * @see #getColumn(int pos)
356      */

357     public RowColumnVector getRowColumnVector(final int pos) {
358         if (pos>end) throw new IndexOutOfBoundsException JavaDoc();
359         if (rowColumnVectorCacheArray==null) rowColumnVectorCacheArray=RowColumnVector.getCacheArray(this);
360         return RowColumnVector.get(rowColumnVectorCacheArray,pos);
361     }
362     
363     /**
364      * Returns the source text as a <code>String</code>.
365      * @return the source text as a <code>String</code>.
366      */

367     public String JavaDoc toString() {
368         return string;
369     }
370
371     /**
372      * Parses all of the {@linkplain Tag tags} in this source document sequentially from beginning to end.
373      * <p>
374      * Calling this method can greatly improve performance if most or all of the tags in the document need to be parsed.
375      * It is typically called before any of the <a HREF="Tag.html#TagSearchMethods">tag search methods</a> are called on this <code>Source</code> object,
376      * directly after {@linkplain #setLogWriter(Writer) setting the location of the log writer}.
377      * <p>
378      * By default, tags are parsed only as needed, which is referred to as <i><a name="ParseOnDemand">parse on demand</a></i> mode.
379      * In this mode, every call to a tag search method that is not returning previously cached tags must perform a relatively complex check to determine whether a
380      * potential tag is in a {@linkplain TagType#isValidPosition(Source,int) valid position}.
381      * <p>
382      * Generally speaking, a tag is in a valid position if it does not appear inside any other tag.
383      * {@linkplain TagType#isServerTag() Server tags} can appear anywhere in a document, including inside other tags, so this relates only to non-server tags.
384      * Theoretically, checking whether a specified position in the document is enclosed in another tag is only possible if every preceding tag has been parsed,
385      * otherwise it is impossible to tell whether one of the delimiters of the enclosing tag was in fact enclosed by some other tag before it, thereby invalidating it.
386      * <p>
387      * When this method is called, each tag is parsed in sequence starting from the beginning of the document, making it easy to check whether each potential
388      * tag is in a valid position.
389      * In <i>parse on demand</i> mode a compromise technique must be used for this check, since the theoretical requirement of having parsed all preceding tags
390      * is no longer practical.
391      * This compromise involves only checking whether the position is enclosed by other tags with {@linkplain TagType#getTagTypesIgnoringEnclosedMarkup() certain tag types}.
392      * The added complexity of this technique makes parsing each tag slower compared to when a full sequential parse is performed, but when only a few tags need
393      * parsing this is an extremely beneficial trade-off.
394      * <p>
395      * The documentation of the {@link TagType#isValidPosition(Source, int pos)} method, which is called internally by the parser to perform the valid position check,
396      * includes a more detailed explanation of the differences between the two modes of operation.
397      * <p>
398      * If the {@link #findAllTags()}, {@link #findAllStartTags()} or {@link #findAllElements()} method is called on the <code>Source</code> object
399      * without having called this method first, a {@linkplain #setLogWriter(Writer) log} message is generated recommending its use.
400      * <p>
401      * This method returns the same list of tags as the {@link Source#findAllTags() Source.findAllTags()} method, but as an array instead of a list.
402      * <p>
403      * If this method is called after any of the <a HREF="Tag.html#TagSearchMethods">tag search methods</a> are called,
404      * the {@linkplain #getCacheDebugInfo() cache} is cleared of any previously found tags before being restocked via the full sequential parse.
405      * This is significant if the {@link Segment#ignoreWhenParsing()} method has been called since the tags were first found, as any tags inside the
406      * ignored segments will no longer be returned by any of the <a HREF="Tag.html#TagSearchMethods">tag search methods</a>.
407      * <p>
408      * See also the {@link Tag} class documentation for more general details about how tags are parsed.
409      *
410      * @return an array of all {@linkplain Tag tags} in this source document.
411      */

412     public Tag[] fullSequentialParse() {
413         // The assumeNoNestedTags flag tells the parser not to bother checking for tags inside other tags
414
// if the user knows that the document doesn't contain any server tags.
415
// This results in a more efficient search, but the difference during benchmark tests was only minimal -
416
// about 12% speed improvement in a 1MB document containing 70,000 tags, 75% of which were inside a comment tag.
417
// With such a small improvement in a document specifically designed to show an an exaggerated improvement,
418
// it is not worth documenting this feature.
419
// The flag has been retained internally however as it does not have a measurable performance impact to check for it.
420
final boolean assumeNoNestedTags=false;
421         if (cache.getTagCount()!=0) cache.clear();
422         final boolean useAllTypesCacheSave=useAllTypesCache;
423         try {
424             useAllTypesCache=false;
425             useSpecialTypesCache=false;
426             return Tag.parseAll(this,assumeNoNestedTags);
427         } finally {
428             useAllTypesCache=useAllTypesCacheSave;
429             useSpecialTypesCache=true;
430             endOfLastTagIgnoringEnclosedMarkup=-1;
431         }
432     }
433
434     /**
435      * Returns a list of the top-level {@linkplain Element elements} in the document element hierarchy.
436      * <p>
437      * The {@link Source#fullSequentialParse()} method should be called after construction of the <code>Source</code> object if this method is to be used.
438      * <p>
439      * The objects in the list are all of type {@link Element}.
440      * <p>
441      * The term <i><a name="TopLevelElement">top-level element</a></i> refers to an element that is not nested within any other element in the document.
442      * <p>
443      * The term <i><a name="DocumentElementHierarchy">document element hierarchy</a></i> refers to the hierarchy of elements that make up this source document.
444      * While the document itself is theoretically at the top of the hierarchy, this library only considers {@link Element} objects to be part of the hierarchy,
445      * so the top-level elements are the immediate children of the source document.
446      * <p>
447      * The {@link Element#getChildElements()} method can be used to get the descendants of the top-level elements.
448      * <p>
449      * The document element hierarchy differs from that of the <a target="_blank" HREF="http://en.wikipedia.org/wiki/Document_Object_Model">Document Object Model</a>
450      * in that it is only a representation of the elements that are physically present in the source text. Unlike the DOM, it does not include any "implied" HTML elements
451      * such as {@link HTMLElementName#TBODY TBODY} if they are not present in the source text.
452      * <p>
453      * Elements formed from {@linkplain TagType#isServerTag() server tags} are not included in the hierarchy at all.
454      * <p>
455      * Structural errors in this source document such as overlapping elements are reported in the {@linkplain #setLogWriter(Writer) log}.
456      * In the case that two elements are found to overlap, the position of the start tag determines the location of the element in the hierarchy.
457      * <p>
458      * A visual representation of the document element hierarchy can be obtained by calling
459      * {@link #indent(String,boolean,boolean,boolean) indent("&nbsp;&nbsp;",true,true,true)}.
460      *
461      * @return a list of the top-level {@linkplain Element elements} in the document element hierarchy, guaranteed not <code>null</code>.
462      * @see Element#getParentElement()
463      * @see Element#getChildElements()
464      * @see Element#getDepth()
465      */

466     public List getChildElements() {
467         if (childElements==null) {
468             if (length()==0) {
469                 childElements=Collections.EMPTY_LIST;
470             } else {
471                 if (allTags==null) log("NOTE: Calling Source.fullSequentialParse() can significantly improve the performance of this operation");
472                 childElements=new ArrayList();
473                 int pos=0;
474                 while (true) {
475                     final StartTag childStartTag=source.findNextStartTag(pos);
476                     if (childStartTag==null) break;
477                     if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
478                         pos=childStartTag.end;
479                         continue;
480                     }
481                     final Element childElement=childStartTag.getElement();
482                     childElement.parentElement=null;
483                     childElements.add(childElement);
484                     childElement.getChildElements(0);
485                     pos=childElement.end;
486                 }
487             }
488         }
489         return childElements;
490     }
491
492     /**
493      * Returns a list of all {@linkplain Tag tags} in this source document.
494      * <p>
495      * The {@link #fullSequentialParse()} method should be called after construction of the <code>Source</code> object if this method is to be used.
496      * <p>
497      * See the {@link Tag} class documentation for more details about the behaviour of this method.
498      *
499      * @return a list of all {@linkplain Tag tags} in this source document.
500      */

501     public List findAllTags() {
502         if (allTags==null) {
503             log("NOTE: Calling Source.fullSequentialParse() can significantly improve the performance of this operation");
504             allTags=super.findAllTags();
505         }
506         return allTags;
507     }
508
509     /**
510      * Returns a list of all {@linkplain StartTag start tags} in this source document.
511      * <p>
512      * The {@link #fullSequentialParse()} method should be called after construction of the <code>Source</code> object if this method is to be used.
513      * <p>
514      * See the {@link Tag} class documentation for more details about the behaviour of this method.
515      *
516      * @return a list of all {@linkplain StartTag start tags} in this source document.
517      */

518     public List findAllStartTags() {
519         if (allStartTags==null) {
520             final List allTags=findAllTags();
521             allStartTags=new ArrayList(allTags.size());
522             for (final Iterator i=allTags.iterator(); i.hasNext();) {
523                 final Object JavaDoc next=i.next();
524                 if (next instanceof StartTag) allStartTags.add(next);
525             }
526         }
527         return allStartTags;
528     }
529
530     /**
531      * Returns a list of all {@linkplain Element elements} in this source document.
532      * <p>
533      * The {@link #fullSequentialParse()} method should be called after construction of the <code>Source</code> object if this method is to be used.
534      * <p>
535      * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags()} method.
536      *
537      * @return a list of all {@linkplain Element elements} in this source document.
538      */

539     public List findAllElements() {
540         if (allElements==null) {
541             final List allStartTags=findAllStartTags();
542             if (allStartTags.isEmpty()) return Collections.EMPTY_LIST;
543             allElements=new ArrayList(allStartTags.size());
544             for (final Iterator i=allStartTags.iterator(); i.hasNext();) {
545                 final StartTag startTag=(StartTag)i.next();
546                 allElements.add(startTag.getElement());
547             }
548         }
549         return allElements;
550     }
551
552     /**
553      * Returns the {@link Element} with the specified <code>id</code> attribute value.
554      * <p>
555      * This simulates the script method
556      * <code><a target="_blank" HREF="http://www.w3.org/TR/1998/REC-DOM-Level-1-19981001/level-one-html.html#ID-36113835">getElementById</a></code>
557      * defined in DOM HTML level 1.
558      * <p>
559      * This is equivalent to {@link #findNextStartTag(int,String,String,boolean) findNextStartTag}<code>(0,"id",id,true).</code>{@link StartTag#getElement() getElement()}, assuming that the element exists.
560      * <p>
561      * A well formed HTML document should have no more than one element with any given <code>id</code> attribute value.
562      *
563      * @param id the <code>id</code> attribute value (case sensitive) to search for, must not be <code>null</code>.
564      * @return the {@link Element} with the specified <code>id</code> attribute value, or <code>null</code> if no such element exists.
565      */

566     public Element getElementById(final String JavaDoc id) {
567         final StartTag startTag=findNextStartTag(0,Attribute.ID,id,true);
568         return startTag==null ? null : startTag.getElement();
569     }
570
571     /**
572      * Returns the {@link Tag} at the specified position in the source document.
573      * <p>
574      * See the {@link Tag} class documentation for more details about the behaviour of this method.
575      * <p>
576      * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags.
577      *
578      * @param pos the position in the source document, may be out of bounds.
579      * @return the {@link Tag} at the specified position in the source document, or <code>null</code> if no tag exists at the specified position or it is out of bounds.
580      */

581     public final Tag getTagAt(final int pos) {
582         return Tag.getTagAt(this,pos);
583     }
584
585     /**
586      * Returns the {@link Tag} beginning at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
587      * <p>
588      * See the {@link Tag} class documentation for more details about the behaviour of this method.
589      *
590      * @param pos the position in the source document from which to start the search, may be out of bounds.
591      * @return the {@link Tag} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
592      */

593     public Tag findPreviousTag(final int pos) {
594         return Tag.findPreviousOrNextTag(this,pos,true);
595     }
596
597     /**
598      * Returns the {@link Tag} of the specified {@linkplain TagType type} beginning at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
599      * <p>
600      * See the {@link Tag} class documentation for more details about the behaviour of this method.
601      *
602      * @param pos the position in the source document from which to start the search, may be out of bounds.
603      * @param tagType the <code>TagType</code> to search for.
604      * @return the {@link Tag} with the specified {@linkplain TagType type} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
605      */

606     public Tag findPreviousTag(final int pos, final TagType tagType) {
607         return Tag.findPreviousOrNextTag(this,pos,tagType,true);
608     }
609     
610     /**
611      * Returns the {@link Tag} beginning at or immediately following the specified position in the source document.
612      * <p>
613      * See the {@link Tag} class documentation for more details about the behaviour of this method.
614      *
615      * @param pos the position in the source document from which to start the search, may be out of bounds.
616      * @return the {@link Tag} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
617      */

618     public Tag findNextTag(final int pos) {
619         return Tag.findPreviousOrNextTag(this,pos,false);
620     }
621
622     /**
623      * Returns the {@link Tag} of the specified {@linkplain TagType type} beginning at or immediately following the specified position in the source document.
624      * <p>
625      * See the {@link Tag} class documentation for more details about the behaviour of this method.
626      *
627      * @param pos the position in the source document from which to start the search, may be out of bounds.
628      * @param tagType the <code>TagType</code> to search for.
629      * @return the {@link Tag} with the specified {@linkplain TagType type} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
630      */

631     public Tag findNextTag(final int pos, final TagType tagType) {
632         return Tag.findPreviousOrNextTag(this,pos,tagType,false);
633     }
634
635     /**
636      * Returns the {@link Tag} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
637      * <p>
638      * See the {@link Tag} class documentation for more details about the behaviour of this method.
639      *
640      * @param pos the position in the source document, may be out of bounds.
641      * @return the {@link Tag} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within a tag or is out of bounds.
642      */

643     public Tag findEnclosingTag(final int pos) {
644         return findEnclosingTag(pos,null);
645     }
646
647     /**
648      * Returns the {@link Tag} of the specified {@linkplain TagType type} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
649      * <p>
650      * See the {@link Tag} class documentation for more details about the behaviour of this method.
651      *
652      * @param pos the position in the source document, may be out of bounds.
653      * @param tagType the <code>TagType</code> to search for.
654      * @return the {@link Tag} of the specified {@linkplain TagType type} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within a tag of the specified type or is out of bounds.
655      */

656     public Tag findEnclosingTag(final int pos, final TagType tagType) {
657         final Tag tag=findPreviousTag(pos,tagType);
658         if (tag==null || tag.end<=pos) return null;
659         return tag;
660     }
661
662     /**
663      * Returns the {@link Element} beginning at or immediately following the specified position in the source document.
664      * <p>
665      * This is equivalent to {@link #findNextStartTag(int) findNextStartTag(pos)}<code>.</code>{@link StartTag#getElement() getElement()},
666      * assuming the result is not <code>null</code>.
667      *
668      * @param pos the position in the source document from which to start the search, may be out of bounds.
669      * @return the {@link Element} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
670      */

671     public Element findNextElement(final int pos) {
672         final StartTag startTag=findNextStartTag(pos);
673         return startTag==null ? null : startTag.getElement();
674     }
675
676     /**
677      * Returns the {@link Element} with the specified {@linkplain Element#getName() name} beginning at or immediately following the specified position in the source document.
678      * <p>
679      * This is equivalent to {@link #findNextStartTag(int,String) findNextStartTag(pos,name)}<code>.</code>{@link StartTag#getElement() getElement()},
680      * assuming the result is not <code>null</code>.
681      * <p>
682      * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to
683      * {@link #findNextElement(int) findNextElement(pos)}.
684      * <p>
685      * Specifying an argument to the <code>name</code> parameter that ends in a colon (<code>:</code>) searches for all elements
686      * in the specified XML namespace.
687      * <p>
688      * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
689      *
690      * @param pos the position in the source document from which to start the search, may be out of bounds.
691      * @param name the {@linkplain Element#getName() name} of the element to search for.
692      * @return the {@link Element} with the specified {@linkplain Element#getName() name} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
693      */

694     public Element findNextElement(final int pos, String JavaDoc name) {
695         final StartTag startTag=findNextStartTag(pos,name);
696         return startTag==null ? null : startTag.getElement();
697     }
698
699     /**
700      * Returns the {@link StartTag} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
701      * <p>
702      * See the {@link Tag} class documentation for more details about the behaviour of this method.
703      *
704      * @param pos the position in the source document from which to start the search, may be out of bounds.
705      * @return the {@link StartTag} at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
706      */

707     public StartTag findPreviousStartTag(final int pos) {
708         return StartTag.findPreviousOrNext(this,pos,true);
709     }
710
711     /**
712      * Returns the {@link StartTag} with the specified {@linkplain StartTag#getName() name} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
713      * <p>
714      * See the {@link Tag} class documentation for more details about the behaviour of this method.
715      * <p>
716      * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to
717      * {@link #findPreviousStartTag(int) findPreviousStartTag(pos)}.
718      * <p>
719      * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
720      *
721      * @param pos the position in the source document from which to start the search, may be out of bounds.
722      * @param name the {@linkplain StartTag#getName() name} of the start tag to search for.
723      * @return the {@link StartTag} with the specified {@linkplain StartTag#getName() name} at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
724      */

725     public StartTag findPreviousStartTag(final int pos, String JavaDoc name) {
726         if (name!=null) name=name.toLowerCase();
727         final boolean isXMLTagName=Tag.isXMLName(name);
728         return StartTag.findPreviousOrNext(this,pos,name,isXMLTagName,true);
729     }
730
731     /**
732      * Returns the {@link StartTag} beginning at or immediately following the specified position in the source document.
733      * <p>
734      * See the {@link Tag} class documentation for more details about the behaviour of this method.
735      *
736      * @param pos the position in the source document from which to start the search, may be out of bounds.
737      * @return the {@link StartTag} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
738      */

739     public StartTag findNextStartTag(final int pos) {
740         return StartTag.findPreviousOrNext(this,pos,false);
741     }
742
743     /**
744      * Returns the {@link StartTag} with the specified {@linkplain StartTag#getName() name} beginning at or immediately following the specified position in the source document.
745      * <p>
746      * See the {@link Tag} class documentation for more details about the behaviour of this method.
747      * <p>
748      * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to
749      * {@link #findNextStartTag(int) findNextStartTag(pos)}.
750      * <p>
751      * Specifying an argument to the <code>name</code> parameter that ends in a colon (<code>:</code>) searches for all start tags
752      * in the specified XML namespace.
753      * <p>
754      * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
755      *
756      * @param pos the position in the source document from which to start the search, may be out of bounds.
757      * @param name the {@linkplain StartTag#getName() name} of the start tag to search for.
758      * @return the {@link StartTag} with the specified {@linkplain StartTag#getName() name} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
759      */

760     public StartTag findNextStartTag(final int pos, String JavaDoc name) {
761         if (name!=null) name=name.toLowerCase();
762         final boolean isXMLTagName=Tag.isXMLName(name);
763         return StartTag.findPreviousOrNext(this,pos,name,isXMLTagName,false);
764     }
765
766     /**
767      * Returns the {@link StartTag} with the specified attribute name/value pair beginning at or immediately following the specified position in the source document.
768      * <p>
769      * See the {@link Tag} class documentation for more details about the behaviour of this method.
770      *
771      * @param pos the position in the source document from which to start the search, may be out of bounds.
772      * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
773      * @param value the value of the specified attribute to search for, must not be <code>null</code>.
774      * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
775      * @return the {@link StartTag} with the specified attribute name/value pair beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
776      */

777     public StartTag findNextStartTag(final int pos, final String JavaDoc attributeName, final String JavaDoc value, final boolean valueCaseSensitive) {
778         return StartTag.findNext(this,pos,attributeName,value,valueCaseSensitive);
779     }
780
781     /**
782      * Returns the {@link EndTag} beginning at or immediately preceding the specified position in the source document.
783      * <p>
784      * See the {@link Tag} class documentation for more details about the behaviour of this method.
785      *
786      * @param pos the position in the source document from which to start the search, may be out of bounds.
787      * @return the {@link EndTag} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
788      */

789     public EndTag findPreviousEndTag(final int pos) {
790         return EndTag.findPreviousOrNext(this,pos,true);
791     }
792
793     /**
794      * Returns the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
795      * <p>
796      * See the {@link Tag} class documentation for more details about the behaviour of this method.
797      *
798      * @param pos the position in the source document from which to start the search, may be out of bounds.
799      * @param name the {@linkplain StartTag#getName() name} of the end tag to search for, must not be <code>null</code>.
800      * @return the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
801      */

802     public EndTag findPreviousEndTag(final int pos, final String JavaDoc name) {
803         if (name==null) throw new IllegalArgumentException JavaDoc("name argument must not be null");
804         return EndTag.findPreviousOrNext(this,pos,name.toLowerCase(),EndTagType.NORMAL,true);
805     }
806
807     /**
808      * Returns the {@link EndTag} beginning at or immediately following the specified position in the source document.
809      * <p>
810      * See the {@link Tag} class documentation for more details about the behaviour of this method.
811      *
812      * @param pos the position in the source document from which to start the search, may be out of bounds.
813      * @return the {@link EndTag} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
814      */

815     public EndTag findNextEndTag(final int pos) {
816         return EndTag.findPreviousOrNext(this,pos,false);
817     }
818
819     /**
820      * Returns the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} beginning at or immediately following the specified position in the source document.
821      * <p>
822      * See the {@link Tag} class documentation for more details about the behaviour of this method.
823      *
824      * @param pos the position in the source document from which to start the search, may be out of bounds.
825      * @param name the {@linkplain StartTag#getName() name} of the end tag to search for, must not be <code>null</code>.
826      * @return the {@linkplain EndTagType#NORMAL normal} {@link EndTag} with the specified {@linkplain EndTag#getName() name} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
827      */

828     public EndTag findNextEndTag(final int pos, final String JavaDoc name) {
829         return findNextEndTag(pos,name,EndTagType.NORMAL);
830     }
831
832     /**
833      * Returns the {@link EndTag} with the specified {@linkplain EndTag#getName() name} and {@linkplain EndTagType type} beginning at or immediately following the specified position in the source document.
834      * <p>
835      * See the {@link Tag} class documentation for more details about the behaviour of this method.
836      *
837      * @param pos the position in the source document from which to start the search, may be out of bounds.
838      * @param name the {@linkplain StartTag#getName() name} of the end tag to search for, must not be <code>null</code>.
839      * @param endTagType the {@linkplain EndTagType type} of the end tag to search for, must not be <code>null</code>.
840      * @return the {@link EndTag} with the specified {@linkplain EndTag#getName() name} and {@linkplain EndTagType type} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
841      */

842     public EndTag findNextEndTag(final int pos, final String JavaDoc name, final EndTagType endTagType) {
843         if (name==null) throw new IllegalArgumentException JavaDoc("name argument must not be null");
844         return EndTag.findPreviousOrNext(this,pos,name.toLowerCase(),endTagType,false);
845     }
846
847     /**
848      * Returns the most nested {@link Element} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
849      * <p>
850      * The specified position can be anywhere inside the {@linkplain Element#getStartTag() start tag}, {@linkplain Element#getEndTag() end tag},
851      * or {@linkplain Element#getContent() content} of the element. There is no requirement that the returned element has an end tag, and it
852      * may be a {@linkplain TagType#isServerTag() server tag} or HTML {@linkplain StartTagType#COMMENT comment}.
853      * <p>
854      * See the {@link Tag} class documentation for more details about the behaviour of this method.
855      *
856      * @param pos the position in the source document, may be out of bounds.
857      * @return the most nested {@link Element} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within an element or is out of bounds.
858      */

859     public Element findEnclosingElement(final int pos) {
860         return findEnclosingElement(pos,null);
861     }
862
863     /**
864      * Returns the most nested {@link Element} with the specified {@linkplain Element#getName() name} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
865      * <p>
866      * The specified position can be anywhere inside the {@linkplain Element#getStartTag() start tag}, {@linkplain Element#getEndTag() end tag},
867      * or {@linkplain Element#getContent() content} of the element. There is no requirement that the returned element has an end tag, and it
868      * may be a {@linkplain TagType#isServerTag() server tag} or HTML {@linkplain StartTagType#COMMENT comment}.
869      * <p>
870      * See the {@link Tag} class documentation for more details about the behaviour of this method.
871      * <p>
872      * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
873      *
874      * @param pos the position in the source document, may be out of bounds.
875      * @param name the {@linkplain Element#getName() name} of the element to search for.
876      * @return the most nested {@link Element} with the specified {@linkplain Element#getName() name} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
877      */

878     public Element findEnclosingElement(final int pos, String JavaDoc name) {
879         int startBefore=pos;
880         if (name!=null) name=name.toLowerCase();
881         final boolean isXMLTagName=Tag.isXMLName(name);
882         while (true) {
883             StartTag startTag=StartTag.findPreviousOrNext(this,startBefore,name,isXMLTagName,true);
884             if (startTag==null) return null;
885             Element element=startTag.getElement();
886             if (pos < element.end) return element;
887             startBefore=startTag.begin-1;
888         }
889     }
890
891     /**
892      * Returns the {@link CharacterReference} at or immediately preceding (or {@linkplain Segment#encloses(int) enclosing}) the specified position in the source document.
893      * <p>
894      * Character references positioned within an HTML {@linkplain StartTagType#COMMENT comment} are <b>NOT</b> ignored.
895      *
896      * @param pos the position in the source document from which to start the search, may be out of bounds.
897      * @return the {@link CharacterReference} beginning at or immediately preceding the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
898      */

899     public CharacterReference findPreviousCharacterReference(final int pos) {
900         return CharacterReference.findPreviousOrNext(this,pos,true);
901     }
902
903     /**
904      * Returns the {@link CharacterReference} beginning at or immediately following the specified position in the source document.
905      * <p>
906      * Character references positioned within an HTML {@linkplain StartTagType#COMMENT comment} are <b>NOT</b> ignored.
907      *
908      * @param pos the position in the source document from which to start the search, may be out of bounds.
909      * @return the {@link CharacterReference} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists or the specified position is out of bounds.
910      */

911     public CharacterReference findNextCharacterReference(final int pos) {
912         return CharacterReference.findPreviousOrNext(this,pos,false);
913     }
914
915     /**
916      * Returns the end position of the <a target="_blank" HREF="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a> that starts at the
917      * specified position.
918      * <p>
919      * This implementation first checks that the character at the specified position is a valid XML Name start character as defined by the
920      * {@link Tag#isXMLNameStartChar(char)} method. If this is not the case, the value <code>-1</code> is returned.
921      * <p>
922      * Once the first character has been checked, subsequent characters are checked using the {@link Tag#isXMLNameChar(char)} method until
923      * one is found that is not a valid XML Name character or the end of the document is reached. This position is then returned.
924      *
925      * @param pos the position in the source document of the first character of the XML Name.
926      * @return the end position of the <a target="_blank" HREF="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a> that starts at the specified position.
927      * @throws IndexOutOfBoundsException if the specified position is not within the bounds of the document.
928      */

929     public int findNameEnd(int pos) {
930         if (!Tag.isXMLNameStartChar(string.charAt(pos++))) return -1;
931         while (pos<string.length() && Tag.isXMLNameChar(string.charAt(pos))) pos++;
932         return pos;
933     }
934
935     /**
936      * Parses any {@link Attributes} starting at the specified position.
937      * This method is only used in the unusual situation where attributes exist outside of a start tag.
938      * The {@link StartTag#getAttributes()} method should be used in normal situations.
939      * <p>
940      * The returned Attributes segment always begins at <code>pos</code>,
941      * and ends at the end of the last attribute before either <code>maxEnd</code> or
942      * the first occurrence of "/&gt;" or "&gt;" outside of a quoted attribute value, whichever comes first.
943      * <p>
944      * Only returns <code>null</code> if the segment contains a major syntactical error
945      * or more than the {@linkplain Attributes#getDefaultMaxErrorCount() default maximum} number of
946      * minor syntactical errors.
947      * <p>
948      * This is equivalent to
949      * {@link #parseAttributes(int,int,int) parseAttributes}<code>(pos,maxEnd,</code>{@link Attributes#getDefaultMaxErrorCount()}<code>)</code>.
950      *
951      * @param pos the position in the source document at the beginning of the attribute list, may be out of bounds.
952      * @param maxEnd the maximum end position of the attribute list, or -1 if no maximum.
953      * @return the {@link Attributes} starting at the specified position, or <code>null</code> if too many errors occur while parsing or the specified position is out of bounds.
954      * @see StartTag#getAttributes()
955      * @see Segment#parseAttributes()
956      */

957     public Attributes parseAttributes(final int pos, final int maxEnd) {
958         return parseAttributes(pos,maxEnd,Attributes.getDefaultMaxErrorCount());
959     }
960
961     /**
962      * Parses any {@link Attributes} starting at the specified position.
963      * This method is only used in the unusual situation where attributes exist outside of a start tag.
964      * The {@link StartTag#getAttributes()} method should be used in normal situations.
965      * <p>
966      * Only returns <code>null</code> if the segment contains a major syntactical error
967      * or more than the specified number of minor syntactical errors.
968      * <p>
969      * The <code>maxErrorCount</code> argument overrides the {@linkplain Attributes#getDefaultMaxErrorCount() default maximum error count}.
970      * <p>
971      * See {@link #parseAttributes(int pos, int maxEnd)} for more information.
972      *
973      * @param pos the position in the source document at the beginning of the attribute list, may be out of bounds.
974      * @param maxEnd the maximum end position of the attribute list, or -1 if no maximum.
975      * @param maxErrorCount the maximum number of minor errors allowed while parsing.
976      * @return the {@link Attributes} starting at the specified position, or <code>null</code> if too many errors occur while parsing or the specified position is out of bounds.
977      * @see StartTag#getAttributes()
978      * @see #parseAttributes(int pos, int maxEnd)
979      */

980     public Attributes parseAttributes(final int pos, final int maxEnd, final int maxErrorCount) {
981         return Attributes.construct(this,pos,maxEnd,maxErrorCount);
982     }
983
984     /**
985      * Causes the specified range of the source text to be ignored when parsing.
986      * <p>
987      * See the documentation of the {@link Segment#ignoreWhenParsing()} method for more information.
988      *
989      * @param begin the beginning character position in the source text.
990      * @param end the end character position in the source text.
991      */

992     public void ignoreWhenParsing(final int begin, final int end) {
993         if (parseTextOutputDocument==null) {
994             parseTextOutputDocument=new OutputDocument(getParseText());
995             parseText=null;
996         }
997         parseTextOutputDocument.replaceWithSpaces(begin,end);
998     }
999
1000    /**
1001     * Causes all of the segments in the specified collection to be ignored when parsing.
1002     * <p>
1003     * This is equivalent to calling {@link Segment#ignoreWhenParsing()} on each segment in the collection.
1004     */

1005    public void ignoreWhenParsing(final Collection segments) {
1006        for (final Iterator i=segments.iterator(); i.hasNext();) {
1007            ((Segment)i.next()).ignoreWhenParsing();
1008        }
1009    }
1010
1011    /**
1012     * Reproduces the source text with indenting that represents the <a HREF="#DocumentElementHierarchy">document element hierarchy</a> of this source document.
1013     * Any indenting present in the original source text is removed.
1014     * <p>
1015     * The output text is functionally equivalent to the original source and should be rendered identically unless specified below.
1016     * <p>
1017     * The following points describe the process in general terms.
1018     * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
1019     * <p>
1020     * <ul>
1021     * <li>Every element that is not an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level element} appears on a new line
1022     * with an indent corresponding to its {@linkplain Element#getDepth() depth} in the <a HREF="#DocumentElementHierarchy">document element hierarchy</a>.
1023     * <li>The indent is formed by writing <i>n</i> repetitions of the string specified in the <code>indentText</code> argument,
1024     * where <i>n</i> is the depth of the indent.
1025     * <li>The {@linkplain Element#getContent() content} of an indented element starts on a new line and is indented at a depth one greater than that of the element,
1026     * with the end tag appearing on a new line at the same depth as the start tag.
1027     * If the content contains only text and {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements},
1028     * it may continue on the same line as the start tag. Additionally, if the output content contains no new lines, the end tag may also continue on the same line.
1029     * <li>The content of preformatted elements such as {@link HTMLElementName#PRE PRE} and {@link HTMLElementName#TEXTAREA TEXTAREA} are not indented,
1030     * nor is the white space modified in any way.
1031     * <li>Only {@linkplain StartTagType#NORMAL normal} and {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} elements are indented.
1032     * All others are treated as {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
1033     * <li>White space and indenting inside HTML {@linkplain StartTagType#COMMENT comments}, {@linkplain StartTagType#CDATA_SECTION CDATA sections}, or any
1034     * {@linkplain TagType#isServerTag() server tag} is preserved,
1035     * but with the indenting of new lines starting at a depth one greater than that of the surrounding text.
1036     * <li>White space and indenting inside {@link HTMLElementName#SCRIPT SCRIPT} elements is preserved,
1037     * but with the indenting of new lines starting at a depth one greater than that of the <code>SCRIPT</code> element.
1038     * <li>If the <code>tidyTags</code> option is used, every tag in the document is replaced with the output from its {@link Tag#tidy()} method.
1039     * If this argument is set to <code>false</code>, the tag from the original text is used, including all white space,
1040     * but with any new lines indented at a depth one greater than that of the element.
1041     * <li>If the <code>collapseWhiteSpace</code> option is used, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
1042     * located outside of a tag is replaced with a single space in the output.
1043     * White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
1044     * <li>If the <code>indentAllElements</code> option is used, every element appears indented on a new line, including
1045     * {@linkplain Element#isInline(String) inline-level elements}.
1046     * This generates output that is a good representation of the actual <a HREF="#DocumentElementHierarchy">document element hierarchy</a>,
1047     * but is very likely to introduce white space that affects the functional equivalency of the document.
1048     * <li>If the source document contains {@linkplain TagType#isServerTag() server tags}, the functional equivalency of the output document may be compromised.
1049     * </ul>
1050     * <p>
1051     * Use one of the following methods to obtain the output from the returned {@link CharStreamSource} object:<br />
1052     * {@link CharStreamSource#writeTo(Writer)}<br />
1053     * {@link CharStreamSourceUtil#toString(CharStreamSource)}<br />
1054     * {@link CharStreamSourceUtil#getReader(CharStreamSource)}
1055     *
1056     * @param indentText the string to use for each indent, must not be <code>null</code>.
1057     * @param tidyTags specifies whether to replace the original text of each tag with the output from its {@link Tag#tidy()} method.
1058     * @param collapseWhiteSpace specifies whether to collapse the white space in the text between the tags.
1059     * @param indentAllElements specifies whether to indent all elements, including {@linkplain Element#isInline(String) inline-level elements} and those with preformatted contents.
1060     * @return a {@link CharStreamSource} from which an indented copy of this source document can be obtained.
1061     */

1062    public CharStreamSource indent(final String JavaDoc indentText, final boolean tidyTags, final boolean collapseWhiteSpace, final boolean indentAllElements) {
1063        return new Indent(this,indentText,tidyTags,collapseWhiteSpace,indentAllElements);
1064    }
1065
1066    /**
1067     * Returns the destination <code>Writer</code> for log messages.
1068     * <p>
1069     * By default, the log writer is set to <code>null</code>, which suppresses log messages.
1070     *
1071     * @return the destination <code>Writer</code> for log messages.
1072     */

1073    public Writer getLogWriter() {
1074        return logWriter;
1075    }
1076
1077    /**
1078     * Sets the destination <code>Writer</code> for log messages.
1079     * <p>
1080     * When required, this method should normally be called immediately after the construction of the <code>Source</code> object.
1081     *
1082     * @param writer the destination <code>java.io.Writer</code> for log messages.
1083     * @see #getLogWriter()
1084     */

1085    public void setLogWriter(final Writer writer) {
1086        logWriter=writer;
1087    }
1088
1089    /**
1090     * Indicates whether logging is currently enabled.
1091     * <p>
1092     * The current implementation of this method is equivalent to {@link #getLogWriter()}<code>!=null</code>.
1093     * <p>
1094     * For best performance you should check that this method returns <code>true</code> before constructing the string to send to
1095     * the {@link #log(String message)} method.
1096     *
1097     * @return <code>true</code> if logging is currently enabled, otherwise <code>false</code>.
1098     */

1099    public boolean isLoggingEnabled() {
1100        return logWriter!=null;
1101    }
1102
1103    /**
1104     * Writes the specified message to the log.
1105     * <p>
1106     * The log destination is set via the {@link #setLogWriter(Writer)} method.
1107     * By default, log messages are not sent anywhere.
1108     * <p>
1109     * A newline character is added to the message and the <code>Writer</code> is flushed after every call to this method.
1110     * <p>
1111     * If an <code>IOException</code> is thrown while writing to the log, this method throws a <code>RuntimeException</code> with
1112     * the original <code>IOException</code> as its cause.
1113     *
1114     * @param message the message to log
1115     */

1116    public void log(final String JavaDoc message) {
1117        if (logWriter==null) return;
1118        try {
1119            logWriter.write(message);
1120            logWriter.write('\n');
1121            logWriter.flush();
1122        } catch (IOException ex) {
1123            throw new RuntimeException JavaDoc(ex);
1124        }
1125    }
1126
1127    /**
1128     * Clears the {@linkplain #getCacheDebugInfo() tag cache} of all tags.
1129     * <p>
1130     * This method may be useful after calling the {@link Segment#ignoreWhenParsing()} method so that any tags previously found within the ignored segments
1131     * will no longer be returned by the <a HREF="Tag.html#TagSearchMethods">tag search methods</a>.
1132     */

1133    public void clearCache() {
1134        cache.clear();
1135        allTagsArray=null;
1136        allTags=null;
1137        allStartTags=null;
1138        allElements=null;
1139    }
1140
1141    /**
1142     * Returns a string representation of the tag cache, useful for debugging purposes.
1143     * @return a string representation of the tag cache, useful for debugging purposes.
1144     */

1145    public String JavaDoc getCacheDebugInfo() {
1146        return cache.toString();
1147    }
1148
1149    /**
1150     * Gets a list of all the tags that have been parsed so far.
1151     * <p>
1152     * This information may be useful for debugging purposes.
1153     * Execution of this method collects information from the internal cache and is relatively expensive.
1154     *
1155     * @return a list of all the tags that have been parsed so far.
1156     * @see #getCacheDebugInfo()
1157     */

1158    List getParsedTags() {
1159        final ArrayList list=new ArrayList();
1160        for (final Iterator i=cache.getTagIterator(); i.hasNext();) list.add(i.next());
1161        return list;
1162    }
1163
1164    /**
1165     * Returns the {@linkplain ParseText parse text} of this source document.
1166     * <p>
1167     * This method is normally only of interest to users who wish to create <a HREF="TagType.html#Custom">custom tag types</a>.
1168     * <p>
1169     * The parse text is defined as the entire text of the source document in lower case, with all
1170     * {@linkplain Segment#ignoreWhenParsing() ignored} segments replaced by space characters.
1171     *
1172     * @return the {@linkplain ParseText parse text} of this source document.
1173     */

1174    public final ParseText getParseText() {
1175        if (parseText==null) {
1176            if (parseTextOutputDocument!=null) {
1177                parseText=new ParseText(parseTextOutputDocument);
1178                parseTextOutputDocument=null;
1179            } else {
1180                parseText=new ParseText(this);
1181            }
1182        }
1183        return parseText;
1184    }
1185
1186    /**
1187     * Returns the {@link StartTag} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
1188     * <p>
1189     * This method has been deprecated as of version 2.0 in favour of the more generic {@link #findEnclosingTag(int pos)} method.
1190     * <p>
1191     * Caveat - The returned tag from {@link #findEnclosingTag(int pos)} may be an instance of {@link EndTag}.
1192     * In most cases this should be interpreted in the same way as if this method returned a <code>null</code>,
1193     * since an end tag normally does not exist inside of a start tag.
1194     * There is however one situation where this may occur legitimately, where a {@linkplain TagType#isServerTag() server-side} end tag
1195     * appears within a normal start tag.
1196     * It is up to the developer to decide whether this situation requires special handling when updating code that uses this
1197     * deprecated method.
1198     *
1199     * @param pos the position in the source document.
1200     * @return the {@link StartTag} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within a start tag.
1201     * @deprecated Use {@link #findEnclosingTag(int pos)} instead. (see caveat)
1202     */

1203    public StartTag findEnclosingStartTag(final int pos) {
1204        final StartTag startTag=findPreviousStartTag(pos);
1205        if (startTag==null || startTag.end<=pos) return null;
1206        return startTag;
1207    }
1208
1209    /**
1210     * Returns the {@link StartTag} object representing the HTML {@linkplain StartTagType#COMMENT comment} beginning at or immediately following the specified position in the source document.
1211     * <p>
1212     * This method has been deprecated as of version 2.0 in favour of the more generic {@link #findNextTag(int pos, TagType)} method.
1213     *
1214     * @param pos the position in the source document from which to start the search.
1215     * @return the {@link StartTag} object representing the HTML {@linkplain StartTagType#COMMENT comment} beginning at or immediately following the specified position in the source document, or <code>null</code> if none exists.
1216     * @deprecated Use {@link #findNextTag(int,TagType) findNextTag}<code>(pos,</code>{@link StartTagType#COMMENT}<code>)</code> instead.
1217     */

1218    public StartTag findNextComment(final int pos) {
1219        return (StartTag)findNextTag(pos,StartTagType.COMMENT);
1220    }
1221
1222    /**
1223     * Returns the <code>Segment</code> object representing the HTML {@linkplain StartTagType#COMMENT comment} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document.
1224     * <p>
1225     * This method has been deprecated as of version 2.0 in favour of the more generic {@link #findEnclosingTag(int pos, TagType)} method.
1226     *
1227     * @param pos the position in the source document.
1228     * @return the <code>Segment</code> object representing the HTML {@linkplain StartTagType#COMMENT comment} that {@linkplain Segment#encloses(int) encloses} the specified position in the source document, or <code>null</code> if the position is not within a comment.
1229     * @deprecated Use {@link #findEnclosingTag(int,TagType) findEnclosingTag}<code>(pos,</code>{@link StartTagType#COMMENT}<code>)</code> instead.
1230     */

1231    public Segment findEnclosingComment(final int pos) {
1232        return findEnclosingTag(pos,StartTagType.COMMENT);
1233    }
1234
1235    /**
1236     * Returns an iterator of {@link Tag} objects beginning at and following the specified position in the source document.
1237     * <p>
1238     * This method has been deprecated as of version 2.2 as it was originally only included because it was more efficient than
1239     * consecutive calls to {@link #findNextTag(int pos)}.
1240     * The most efficient replacement is to use multiple calls to {@link Tag#findNextTag()} if a {@linkplain #fullSequentialParse() full sequential parse} was performed,
1241     * otherwise use {@link #findAllTags()}<code>.iterator()</code> and skip over the tags that begin before
1242     * the position specified in the <code>pos</code> argument of this method.
1243     *
1244     * @param pos the position in the source document from which to start the iteration.
1245     * @return an iterator of {@link Tag} objects beginning at and following the specified position in the source document.
1246     * @deprecated Use {@link #findAllTags()}<code>.iterator()</code> instead, or multiple calls to the {@link Tag#findNextTag()} method.
1247     */

1248    public Iterator getNextTagIterator(final int pos) {
1249        return Tag.getNextTagIterator(this,pos);
1250    }
1251
1252    static String JavaDoc getCharsetParameterFromHttpHeaderValue(final String JavaDoc httpHeaderValue) {
1253        final int charsetParameterPos=httpHeaderValue.toLowerCase().indexOf("charset=");
1254        if (charsetParameterPos==-1) return null;
1255        final int charsetBegin=charsetParameterPos+8;
1256        int charsetEnd=httpHeaderValue.indexOf(';',charsetBegin);
1257        final String JavaDoc charset=(charsetEnd==-1) ? httpHeaderValue.substring(charsetBegin) : httpHeaderValue.substring(charsetBegin,charsetEnd);
1258        return charset.trim();
1259    }
1260}
1261
Popular Tags