TagType


1   // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2   // Version 2.2
3   // Copyright (C) 2006 Martin Jericho
4   // http://sourceforge.net/projects/jerichohtml/
5   //
6   // This library is free software; you can redistribute it and/or
7   // modify it under the terms of the GNU Lesser General Public
8   // License as published by the Free Software Foundation; either
9   // version 2.1 of the License, or (at your option) any later version.
10  // http://www.gnu.org/copyleft/lesser.html
11  //
12  // This library is distributed in the hope that it will be useful,
13  // but WITHOUT ANY WARRANTY; without even the implied warranty of
14  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  // Lesser General Public License for more details.
16  //
17  // You should have received a copy of the GNU Lesser General Public
18  // License along with this library; if not, write to the Free Software
19  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  
21  package au.id.jericho.lib.html;
22  
23  import java.util.*;
24  
25  /**
26   * Defines the syntax for a tag type that can be recognised by the parser.
27   * <p>
28   * This class is the root abstract class common to all tag types, and contains methods to {@linkplain #register() register}
29   * and {@linkplain #deregister() deregister} tag types as well as various methods to aid in their implementation.
30   * <p>
31   * Every tag type is represented by an instance of a class (usually a singleton) that must be a subclass of either 
32   * {@link StartTagType} or {@link EndTagType}.  These two abstract classes, the only direct descendants of this class,
33   * represent the two major classifications under which every tag type exists.
34   * <p>
35   * The term <i><a name="Predefined">predefined tag type</a></i> refers to any of the tag types defined in this library,
36   * including both <a HREF="#Standard">standard</a> and <a HREF="#Extended">extended</a> tag types.
37   * <p>
38   * The term <i><a name="Standard">standard tag type</a></i> refers to any of the tag types represented by instances
39   * in static fields of the {@link StartTagType} and {@link EndTagType} subclasses.
40   * Standard tag types are registered by default, and define the tags most commonly found in HTML documents.
41   * <p>
42   * The term <i><a name="Extended">extended tag type</a></i> refers to any <a HREF="#Predefined">predefined</a> tag type
43   * that is not a <a HREF="#Standard">standard</a> tag type.
44   * The {@link PHPTagTypes} and {@link MasonTagTypes} classes contain extended tag types related to their respective server platforms.
45   * The tag types defined within them must be registered by the user before they are recognised by the parser.
46   * <p>
47   * The term <i><a name="Custom">custom tag type</a></i> refers to any user-defined tag type, or any tag type that is
48   * not a <a HREF="#Predefined">predefined</a> tag type.
49   * <p>
50   * The tag recognition process of the parser gives each tag type a <i><a name="Precedence">precedence</a></i> level,
51   * which is primarily determined by the length of its {@linkplain #getStartDelimiter() start delimiter}.
52   * A tag type with a more specific start delimiter is chosen in preference to one with a less specific start delimiter,
53   * assuming they both share the same prefix.  If two tag types have exactly the same start delimiter, the one which was
54   * {@linkplain #register() registered} later has the higher precedence.
55   * <p>
56   * The two special tag types {@link StartTagType#UNREGISTERED} and {@link EndTagType#UNREGISTERED} represent
57   * tags that do not match the syntax of any other tag type.  They have the lowest <a HREF="#Precedence">precedence</a> 
58   * of all the tag types.  The {@link Tag#isUnregistered()} method provides a detailed explanation of unregistered tags.
59   * <p>
60   * See the documentation of the <a HREF="Tag.html#ParsingProcess">tag parsing process</a> for more information
61   * on how each tag is identified by the parser.
62   * <p>
63   * Note that the standard {@linkplain HTMLElementName HTML element names} do not represent different
64   * tag <i>types</i>.  All standard HTML tags have a tag type of {@link StartTagType#NORMAL} or {@link EndTagType#NORMAL}.
65   * <p>
66   * Apart from the <a HREF="#Registration">registration related</a> methods, all of the methods in this class and its
67   * subclasses relate to the implementation of <a HREF="#Custom">custom tag types</a> and are not relevant to the majority of users 
68   * who just use the <a HREF="#Predefined">predefined tag types</a>.
69   * <p>
70   * For performance reasons, this library only allows tag types that {@linkplain #getStartDelimiter() start}
71   * with a '<code>&lt;</code>' character.
72   * The character following this defines the immediate subclass of the tag type.
73   * An {@link EndTagType} always has a slash ('<code>/</code>') as the second character, while a {@link StartTagType}
74   * has any character other than a slash as the second character.
75   * This definition means that tag types which are not intuitively classified as either start tag types or end tag types
76   * (such as an HTML {@linkplain StartTagType#COMMENT comment}) are mostly classified as start tag types.
77   * <p>
78   * Every method in this and the {@link StartTagType} and {@link EndTagType} abstract classes can be categorised
79   * as one of the following:
80   * <dl>
81   *  <dt><a name="Property">Properties:</a>
82   *   <dd>Simple properties (marked final) that were either specified as parameters
83   *    during construction or are derived from those parameters.
84   *  <dt><a name="AbstractImplementation">Abstract implementation methods:</a>
85   *   <dd>Methods that must be implemented in a subclass.
86   *  <dt><a name="DefaultImplementation">Default implementation methods:</a>
87   *   <dd>Methods (not marked final) that implement common behaviour, but may be overridden in a subclass.
88   *  <dt><a name="ImplementationAssistance">Implementation assistance methods:</a>
89   *   <dd>Protected methods that provide low-level functionality and are only of use within other implementation methods.
90   *  <dt><a name="RegistrationRelated">Registration related methods:</a>
91   *   <dd>Utility methods (marked final) relating to the {@linkplain #register() registration} of tag type instances.
92   * </dl>
93   */
94  public abstract class TagType {
95      private final String   description; // human-readable description, exposed through getDescription()
96      private final String   startDelimiter; // character sequence that marks the start of the tag
97      private final char[] startDelimiterCharArray; // startDelimiter.toCharArray(), computed once in the constructor
98      private final String   closingDelimiter; // character sequence that marks the end of the tag
99      private final boolean isServerTag; // true if this tag type represents a server tag
100     private final String   namePrefix; // startDelimiter with the leading startDelimiterPrefix removed
101     final String   startDelimiterPrefix; // either "<" or "</"; package-visible rather than private
102 
/**
 * Stores the defining properties of a tag type and derives the remaining ones from them.
 *
 * @param description  the description returned by {@link #getDescription()}.
 * @param startDelimiter  the character sequence that marks the start of the tag.
 * @param closingDelimiter  the character sequence that marks the end of the tag.
 * @param isServerTag  whether this tag type represents a server tag.
 * @param startDelimiterPrefix  either "&lt;" or "&lt;/"; the part of the start delimiter that precedes the name prefix.
 */
TagType(final String description, final String startDelimiter, final String closingDelimiter, final boolean isServerTag, final String startDelimiterPrefix) {
    // Plain properties are stored exactly as supplied.
    this.isServerTag=isServerTag;
    this.description=description;
    this.closingDelimiter=closingDelimiter;
    this.startDelimiter=startDelimiter;
    // Keep a char[] form of the start delimiter alongside the String form.
    this.startDelimiterCharArray=startDelimiter.toCharArray();
    // startDelimiterPrefix is either "<" or "</"; whatever follows it in the start delimiter is the name prefix.
    this.startDelimiterPrefix=startDelimiterPrefix;
    this.namePrefix=startDelimiter.substring(startDelimiterPrefix.length());
}
113 
114     /**
115      * Registers this tag type for recognition by the parser.
116      * <br />(<a HREF="TagType.html#RegistrationRelated">registration related</a> method)
117      * <p>
118      * The order of registration affects the <a HREF="TagType.html#Precedence">precedence</a> of the tag type when a potential tag is being parsed.
119      *
120      * @see #deregister()
121      */
122     public final void register() {
123         TagTypeRegister.add(this); // hand this instance to the register so that the parser can recognise it
124     }
125     
126     /**
127      * Deregisters this tag type.
128      * <br />(<a HREF="TagType.html#RegistrationRelated">registration related</a> method)
129      *
130      * @see #register()
131      */
132     public final void deregister() {
133         TagTypeRegister.remove(this); // counterpart of register(): take this instance out of the register
134     }
135 
136     /**
137      * Returns a list of all the currently registered tag types in order of lowest to highest <a HREF="TagType.html#Precedence">precedence</a>.
138      * <br />(<a HREF="TagType.html#RegistrationRelated">registration related</a> method)
139      * @return a list of all the currently registered tag types in order of lowest to highest <a HREF="TagType.html#Precedence">precedence</a>.
140      */
141     public static final List getRegisteredTagTypes() {
142         return TagTypeRegister.getList();
143     }
144 
145     /**
146      * Returns a description of this tag type useful for debugging purposes. 
147      * <br />(<a HREF="TagType.html#Property">property</a> method)
148      *
149      * @return a description of this tag type useful for debugging purposes.
150      */
151     public final String   getDescription() {
152         return description;
153     }
154 
155     /**
156      * Returns the character sequence that marks the start of the tag.
157      * <br />(<a HREF="TagType.html#Property">property</a> method)
158      * <p>
159      * The character sequence must be all in lower case.
160      * <p>
161      * The first character in this property <b>must</b> be '<code>&lt;</code>'.
162      * This is a deliberate limitation of the system which is necessary to retain reasonable performance.
163      * <p>
164      * The second character in this property must be '<code>/</code>' if the implementing class is an {@link EndTagType}.
165      * It must <b>not</b> be '<code>/</code>' if the implementing class is a {@link StartTagType}.
166      * <p>
167      * <dl>
168      *  <dt>Standard Tag Type Values:</dt>
169      *   <dd>
170      *    <table class="bordered" style="margin: 15px" cellspacing="0">
171      *     <tr><th>Tag Type<th>Start Delimiter
172      *     <tr><td>{@link StartTagType#UNREGISTERED}<td><code>&lt;</code>
173      *     <tr><td>{@link StartTagType#NORMAL}<td><code>&lt;</code>
174      *     <tr><td>{@link StartTagType#COMMENT}<td><code>&lt;!--</code>
175      *     <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>&lt;?xml</code>
176      *     <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>&lt;?</code>
177      *     <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>&lt;!doctype</code>
178      *     <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>&lt;!</code>
179      *     <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>&lt;![cdata[</code>
180      *     <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>&lt;%</code>
181      *     <tr><td>{@link EndTagType#UNREGISTERED}<td><code>&lt;/</code>
182      *     <tr><td>{@link EndTagType#NORMAL}<td><code>&lt;/</code>
183      *    </table>
184      * </dl>
185      * <dl>
186      *  <dt>Extended Tag Type Values:</dt>
187      *   <dd>
188      *    <table class="bordered" style="margin: 15px" cellspacing="0">
189      *     <tr><th>Tag Type<th>Start Delimiter
190      *     <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>&lt;script</code>
191      *     <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>&lt;?</code>
192      *     <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>&lt;?php</code>
193      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>&lt;&amp;</code>
194      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>&lt;&amp;|</code>
195      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>&lt;/&amp;</code>
196      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>&lt;%</code>
197      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>&lt;/%</code>
198      *    </table>
199      * </dl>
200      *
201      * @return the character sequence that marks the start of the tag.
202      */
203     public final String   getStartDelimiter() {
204         return startDelimiter;
205     }
206 
207     /**
208      * Returns the character sequence that marks the end of the tag.
209      * <br />(<a HREF="TagType.html#Property">property</a> method)
210      * <p>
211      * The character sequence must be all in lower case.
212      * <p>
213      * In a {@link StartTag} of a {@linkplain StartTagType type} that {@linkplain StartTagType#hasAttributes() has attributes},
214      * characters appearing inside a quoted attribute value are ignored when determining the location of the closing delimiter.
215      * <p>
216      * Note that the optional '<code>/</code>' character preceding the closing '<code>&gt;</code>' in an
217      * {@linkplain StartTag#isEmptyElementTag() empty-element tag} is not considered part of the end delimiter.
218      * This property must define the closing delimiter common to all instances of the tag type.
219      * <p>
220      * <dl>
221      *  <dt>Standard Tag Type Values:</dt>
222      *   <dd>
223      *    <table class="bordered" style="margin: 15px" cellspacing="0">
224      *     <tr><th>Tag Type<th>Closing Delimiter
225      *     <tr><td>{@link StartTagType#UNREGISTERED}<td><code>&gt;</code>
226      *     <tr><td>{@link StartTagType#NORMAL}<td><code>&gt;</code>
227      *     <tr><td>{@link StartTagType#COMMENT}<td><code>--&gt;</code>
228      *     <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>?&gt;</code>
229      *     <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>?&gt;</code>
230      *     <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>&gt;</code>
231      *     <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>&gt;</code>
232      *     <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>]]&gt;</code>
233      *     <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>%&gt;</code>
234      *     <tr><td>{@link EndTagType#UNREGISTERED}<td><code>&gt;</code>
235      *     <tr><td>{@link EndTagType#NORMAL}<td><code>&gt;</code>
236      *    </table>
237      * </dl>
238      * <dl>
239      *  <dt>Extended Tag Type Values:</dt>
240      *   <dd>
241      *    <table class="bordered" style="margin: 15px" cellspacing="0">
242      *     <tr><th>Tag Type<th>Closing Delimiter
243      *     <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>&gt;</code>
244      *     <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>?&gt;</code>
245      *     <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>?&gt;</code>
246      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>&amp;&gt;</code>
247      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>&amp;&gt;</code>
248      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>&gt;</code>
249      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>&gt;</code>
250      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>&gt;</code>
251      *    </table>
252      * </dl>
253      *
254      * @return the character sequence that marks the end of the tag.
255      */
256     public final String   getClosingDelimiter() {
257         return closingDelimiter;
258     }
259 
260     /**
261      * Indicates whether this tag type represents a server tag.
262      * <br />(<a HREF="TagType.html#Property">property</a> method)
263      * <p>
264      * Server tags are typically parsed by some process on the web server and substituted with other text or markup before delivery to the
265      * <a target="_blank" HREF="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>.
266      * This parser therefore handles them differently to non-server tags in that they can occur at any position in the document
267      * without regard for the HTML document structure.  As a result they can occur anywhere inside any other tag and vice versa.
268      * <p>
269      * To avoid the problem of server tags interfering with the proper parsing of the rest of the document, the
270      * {@link Segment#ignoreWhenParsing()} method can be called on all server tags found in the document before parsing the non-server tags.
271      * <p>
272      * The documentation of the <a HREF="Tag.html#ParsingProcess">tag parsing process</a> explains in detail 
273      * how the value of this property affects the recognition of a tag.
274      * <p>
275      * <dl>
276      *  <dt>Standard Tag Type Values:</dt>
277      *   <dd>
278      *    <table class="bordered" style="margin: 15px" cellspacing="0">
279      *     <tr><th>Tag Type<th>Is Server Tag
280      *     <tr><td>{@link StartTagType#UNREGISTERED}<td><code>false</code>
281      *     <tr><td>{@link StartTagType#NORMAL}<td><code>false</code>
282      *     <tr><td>{@link StartTagType#COMMENT}<td><code>false</code>
283      *     <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>false</code>
284      *     <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>false</code>
285      *     <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>false</code>
286      *     <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>false</code>
287      *     <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>false</code>
288      *     <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>true</code>
289      *     <tr><td>{@link EndTagType#UNREGISTERED}<td><code>false</code>
290      *     <tr><td>{@link EndTagType#NORMAL}<td><code>false</code>
291      *    </table>
292      * </dl>
293      * <dl>
294      *  <dt>Extended Tag Type Values:</dt>
295      *   <dd>
296      *    <table class="bordered" style="margin: 15px" cellspacing="0">
297      *     <tr><th>Tag Type<th>Is Server Tag
298      *     <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>true</code>
299      *     <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>true</code>
300      *     <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>true</code>
301      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>true</code>
302      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>true</code>
303      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>true</code>
304      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>true</code>
305      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>true</code>
306      *    </table>
307      * </dl>
308      *
309      * @return <code>true</code> if this tag type represents a server tag, otherwise <code>false</code>.
310      */
311     public final boolean isServerTag() {
312         return isServerTag;
313     }
314 
315     /**
316      * Returns the {@linkplain Tag#getName() name} prefix required by this tag type.
317      * <br />(<a HREF="TagType.html#Property">property</a> method)
318      * <p>
319      * This string is identical to the {@linkplain #getStartDelimiter() start delimiter}, except that it does not include the
320      * initial "<code>&lt;</code>" or "<code>&lt;/</code>" characters that always prefix the start delimiter of a
321      * {@link StartTagType} or {@link EndTagType} respectively.
322      * <p>
323      * The {@linkplain Tag#getName() name} of a tag of this type may or may not include extra characters after the prefix.
324      * This is determined by properties such as {@link StartTagType#isNameAfterPrefixRequired()}
325      * or {@link EndTagTypeGenericImplementation#isStatic()}. 
326      * <p>
327      * <dl>
328      *  <dt>Standard Tag Type Values:</dt>
329      *   <dd>
330      *    <table class="bordered" style="margin: 15px" cellspacing="0">
331      *     <tr><th>Tag Type<th>Name Prefix
332      *     <tr><td>{@link StartTagType#UNREGISTERED}<td><i>(empty string)</i>
333      *     <tr><td>{@link StartTagType#NORMAL}<td><i>(empty string)</i>
334      *     <tr><td>{@link StartTagType#COMMENT}<td><code>!--</code>
335      *     <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>?xml</code>
336      *     <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>?</code>
337      *     <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>!doctype</code>
338      *     <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>!</code>
339      *     <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>![cdata[</code>
340      *     <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>%</code>
341      *     <tr><td>{@link EndTagType#UNREGISTERED}<td><i>(empty string)</i>
342      *     <tr><td>{@link EndTagType#NORMAL}<td><i>(empty string)</i>
343      *    </table>
344      * </dl>
345      * <dl>
346      *  <dt>Extended Tag Type Values:</dt>
347      *   <dd>
348      *    <table class="bordered" style="margin: 15px" cellspacing="0">
349      *     <tr><th>Tag Type<th>Name Prefix
350      *     <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>script</code>
351      *     <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>?</code>
352      *     <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>?php</code>
353      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>&amp;</code>
354      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>&amp;|</code>
355      *     <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>&amp;</code>
356      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>%</code>
357      *     <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>%</code>
358      *    </table>
359      * </dl>
360      *
361      * @return the {@linkplain Tag#getName() name} prefix required by this tag type.
362      * @see #getStartDelimiter()
363      */
364     protected final String   getNamePrefix() {
365         return namePrefix;
366     }
367 
368     /**
369      * Indicates whether a tag of this type is valid in the specified position of the specified source document.
370      * <br />(<a HREF="TagType.html#ImplementationAssistance">implementation assistance</a> method)
371      * <p>
372      * This method is called immediately before {@link #constructTagAt(Source, int pos)}
373      * to do a preliminary check on the validity of a tag of this type in the specified position.
374      * <p>
375      * This check is not performed as part of the {@link #constructTagAt(Source, int pos)} call because the same
376      * validation is used for all the <a HREF="TagType.html#Standard">standard</a> tag types, and is likely to be sufficient
377      * for all <a HREF="TagType.html#Custom">custom tag types</a>.
378      * Having this check separated into a different method helps to isolate common code from the code that is unique to each tag type.
379      * <p>
380      * In theory, a {@linkplain TagType#isServerTag() server tag} is valid in any position, but a non-server tag is not valid inside another non-server tag.
381      * <p>
382      * The common implementation of this method always returns <code>true</code> for server tags, but for non-server tags it behaves slightly differently
383      * depending upon whether or not a {@linkplain Source#fullSequentialParse() full sequential parse} is being performed.
384      * If so, it implements the exact theoretical check and rejects a non-server tag if it is inside any other non-server tag.
385      * If a full sequential parse was not performed (i.e. in <a HREF="Source.html#ParseOnDemand">parse on demand</a> mode),
386      * practical constraints do not permit the implementation of the exact theoretical check, and non-server tags are only rejected 
387      * if they are found inside HTML {@linkplain StartTagType#COMMENT comments} or {@linkplain StartTagType#CDATA_SECTION CDATA sections}.
388      * <p>
389      * This behaviour is configurable by manipulating the static {@link TagType#getTagTypesIgnoringEnclosedMarkup() TagTypesIgnoringEnclosedMarkup} array
390      * to determine which tag types can not contain non-server tags.
391      * The {@linkplain TagType#getTagTypesIgnoringEnclosedMarkup() documentation of this property} contains
392      * a more detailed analysis of the subject and explains why only the {@linkplain StartTagType#COMMENT comment} and 
393      * {@linkplain StartTagType#CDATA_SECTION CDATA section} tag types are included by default.
394      * <p>
395      * See the documentation of the <a HREF="Tag.html#ParsingProcess">tag parsing process</a> for more information about how this method fits into the whole tag parsing process.
396      * <p>
397      * This method can be overridden in <a HREF="TagType.html#Custom">custom tag types</a> if the default implementation is unsuitable.
398      *
399      * @param source  the {@link Source} document.
400      * @param pos  the character position in the source document to check.
401      * @return <code>true</code> if a tag of this type is valid in the specified position of the specified source document, otherwise <code>false</code>.
402      */
403     protected boolean isValidPosition(final Source source, final int pos) {
404         if (isServerTag()) return true;
405         if (source.endOfLastTagIgnoringEnclosedMarkup!=-1) {
406             // use simplified check when doing full sequential parse.  Normally we are only able to check whether a tag is inside specially cached
407             // tag types for efficiency reasons, but during a full sequential parse we can reject a tag if it is inside normal tags as well.
408             return pos>=source.endOfLastTagIgnoringEnclosedMarkup;
409         }
410         // Use the normal method of checking whether the position is inside a tag of a tag type that ignores enclosed markup:
411         final TagType[] tagTypesIgnoringEnclosedMarkup=getTagTypesIgnoringEnclosedMarkup();
412         for (int i=0; i<tagTypesIgnoringEnclosedMarkup.length; i++)
413             if (tagTypesIgnoringEnclosedMarkup[i].tagEncloses(source,pos)) return false;
414         return true;
415     }
416 
417     /**
418      * Returns an array of all the tag types inside which the parser ignores all other non-{@linkplain #isServerTag() server} tags
419      * in <a HREF="Source.html#ParseOnDemand">parse on demand</a> mode.
420      * <br />(<a HREF="TagType.html#ImplementationAssistance">implementation assistance</a> method)
421      * <p>
422      * The tag types returned by this property (referred to in the following paragraphs as the "listed types") default to
423      * {@link StartTagType#COMMENT} and {@link StartTagType#CDATA_SECTION}.
424      * <p>
425      * In <a HREF="Source.html#ParseOnDemand">parse on demand</a> mode,
426      * every new non-server tag found by the parser (referred to as a "new tag") undergoes a check to see whether it is enclosed
427      * by a tag of one of the listed types, including new tags of the listed types themselves.
428      * The recursive nature of this check means that <i>all</i> tags of the listed types occurring before the new tag must be found 
429      * by the parser before it can determine whether the new tag should be ignored.
430      * To mitigate any performance issues arising from this process, the listed types are given special treatment in the tag cache.
431      * This dramatically decreases the time taken to search on these tag types, so adding a tag type to this array that 
432      * is easily recognised and occurs infrequently only results in a small degradation in overall performance.
433      * <p>
434      * Theoretically, non-server tags appearing inside <i>any</i> other non-server tag should be ignored.
435      * One situation where a tag can legitimately contain a sequence of characters that resembles a tag,
436      * which shouldn't be recognised as a tag by the parser, is within an attribute value.
437      * The <a target="_blank" HREF="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>
438      * specifically allows the presence of '<code>&lt;</code>' and '<code>&gt;</code>' characters within attribute values.
439      * A common occurrence of this is in <a target="_blank" HREF="http://www.w3.org/TR/html401/interact/scripts.html#events">event</a>
440      * attributes such as <code><a target="_blank" HREF="http://www.w3.org/TR/html401/interact/scripts.html#adef-onclick">onclick</a></code>,
441      * which contain scripts that often dynamically load new HTML into the document
442      * (see the file <code><a target="_blank" HREF="../../../../../../../samples/data/Test.html#TagInsideTag">samples/data/Test.html</a></code> for an example).
443      * <p>
444      * Performing a {@linkplain Source#fullSequentialParse() full sequential parse} of the source document prevents these attribute values from being
445      * recognised as tags, but can be very expensive if only a few tags in the document need to be parsed.
446      * The penalty of not parsing every tag in the document is that the exactness of this check is compromised, but in practical terms the difference is inconsequential.
447      * The default listed types of {@linkplain StartTagType#COMMENT comments} and {@linkplain StartTagType#CDATA_SECTION CDATA sections} yield sensible results 
448      * in the vast majority of practical applications with only a minor impact on performance.
449      * <p>
450      * In <a target="_blank" HREF="http://www.w3.org/TR/xhtml1/">XHTML</a>, '<code>&lt;</code>' and '<code>&gt;</code>' characters 
451      * must be represented in attribute values as {@linkplain CharacterReference character references}
452      * (see the XML 1.0 specification section <a target="_blank" HREF="http://www.w3.org/TR/REC-xml#CleanAttrVals">3.1</a>),
453      * so the situation should never arise that a tag is found inside another tag unless one of them is a
454      * {@linkplain #isServerTag() server tag}.
455      * <p>
456      * This method is called from the default implementation of the {@link #isValidPosition(Source, int pos)} method.
457      *
458      * @return an array of all the tag types inside which the parser ignores all other non-{@linkplain #isServerTag() server} tags.
459      */
460     public static final TagType[] getTagTypesIgnoringEnclosedMarkup() {
461         return TagTypesIgnoringEnclosedMarkup.array;
462     }
463 
464     /**
465      * Sets the tag types inside which the parser ignores all other non-{@linkplain #isServerTag() server} tags.
466      * <br />(<a HREF="TagType.html#ImplementationAssistance">implementation assistance</a> method)
467      * <p>
468      * See {@link #getTagTypesIgnoringEnclosedMarkup()} for the documentation of this property.
469      *
470      * @param tagTypes  an array of tag types.
471      */
472     public static final void setTagTypesIgnoringEnclosedMarkup(TagType[] tagTypes) {
473         if (tagTypes==null) throw new IllegalArgumentException  ();
474         TagTypesIgnoringEnclosedMarkup.array=tagTypes;
475     }
476 
477     /**
478      * Constructs a tag of this type at the specified position in the specified source document if it matches all of the required features.
479      * <br />(<a HREF="TagType.html#AbstractImplementation">abstract implementation</a> method)
480      * <p>
481      * The implementation of this method must check that the text at the specified position meets all of
482      * the criteria of this tag type, including such checks as the presence of the correct or well formed
483      * {@linkplain #getClosingDelimiter() closing delimiter}, {@linkplain Tag#getName() name}, {@linkplain Attributes attributes},
484      * {@linkplain EndTag end tag}, or any other distinguishing features.
485      * <p>
486      * It can be assumed that the specified position starts with the {@linkplain #getStartDelimiter() start delimiter} of this tag type,
487      * and that all other tag types with higher <a HREF="TagType.html#Precedence">precedence</a> (if any) have already been rejected as candidates.
488      * Tag types with lower precedence will be considered if this method returns <code>null</code>.
489      * <p>
490      * This method is only called after a successful check of the tag's position, i.e.
491      * {@link #isValidPosition(Source,int) isValidPosition(source,pos)}<code>==true</code>.
492      * <p>
493      * The {@link StartTagTypeGenericImplementation} and {@link EndTagTypeGenericImplementation} subclasses provide default
494      * implementations of this method that allow the use of much simpler <a HREF="TagType.html#Property">properties</a> and
495      * <a HREF="TagType.html#ImplementationAssistance">implementation assistance</a> methods and to carry out the required functions.
496      *
497      * @param source  the {@link Source} document.
498      * @param pos  the position in the source document.
499      * @return a tag of this type at the specified position in the specified source document if it meets all of the required features, or <code>null</code> if it does not meet the criteria.
500      */
501     protected abstract Tag constructTagAt(Source source, int pos);
502 
503     /**
504      * Indicates whether a tag of this type encloses the specified position of the specified source document.
505      * <br />(<a HREF="TagType.html#ImplementationAssistance">implementation assistance</a> method)
506      * <p>
507      * This is logically equivalent to <code>source.</code>{@link Source#findEnclosingTag(int,TagType) findEnclosingTag(pos,this)}<code>!=null</code>,
508      * but is safe to use within other implementation methods without the risk of causing an infinite recursion.
509      * <p>
510      * This method is called by the {@link TagType} implementation of {@link #isValidPosition(Source, int pos)}.
511      *
512      * @param source  the {@link Source} document.
513      * @param pos  the character position in the source document to check.
514      * @return <code>true</code> if a tag of this type encloses the specified position of the specified source document, otherwise <code>false</code>.
515      */
516     protected final boolean tagEncloses(final Source source, final int pos) {
517         if (pos==0) return false;
518         final Tag enclosingTag=source.findEnclosingTag(pos-1,this); // use pos-1 otherwise a tag at pos could cause infinite recursion when this is called from constructTagAt
519         return enclosingTag!=null && pos!=enclosingTag.getEnd(); // make sure pos!=enclosingTag.getEnd() to compensate for using pos-1 above (important if the tag in question immediately follows an end tag delimiter)
520     }
521 
522     /**
523      * Returns a string representation of this object useful for debugging purposes.
524      * @return a string representation of this object useful for debugging purposes.
525      */
526     public String   toString() {
527         return getDescription();
528     }
529 
530     static final Tag getTagAt(final Source source, final int pos, final boolean assumeNoNestedTags) {
531         final TagTypeRegister.ProspectiveTagTypeIterator prospectiveTagTypeIterator=new TagTypeRegister.ProspectiveTagTypeIterator(source,pos);
532         // prospectiveTagTypeIterator is empty if pos is out of range.
533         while (prospectiveTagTypeIterator.hasNext()) {
534             final TagType tagType=prospectiveTagTypeIterator.getNextTagType();
535             if (assumeNoNestedTags || tagType.isValidPosition(source,pos)) {
536                 try {
537                     final Tag tag=tagType.constructTagAt(source,pos);
538                     if (tag!=null) return tag;
539                 } catch (IndexOutOfBoundsException   ex) {
540                     if (source.isLoggingEnabled()) source.log(source.getRowColumnVector(pos).appendTo(new StringBuffer  (200).append("Tag at ")).append(" not recognised as type '").append(tagType.getDescription()).append("' because it has no end delimiter").toString());
541                 }
542             }
543         }
544         return null;
545     }
546 
547     final String   getNamePrefixForTagConstant() {
548         // this method is only used in deprecated constants and will eventually be removed
549         return getNamePrefix();
550     }
551     
552     final char[] getStartDelimiterCharArray() {
553         return startDelimiterCharArray;
554     }
555 
556     private static final class TagTypesIgnoringEnclosedMarkup {
557         // This internal class is used to contain the array because its static initialisation can occur after
558         // the StartTagType.COMMENT and StartTagType.CDATA_SECTION members have been created.
559         public static TagType[] array=new TagType[] {
560             StartTagType.COMMENT,
561             StartTagType.CDATA_SECTION
562         };
563     }
564 }
565
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags