Segment


1   // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2   // Version 2.2
3   // Copyright (C) 2006 Martin Jericho
4   // http://sourceforge.net/projects/jerichohtml/
5   //
6   // This library is free software; you can redistribute it and/or
7   // modify it under the terms of the GNU Lesser General Public
8   // License as published by the Free Software Foundation; either
9   // version 2.1 of the License, or (at your option) any later version.
10  // http://www.gnu.org/copyleft/lesser.html
11  //
12  // This library is distributed in the hope that it will be useful,
13  // but WITHOUT ANY WARRANTY; without even the implied warranty of
14  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  // Lesser General Public License for more details.
16  //
17  // You should have received a copy of the GNU Lesser General Public
18  // License along with this library; if not, write to the Free Software
19  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  
21  package au.id.jericho.lib.html;
22  
23  import java.util.*;
24  
25  /**
26   * Represents a segment of a {@link Source} document.
27   * <p>
28   * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
29   */
30  public class Segment implements Comparable  , CharSequence   {
31      final int begin; // character position in the source at which this segment begins (returned by getBegin())
32      final int end; // character position in the source immediately after the end of this segment (returned by getEnd())
33      final Source source; // the document this segment belongs to; null only for the dummy flag instances built by the no-arg constructor
34      
35      List childElements=null; // cache filled on the first call to getChildElements(); null until then
36  
37      private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method
38  
39      /**
40       * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
41       * @param source  the {@link Source} document, must not be <code>null</code>.
42       * @param begin  the character position in the source where this segment begins.
43       * @param end  the character position in the source where this segment ends.
44       */
45      public Segment(final Source source, final int begin, final int end) {
46          if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException  ();
47          this.begin=begin;
48          this.end=end;
49          if (source==null) throw new IllegalArgumentException  ("source argument must not be null");
50          this.source=source;
51      }
52  
53      // Only called from Source constructor
Segment(final int length) {
    // The instance is its own source document, so the span starts at position 0 and ends at the given length.
    this.source=(Source)this;
    this.begin=0;
    this.end=length;
}
59  
60      // Only used for creating dummy flag instances of this type (see Element.NOT_CACHED)
Segment() {
    // Empty span with no source document attached.
    this.source=null;
    this.begin=0;
    this.end=0;
}
66  
67      /**
68       * Returns the character position in the {@link Source} document at which this segment begins.
69       * @return the character position in the {@link Source} document at which this segment begins.
70       */
71      public final int getBegin() {
72          return begin;
73      }
74  
75      /**
76       * Returns the character position in the {@link Source} document immediately after the end of this segment.
77       * <p>
78       * The character at the position specified by this property is <b>not</b> included in the segment.
79       *
80       * @return the character position in the {@link Source} document immediately after the end of this segment.
81       */
82      public final int getEnd() {
83          return end;
84      }
85  
86      /**
87       * Compares the specified object with this <code>Segment</code> for equality.
88       * <p>
89       * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
90       * and both segments have the same {@link Source}, and the same begin and end positions.
91       * @param object  the object to be compared for equality with this <code>Segment</code>.
92       * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
93       */
94      public final boolean equals(final Object   object) {
95          if (object==null || !(object instanceof Segment)) return false;
96          final Segment segment=(Segment)object;
97          return segment.begin==begin && segment.end==end && segment.source==source;
98      }
99  
100     /**
101      * Returns a hash code value for the segment.
102      * <p>
103      * The current implementation returns the sum of the begin and end positions, although this is not
104      * guaranteed in future versions.
105      *
106      * @return a hash code value for the segment.
107      */
108     public int hashCode() {
109         return begin+end;
110     }
111 
112     /**
113      * Returns the length of the segment.
114      * This is defined as the number of characters between the begin and end positions.
115      * @return the length of the segment.
116      */
117     public final int length() {
118         return end-begin;
119     }
120 
121     /**
122      * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
123      * <p>
124      * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
125      *
126      * @param segment  the segment to be tested for being enclosed by this segment.
127      * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
128      */
129     public final boolean encloses(final Segment segment) {
130         return begin<=segment.begin && end>=segment.end;
131     }
132 
133     /**
134      * Indicates whether this segment encloses the specified character position in the source document.
135      * <p>
136      * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
137      *
138      * @param pos  the position in the {@link Source} document.
139      * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
140      */
141     public final boolean encloses(final int pos) {
142         return begin<=pos && pos<end;
143     }
144 
145     /**
146      * Returns the source text of this segment as a <code>String</code>.
147      * <p>
148      * The returned <code>String</code> is newly created with every call to this method, unless this
149      * segment is itself an instance of {@link Source}.
150      * <p>
151      * Note that before version 2.0 this returned a representation of this object useful for debugging purposes,
152      * which can now be obtained via the {@link #getDebugInfo()} method.
153      *
154      * @return the source text of this segment as a <code>String</code>.
155      */
156     public String   toString() {
157         return source.string.substring(begin,end).toString();
158     }
159 
160     /**
161      * Extracts the text content of this segment.
162      * <p>
163      * This method removes all of the tags from the segment and
164      * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
165      * <p>
166      * See the documentation of the {@link #extractText(boolean includeAttributes)} method for more details.
167      * <p>
168      * This is equivalent to calling {@link #extractText(boolean) extractText(false)}.
169      *
170      * @return the text content of this segment.
171      */
172     public String   extractText() {
173         return extractText(false);
174     }
175 
176     /**
177      * Extracts the text content of this segment.
178      * <p>
179      * This method removes all of the tags from the segment and
180      * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
181      * Tags are also converted to whitespace unless they belong to an
182      * {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
183      * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to whitespace despite being an inline-level element.
184      * <p>
185      * Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
186      * is ignored.
187      * <p>
188      * Specifying a value of <code>true</code> as an argument to the <code>includeAttributes</code> parameter causes the values of 
189      * <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
190      * <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
191      * <a target="_blank" HREF="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and
192      * <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>
193      * attributes of {@linkplain StartTagType#NORMAL normal} tags to be included in the extracted text.
194      * <p>
195      * <dl>
196      *  <dt>Example:</dt>
197      *  <dd>source segment "<code>&lt;div&gt;&lt;b&gt;O&lt;/b&gt;ne&lt;/div&gt;&lt;div&gt;&lt;b&gt;T&lt;/b&gt;&lt;script&gt;//a&nbsp;script&nbsp;&lt;/script&gt;wo&lt;/div&gt;</code>"
198      *   produces the text "<code>One Two</code>".
199      * </dl>
200      * <p>
201      * Note that in version 2.1, no tags were converted to whitespace and text inside {@link HTMLElementName#SCRIPT SCRIPT} and
202      * {@link HTMLElementName#STYLE STYLE} elements was included.  The example above produced the text "<code>OneT//a script wo</code>".
203      *
204      * @param includeAttributes  indicates whether the values of <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>, <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>, <a target="_blank" HREF="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a> attributes are included.
205      * @return the text content of this segment.
206      */
207     public String   extractText(final boolean includeAttributes) {
208         final StringBuffer   sb=new StringBuffer  (length());
209         int textBegin=begin;
210         // use findAllTags().iterator() instead of source.findNextTag(textBegin) to take advantage of allTags cache in Source object
211         for (final Iterator i=findAllTags().iterator(); i.hasNext();) {
212             final Tag tag=(Tag)i.next();
213             final int textEnd=tag.begin;
214             if (textEnd<textBegin) continue;
215             while (textBegin<textEnd) sb.append(source.charAt(textBegin++));
216             if (tag.getTagType()==StartTagType.NORMAL) {
217                 if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE) {
218                     final EndTag endTag=source.findNextEndTag(tag.end,tag.name,EndTagType.NORMAL);
219                     if (endTag!=null) {
220                         textBegin=endTag.end;
221                         while (i.hasNext() && i.next()!=endTag) {}
222                         continue;
223                     }
224                 }
225                 if (includeAttributes) {
226                     final StartTag startTag=(StartTag)tag;
227                     // add title attribute:
228                     final Attribute titleAttribute=startTag.getAttributes().get("title");
229                     if (titleAttribute!=null) sb.append(' ').append(titleAttribute.getValueSegment()).append(' ');
230                     // add alt attribute (APPLET, AREA, IMG and INPUT elements):
231                     final Attribute altAttribute=startTag.getAttributes().get("alt");
232                     if (altAttribute!=null) sb.append(' ').append(altAttribute.getValueSegment()).append(' ');
233                     // add label attribute (OPTION and OPTGROUP elements):
234                     final Attribute labelAttribute=startTag.getAttributes().get("label");
235                     if (labelAttribute!=null) sb.append(' ').append(labelAttribute.getValueSegment()).append(' ');
236                     // add summary attribute (TABLE element):
237                     final Attribute summaryAttribute=startTag.getAttributes().get("summary");
238                     if (summaryAttribute!=null) sb.append(' ').append(summaryAttribute.getValueSegment()).append(' ');
239                     // don't bother with the prompt attribute from the ININDEX element as the element is deprecated and very rarely used.
240                 }
241             }
242             // Treat tags not belonging to inline-level elements as whitespace:
243             if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append('\n');
244             textBegin=tag.end;
245         }
246         while (textBegin<end) sb.append(source.charAt(textBegin++));
247         final String   decodedText=CharacterReference.decodeCollapseWhiteSpace(sb);
248         return decodedText;
249     }
250 
251     /**
252      * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
253      * <p>
254      * See the {@link Tag} class documentation for more details about the behaviour of this method.
255      *
256      * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
257      */
258     public List findAllTags() {
259         return findAllTags(null);
260     }
261 
262     /**
263      * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
264      * <p>
265      * See the {@link Tag} class documentation for more details about the behaviour of this method.
266      * <p>
267      * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #findAllTags()}.
268      *
269      * @param tagType  the {@linkplain TagType type} of tags to find.
270      * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
271      */
272     public List findAllTags(final TagType tagType) {
273         Tag tag=checkEnclosure(Tag.findPreviousOrNextTag(source,begin,tagType,false));
274         if (tag==null) return Collections.EMPTY_LIST;
275         final ArrayList list=new ArrayList();
276         do {
277             list.add(tag);
278             tag=checkEnclosure(Tag.findPreviousOrNextTag(source,tag.begin+1,tagType,false));
279         } while (tag!=null);
280         return list;
281     }
282 
283     /**
284      * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
285      * <p>
286      * See the {@link Tag} class documentation for more details about the behaviour of this method.
287      *
288      * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
289      */
290     public List findAllStartTags() {
291         return findAllStartTags(null);
292     }
293 
294     /**
295      * Returns a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
296      * <p>
297      * See the {@link Tag} class documentation for more details about the behaviour of this method.
298      * <p>
299      * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllStartTags()}.
300      * <p>
301      * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
302      *
303      * @param name  the {@linkplain StartTag#getName() name} of the start tags to find.
304      * @return a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
305      */
306     public List findAllStartTags(String   name) {
307         if (name!=null) name=name.toLowerCase();
308         final boolean isXMLTagName=Tag.isXMLName(name);
309         StartTag startTag=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,begin,name,isXMLTagName,false));
310         if (startTag==null) return Collections.EMPTY_LIST;
311         final ArrayList list=new ArrayList();
312         do {
313             list.add(startTag);
314             startTag=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,startTag.begin+1,name,isXMLTagName,false));
315         } while (startTag!=null);
316         return list;
317     }
318 
319     /**
320      * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair 
321      * that are {@linkplain #encloses(Segment) enclosed} by this segment.
322      * <p>
323      * See the {@link Tag} class documentation for more details about the behaviour of this method.
324      *
325      * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
326      * @param value  the value of the specified attribute to search for, must not be <code>null</code>.
327      * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
328      * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
329      */
330     public List findAllStartTags(final String   attributeName, final String   value, final boolean valueCaseSensitive) {
331         StartTag startTag=(StartTag)checkEnclosure(source.findNextStartTag(begin,attributeName,value,valueCaseSensitive));
332         if (startTag==null) return Collections.EMPTY_LIST;
333         final ArrayList list=new ArrayList();
334         do {
335             list.add(startTag);
336             startTag=(StartTag)checkEnclosure(source.findNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
337         } while (startTag!=null);
338         return list;
339     }
340 
341     /**
342      * Returns a list of the immediate children of this segment in the document element hierarchy.
343      * <p>
344      * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
345      * <p>
346      * The objects in the list are all of type {@link Element}.
347      * <p>
348      * See the {@link Source#getChildElements()} method for more details.
349      *
350      * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
351      * @see Element#getParentElement()
352      */
353     public List getChildElements() {
354         if (childElements==null) {
355             if (length()==0) {
356                 childElements=Collections.EMPTY_LIST;
357             } else {
358                 childElements=new ArrayList();
359                 int pos=begin;
360                 while (true) {
361                     final StartTag childStartTag=source.findNextStartTag(pos);
362                     if (childStartTag==null || childStartTag.begin>=end) break;
363                     if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
364                         pos=childStartTag.end;
365                         continue;
366                     }
367                     final Element childElement=childStartTag.getElement();
368                     childElements.add(childElement);
369                     childElement.getChildElements();
370                     pos=childElement.end;
371                 }
372             }
373         }
374         return childElements;
375     }
376 
377     /**
378      * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
379      * <p>
380      * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags()} method.
381      *
382      * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
383      */
384     public List findAllElements() {
385         return findAllElements((String  )null);
386     }
387 
388     /**
389      * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
390      * <p>
391      * The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags(String name)} method.
392      * <p>
393      * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllElements()}.
394      * <p>
395      * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
396      *
397      * @param name  the {@linkplain Element#getName() name} of the elements to find.
398      * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
399      */
400     public List findAllElements(String   name) {
401         if (name!=null) name=name.toLowerCase();
402         final List startTags=findAllStartTags(name);
403         if (startTags.isEmpty()) return Collections.EMPTY_LIST;
404         final ArrayList elements=new ArrayList(startTags.size());
405         for (final Iterator i=startTags.iterator(); i.hasNext();) {
406             final StartTag startTag=(StartTag)i.next();
407             final Element element=startTag.getElement();
408             if (element.end>end) break;
409             elements.add(element);
410         }
411         return elements;
412     }
413 
414     /**
415      * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
416      * <p>
417      * The elements returned correspond exactly with the start tags returned in the {@link #findAllTags(TagType)} method.
418      *
419      * @param startTagType  the {@linkplain StartTagType type} of start tags to find, must not be <code>null</code>.
420      * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
421      */
422     public List findAllElements(final StartTagType startTagType) {
423         final List startTags=findAllTags(startTagType);
424         if (startTags.isEmpty()) return Collections.EMPTY_LIST;
425         final ArrayList elements=new ArrayList(startTags.size());
426         for (final Iterator i=startTags.iterator(); i.hasNext();) {
427             final StartTag startTag=(StartTag)i.next();
428             final Element element=startTag.getElement();
429             if (element.end>end) break;
430             elements.add(element);
431         }
432         return elements;
433     }
434 
435     /**
436      * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
437      * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
438      */
439     public List findAllCharacterReferences() {
440         CharacterReference characterReference=findNextCharacterReference(begin);
441         if (characterReference==null) return Collections.EMPTY_LIST;
442         final ArrayList list=new ArrayList();
443         do {
444             list.add(characterReference);
445             characterReference=findNextCharacterReference(characterReference.end);
446         } while (characterReference!=null);
447         return list;
448     }
449 
450     /**
451      * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
452      * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
453      */
454     public List findFormControls() {
455         return FormControl.findAll(this);
456     }
457 
458     /**
459      * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
460      * <p>
461      * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #findFormControls()}<code>)</code>.
462      *
463      * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
464      * @see #findFormControls()
465      */
466     public FormFields findFormFields() {
467         return new FormFields(findFormControls());
468     }
469 
470     /**
471      * Parses any {@link Attributes} within this segment.
472      * This method is only used in the unusual situation where attributes exist outside of a start tag.
473      * The {@link StartTag#getAttributes()} method should be used in normal situations.
474      * <p>
475      * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
476      *
477      * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
478      */
479     public Attributes parseAttributes() {
480         return source.parseAttributes(begin,end);
481     }
482 
483     /**
484      * Causes this segment to be ignored when parsing.
485      * <p>
486      * This method is usually used to exclude {@linkplain TagType#isServerTag() server tags} or other non-HTML segments from the source text
487      * so that they do not interfere with the parsing of the surrounding HTML.
488      * <p>
489      * This is necessary because many server tags are used as attribute values and in other places within
490      * HTML tags, and very often contain characters that prevent the parser from recognising the surrounding tag.
491      * <p>
492      * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
493      * and so will continue to be found by the <a HREF="Tag.html#TagSearchMethods">tag search methods</a>.
494      * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
495      * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
496      * <p>
497      * For efficiency reasons, this method should be called on all segments that need to be ignored without calling
498      * any of the <a HREF="Tag.html#TagSearchMethods">tag search methods</a> in between.
499      *
500      * @see Source#ignoreWhenParsing(Collection segments)
501      */
502     public void ignoreWhenParsing() {
503         source.ignoreWhenParsing(begin,end);
504     }
505 
506     /**
507      * Compares this <code>Segment</code> object to another object.
508      * <p>
509      * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
510      * <p>
511      * A segment is considered to be before another segment if its begin position is earlier,
512      * or in the case that both segments begin at the same position, its end position is earlier.
513      * <p>
514      * Segments that begin and end at the same position are considered equal for
515      * the purposes of this comparison, even if they relate to different source documents.
516      * <p>
517      * Note: this class has a natural ordering that is inconsistent with equals.
518      * This means that this method may return zero in some cases where calling the
519      * {@link #equals(Object)} method with the same argument returns <code>false</code>.
520      *
521      * @param o  the segment to be compared
522      * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
523      * @throws ClassCastException if the argument is not a <code>Segment</code>
524      */
525     public int compareTo(final Object   o) {
526         if (this==o) return 0;
527         final Segment segment=(Segment)o;
528         if (begin<segment.begin) return -1;
529         if (begin>segment.begin) return 1;
530         if (end<segment.end) return -1;
531         if (end>segment.end) return 1;
532         return 0;
533     }
534 
535     /**
536      * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
537      * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
538      */
539     public final boolean isWhiteSpace() {
540         for (int i=begin; i<end; i++)
541             if (!isWhiteSpace(source.charAt(i))) return false;
542         return true;
543     }
544 
545     /**
546      * Indicates whether the specified character is <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
547      * <p>
548      * The <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
549      * specifies the following white space characters:
550      * <ul>
551      *  <li>space (U+0020)
552      *  <li>tab (U+0009)
553      *  <li>form feed (U+000C)
554      *  <li>line feed (U+000A)
555      *  <li>carriage return (U+000D)
556      *  <li>zero-width space (U+200B)
557      * </ul>
558      * <p>
559      * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
560      * recognise them as whitespace and renders them as an unprintable character (empty square).
561      * Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
562      *
563      * @param ch  the character to test.
564      * @return <code>true</code> if the specified character is <a target="_blank" HREF="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
565      */
566     public static final boolean isWhiteSpace(final char ch) {
567         for (int i=0; i<WHITESPACE.length; i++)
568             if (ch==WHITESPACE[i]) return true;
569         return false;
570     }
571 
572     /**
573      * Returns a string representation of this object useful for debugging purposes.
574      * @return a string representation of this object useful for debugging purposes.
575      */
576     public String   getDebugInfo() {
577         final StringBuffer   sb=new StringBuffer  (50);
578         sb.append('(');
579         source.getRowColumnVector(begin).appendTo(sb);
580         sb.append('-');
581         source.getRowColumnVector(end).appendTo(sb);
582         sb.append(')');
583         return sb.toString();
584     }
585 
586     /**
587      * Returns the character at the specified index.
588      * <p>
589      * This is logically equivalent to <code>toString().charAt(index)</code>
590      * for valid argument values <code>0 &lt;= index &lt; length()</code>.
591      * <p>
592      * However because this implementation works directly on the underlying document source string,
593      * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
594      * for an invalid argument value.
595      *
596      * @param index  the index of the character.
597      * @return the character at the specified index.
598      */
599     public final char charAt(final int index) {
600         return source.string.charAt(begin+index);
601     }
602 
603     /**
604      * Returns a new character sequence that is a subsequence of this sequence.
605      * <p>
606      * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
607      * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
608      * <p>
609      * However because this implementation works directly on the underlying document source string,
610      * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
611      * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
612      *
613      * @param beginIndex  the begin index, inclusive.
614      * @param endIndex  the end index, exclusive.
615      * @return a new character sequence that is a subsequence of this sequence.
616      */
617     public final CharSequence   subSequence(final int beginIndex, final int endIndex) {
618         return source.string.subSequence(begin+beginIndex,begin+endIndex);
619     }
620 
621     /**
622      * Indicates whether this segment is a {@link Tag} of type {@link StartTagType#COMMENT}.
623      * <p>
624      * This method has been deprecated as of version 2.0 as it is not a robust method of checking whether an HTML comment spans this segment.
625      *
626      * @return <code>true</code> if this segment is a {@link Tag} of type {@link StartTagType#COMMENT}, otherwise <code>false</code>.
627      * @deprecated  Use <code>this instanceof </code>{@link Tag}<code> && ((Tag)this).</code>{@link Tag#getTagType() getTagType()}<code>==</code>{@link StartTagType#COMMENT} instead.
628      */
629     public boolean isComment() {
630         return false; // overridden in StartTag
631     }
632 
633     /**
634      * Returns a list of all {@link StartTag} objects representing HTML {@linkplain StartTagType#COMMENT comments} that are {@linkplain #encloses(Segment) enclosed} by this segment.
635      * <p>
636      * This method has been deprecated as of version 2.0 in favour of the more generic {@link #findAllTags(TagType)} method.
637      *
638      * @return a list of all {@link StartTag} objects representing HTML {@linkplain StartTagType#COMMENT comments} that are {@linkplain #encloses(Segment) enclosed} by this segment.
639      * @deprecated  Use {@link #findAllTags(TagType) findAllTags}<code>(</code>{@link StartTagType#COMMENT}<code>)</code> instead.
640      */
641     public List findAllComments() {
642         return findAllTags(StartTagType.COMMENT);
643     }
644 
645     /**
646      * Returns the source text of this segment.
647      * <p>
648      * This method has been deprecated as of version 2.0 as it now duplicates the functionality of the {@link #toString()} method.
649      *
650      * @return the source text of this segment.
651      * @deprecated  Use {@link #toString() toString()} instead.
652      */
653     public String   getSourceText() {
654         return toString();
655     }
656 
657     /**
658      * Returns the source text of this segment without {@linkplain #isWhiteSpace(char) white space}.
659      * <p>
660      * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
661      * <p>
662      * This method has been deprecated as of version 2.0 as it is no longer used internally and
663      * has no practical use as a public method.
664      * It is similar to the new {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method, but
665      * does not {@linkplain CharacterReference#decode(CharSequence) decode} the text after collapsing the white space.
666      * <p>
667      * @return the source text of this segment without white space.
668      * @deprecated  Use the more useful {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method instead.
669      */
670     public final String   getSourceTextNoWhitespace() {
671         return appendCollapseWhiteSpace(new StringBuffer  (length()),this).toString();
672     }
673 
674     /**
675      * Returns a list of <code>Segment</code> objects representing every word in this segment separated by {@linkplain #isWhiteSpace(char) white space}.
676      * Note that any markup contained in this segment is regarded as normal text for the purposes of this method.
677      * <p>
678      * This method has been deprecated as of version 2.0 as it has no discernable use.
679      *
680      * @return a list of <code>Segment</code> objects representing every word in this segment separated by white space.
681      * @deprecated  no replacement
682      */
683     public final List findWords() {
684         final ArrayList words=new ArrayList();
685         int wordBegin=-1;
686         for (int i=begin; i<end; i++) {
687             if (isWhiteSpace(source.charAt(i))) {
688                 if (wordBegin==-1) continue;
689                 words.add(new Segment(source,wordBegin,i));
690                 wordBegin=-1;
691             } else {
692                 if (wordBegin==-1) wordBegin=i;
693             }
694         }
695         if (wordBegin!=-1) words.add(new Segment(source, wordBegin,end));
696         return words;
697     }
698 
699     /**
700      * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
701      * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
702      */
703     static final StringBuffer   appendCollapseWhiteSpace(final StringBuffer   sb, final CharSequence   text) {
704         final int textLength=text.length();
705         int i=0;
706         boolean lastWasWhiteSpace=false;
707         while (true) {
708             if (i>=textLength) return sb;
709             if (!isWhiteSpace(text.charAt(i))) break;
710             i++;
711         }
712         do {
713             final char ch=text.charAt(i++);
714             if (isWhiteSpace(ch)) {
715                 lastWasWhiteSpace=true;
716             } else {
717                 if (lastWasWhiteSpace) {
718                     sb.append(' ');
719                     lastWasWhiteSpace=false;
720                 }
721                 sb.append(ch);
722             }
723         } while (i<textLength);
724         return sb;
725     }
726 
727     private Tag checkEnclosure(final Tag tag) {
728         if (tag==null || tag.end>end) return null;
729         return tag;
730     }
731 
732     private CharacterReference findNextCharacterReference(final int pos) {
733         final CharacterReference characterReference=source.findNextCharacterReference(pos);
734         if (characterReference==null || characterReference.end>end) return null;
735         return characterReference;
736     }
737 }
738 
739
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags