|                                                                                                              1
 21  package au.id.jericho.lib.html;
 22
 23  import java.util.*;
 24
 25
 30  public class Segment implements Comparable
  , CharSequence  { 31      final int begin;
 32      final int end;
 33      final Source source;
 34
 35      List childElements=null;
 36
 37      private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'};
 39
 45      public Segment(final Source source, final int begin, final int end) {
 46          if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException
  (); 47          this.begin=begin;
 48          this.end=end;
 49          if (source==null) throw new IllegalArgumentException
  ("source argument must not be null"); 50          this.source=source;
 51      }
 52
 53          Segment(final int length) {
 55          begin=0;
 56          this.end=length;
 57          source=(Source)this;
 58      }
 59
 60          Segment() {
 62          begin=0;
 63          end=0;
 64          source=null;
 65      }
 66
 67
 71      public final int getBegin() {
 72          return begin;
 73      }
 74
 75
 82      public final int getEnd() {
 83          return end;
 84      }
 85
 86
 94      public final boolean equals(final Object
  object) { 95          if (object==null || !(object instanceof Segment)) return false;
 96          final Segment segment=(Segment)object;
 97          return segment.begin==begin && segment.end==end && segment.source==source;
 98      }
 99
 100
 108     public int hashCode() {
 109         return begin+end;
 110     }
 111
 112
 117     public final int length() {
 118         return end-begin;
 119     }
 120
 121
 129     public final boolean encloses(final Segment segment) {
 130         return begin<=segment.begin && end>=segment.end;
 131     }
 132
 133
 141     public final boolean encloses(final int pos) {
 142         return begin<=pos && pos<end;
 143     }
 144
 145
 156     public String
  toString() { 157         return source.string.substring(begin,end).toString();
 158     }
 159
 160
 172     public String
  extractText() { 173         return extractText(false);
 174     }
 175
 176
 207     public String
  extractText(final boolean includeAttributes) { 208         final StringBuffer
  sb=new StringBuffer  (length()); 209         int textBegin=begin;
 210                 for (final Iterator i=findAllTags().iterator(); i.hasNext();) {
 212             final Tag tag=(Tag)i.next();
 213             final int textEnd=tag.begin;
 214             if (textEnd<textBegin) continue;
 215             while (textBegin<textEnd) sb.append(source.charAt(textBegin++));
 216             if (tag.getTagType()==StartTagType.NORMAL) {
 217                 if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE) {
 218                     final EndTag endTag=source.findNextEndTag(tag.end,tag.name,EndTagType.NORMAL);
 219                     if (endTag!=null) {
 220                         textBegin=endTag.end;
 221                         while (i.hasNext() && i.next()!=endTag) {}
 222                         continue;
 223                     }
 224                 }
 225                 if (includeAttributes) {
 226                     final StartTag startTag=(StartTag)tag;
 227                                         final Attribute titleAttribute=startTag.getAttributes().get("title");
 229                     if (titleAttribute!=null) sb.append(' ').append(titleAttribute.getValueSegment()).append(' ');
 230                                         final Attribute altAttribute=startTag.getAttributes().get("alt");
 232                     if (altAttribute!=null) sb.append(' ').append(altAttribute.getValueSegment()).append(' ');
 233                                         final Attribute labelAttribute=startTag.getAttributes().get("label");
 235                     if (labelAttribute!=null) sb.append(' ').append(labelAttribute.getValueSegment()).append(' ');
 236                                         final Attribute summaryAttribute=startTag.getAttributes().get("summary");
 238                     if (summaryAttribute!=null) sb.append(' ').append(summaryAttribute.getValueSegment()).append(' ');
 239                                     }
 241             }
 242                         if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append('\n');
 244             textBegin=tag.end;
 245         }
 246         while (textBegin<end) sb.append(source.charAt(textBegin++));
 247         final String
  decodedText=CharacterReference.decodeCollapseWhiteSpace(sb); 248         return decodedText;
 249     }
 250
 251
 258     public List findAllTags() {
 259         return findAllTags(null);
 260     }
 261
 262
 272     public List findAllTags(final TagType tagType) {
 273         Tag tag=checkEnclosure(Tag.findPreviousOrNextTag(source,begin,tagType,false));
 274         if (tag==null) return Collections.EMPTY_LIST;
 275         final ArrayList list=new ArrayList();
 276         do {
 277             list.add(tag);
 278             tag=checkEnclosure(Tag.findPreviousOrNextTag(source,tag.begin+1,tagType,false));
 279         } while (tag!=null);
 280         return list;
 281     }
 282
 283
 290     public List findAllStartTags() {
 291         return findAllStartTags(null);
 292     }
 293
 294
 306     public List findAllStartTags(String
  name) { 307         if (name!=null) name=name.toLowerCase();
 308         final boolean isXMLTagName=Tag.isXMLName(name);
 309         StartTag startTag=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,begin,name,isXMLTagName,false));
 310         if (startTag==null) return Collections.EMPTY_LIST;
 311         final ArrayList list=new ArrayList();
 312         do {
 313             list.add(startTag);
 314             startTag=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,startTag.begin+1,name,isXMLTagName,false));
 315         } while (startTag!=null);
 316         return list;
 317     }
 318
 319
 330     public List findAllStartTags(final String
  attributeName, final String  value, final boolean valueCaseSensitive) { 331         StartTag startTag=(StartTag)checkEnclosure(source.findNextStartTag(begin,attributeName,value,valueCaseSensitive));
 332         if (startTag==null) return Collections.EMPTY_LIST;
 333         final ArrayList list=new ArrayList();
 334         do {
 335             list.add(startTag);
 336             startTag=(StartTag)checkEnclosure(source.findNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
 337         } while (startTag!=null);
 338         return list;
 339     }
 340
 341
 353     public List getChildElements() {
 354         if (childElements==null) {
 355             if (length()==0) {
 356                 childElements=Collections.EMPTY_LIST;
 357             } else {
 358                 childElements=new ArrayList();
 359                 int pos=begin;
 360                 while (true) {
 361                     final StartTag childStartTag=source.findNextStartTag(pos);
 362                     if (childStartTag==null || childStartTag.begin>=end) break;
 363                     if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
 364                         pos=childStartTag.end;
 365                         continue;
 366                     }
 367                     final Element childElement=childStartTag.getElement();
 368                     childElements.add(childElement);
 369                     childElement.getChildElements();
 370                     pos=childElement.end;
 371                 }
 372             }
 373         }
 374         return childElements;
 375     }
 376
 377
 384     public List findAllElements() {
 385         return findAllElements((String
  )null); 386     }
 387
 388
 400     public List findAllElements(String
  name) { 401         if (name!=null) name=name.toLowerCase();
 402         final List startTags=findAllStartTags(name);
 403         if (startTags.isEmpty()) return Collections.EMPTY_LIST;
 404         final ArrayList elements=new ArrayList(startTags.size());
 405         for (final Iterator i=startTags.iterator(); i.hasNext();) {
 406             final StartTag startTag=(StartTag)i.next();
 407             final Element element=startTag.getElement();
 408             if (element.end>end) break;
 409             elements.add(element);
 410         }
 411         return elements;
 412     }
 413
 414
 422     public List findAllElements(final StartTagType startTagType) {
 423         final List startTags=findAllTags(startTagType);
 424         if (startTags.isEmpty()) return Collections.EMPTY_LIST;
 425         final ArrayList elements=new ArrayList(startTags.size());
 426         for (final Iterator i=startTags.iterator(); i.hasNext();) {
 427             final StartTag startTag=(StartTag)i.next();
 428             final Element element=startTag.getElement();
 429             if (element.end>end) break;
 430             elements.add(element);
 431         }
 432         return elements;
 433     }
 434
 435
 439     public List findAllCharacterReferences() {
 440         CharacterReference characterReference=findNextCharacterReference(begin);
 441         if (characterReference==null) return Collections.EMPTY_LIST;
 442         final ArrayList list=new ArrayList();
 443         do {
 444             list.add(characterReference);
 445             characterReference=findNextCharacterReference(characterReference.end);
 446         } while (characterReference!=null);
 447         return list;
 448     }
 449
 450
 454     public List findFormControls() {
 455         return FormControl.findAll(this);
 456     }
 457
 458
 466     public FormFields findFormFields() {
 467         return new FormFields(findFormControls());
 468     }
 469
 470
 479     public Attributes parseAttributes() {
 480         return source.parseAttributes(begin,end);
 481     }
 482
 483
 502     public void ignoreWhenParsing() {
 503         source.ignoreWhenParsing(begin,end);
 504     }
 505
 506
 525     public int compareTo(final Object
  o) { 526         if (this==o) return 0;
 527         final Segment segment=(Segment)o;
 528         if (begin<segment.begin) return -1;
 529         if (begin>segment.begin) return 1;
 530         if (end<segment.end) return -1;
 531         if (end>segment.end) return 1;
 532         return 0;
 533     }
 534
 535
 539     public final boolean isWhiteSpace() {
 540         for (int i=begin; i<end; i++)
 541             if (!isWhiteSpace(source.charAt(i))) return false;
 542         return true;
 543     }
 544
 545
 566     public static final boolean isWhiteSpace(final char ch) {
 567         for (int i=0; i<WHITESPACE.length; i++)
 568             if (ch==WHITESPACE[i]) return true;
 569         return false;
 570     }
 571
 572
 576     public String
  getDebugInfo() { 577         final StringBuffer
  sb=new StringBuffer  (50); 578         sb.append('(');
 579         source.getRowColumnVector(begin).appendTo(sb);
 580         sb.append('-');
 581         source.getRowColumnVector(end).appendTo(sb);
 582         sb.append(')');
 583         return sb.toString();
 584     }
 585
 586
 599     public final char charAt(final int index) {
 600         return source.string.charAt(begin+index);
 601     }
 602
 603
 617     public final CharSequence
  subSequence(final int beginIndex, final int endIndex) { 618         return source.string.subSequence(begin+beginIndex,begin+endIndex);
 619     }
 620
 621
 629     public boolean isComment() {
 630         return false;     }
 632
 633
 641     public List findAllComments() {
 642         return findAllTags(StartTagType.COMMENT);
 643     }
 644
 645
 653     public String
  getSourceText() { 654         return toString();
 655     }
 656
 657
 670     public final String
  getSourceTextNoWhitespace() { 671         return appendCollapseWhiteSpace(new StringBuffer
  (length()),this).toString(); 672     }
 673
 674
 683     public final List findWords() {
 684         final ArrayList words=new ArrayList();
 685         int wordBegin=-1;
 686         for (int i=begin; i<end; i++) {
 687             if (isWhiteSpace(source.charAt(i))) {
 688                 if (wordBegin==-1) continue;
 689                 words.add(new Segment(source,wordBegin,i));
 690                 wordBegin=-1;
 691             } else {
 692                 if (wordBegin==-1) wordBegin=i;
 693             }
 694         }
 695         if (wordBegin!=-1) words.add(new Segment(source, wordBegin,end));
 696         return words;
 697     }
 698
 699
 703     static final StringBuffer
  appendCollapseWhiteSpace(final StringBuffer  sb, final CharSequence  text) { 704         final int textLength=text.length();
 705         int i=0;
 706         boolean lastWasWhiteSpace=false;
 707         while (true) {
 708             if (i>=textLength) return sb;
 709             if (!isWhiteSpace(text.charAt(i))) break;
 710             i++;
 711         }
 712         do {
 713             final char ch=text.charAt(i++);
 714             if (isWhiteSpace(ch)) {
 715                 lastWasWhiteSpace=true;
 716             } else {
 717                 if (lastWasWhiteSpace) {
 718                     sb.append(' ');
 719                     lastWasWhiteSpace=false;
 720                 }
 721                 sb.append(ch);
 722             }
 723         } while (i<textLength);
 724         return sb;
 725     }
 726
 727     private Tag checkEnclosure(final Tag tag) {
 728         if (tag==null || tag.end>end) return null;
 729         return tag;
 730     }
 731
 732     private CharacterReference findNextCharacterReference(final int pos) {
 733         final CharacterReference characterReference=source.findNextCharacterReference(pos);
 734         if (characterReference==null || characterReference.end>end) return null;
 735         return characterReference;
 736     }
 737 }
 738
 739
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |