1 21 package au.id.jericho.lib.html; 22 23 import java.util.*; 24 25 30 public class Segment implements Comparable , CharSequence { 31 final int begin; 32 final int end; 33 final Source source; 34 35 List childElements=null; 36 37 private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; 39 45 public Segment(final Source source, final int begin, final int end) { 46 if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException (); 47 this.begin=begin; 48 this.end=end; 49 if (source==null) throw new IllegalArgumentException ("source argument must not be null"); 50 this.source=source; 51 } 52 53 Segment(final int length) { 55 begin=0; 56 this.end=length; 57 source=(Source)this; 58 } 59 60 Segment() { 62 begin=0; 63 end=0; 64 source=null; 65 } 66 67 71 public final int getBegin() { 72 return begin; 73 } 74 75 82 public final int getEnd() { 83 return end; 84 } 85 86 94 public final boolean equals(final Object object) { 95 if (object==null || !(object instanceof Segment)) return false; 96 final Segment segment=(Segment)object; 97 return segment.begin==begin && segment.end==end && segment.source==source; 98 } 99 100 108 public int hashCode() { 109 return begin+end; 110 } 111 112 117 public final int length() { 118 return end-begin; 119 } 120 121 129 public final boolean encloses(final Segment segment) { 130 return begin<=segment.begin && end>=segment.end; 131 } 132 133 141 public final boolean encloses(final int pos) { 142 return begin<=pos && pos<end; 143 } 144 145 156 public String toString() { 157 return source.string.substring(begin,end).toString(); 158 } 159 160 172 public String extractText() { 173 return extractText(false); 174 } 175 176 207 public String extractText(final boolean includeAttributes) { 208 final StringBuffer sb=new StringBuffer (length()); 209 int textBegin=begin; 210 for (final Iterator i=findAllTags().iterator(); i.hasNext();) { 212 final Tag tag=(Tag)i.next(); 213 final int textEnd=tag.begin; 214 if (textEnd<textBegin) continue; 215 while (textBegin<textEnd) sb.append(source.charAt(textBegin++)); 216 if (tag.getTagType()==StartTagType.NORMAL) { 217 if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE) { 218 final EndTag endTag=source.findNextEndTag(tag.end,tag.name,EndTagType.NORMAL); 219 if (endTag!=null) { 220 textBegin=endTag.end; 221 while (i.hasNext() && i.next()!=endTag) {} 222 continue; 223 } 224 } 225 if (includeAttributes) { 226 final StartTag startTag=(StartTag)tag; 227 final Attribute titleAttribute=startTag.getAttributes().get("title"); 229 if (titleAttribute!=null) sb.append(' ').append(titleAttribute.getValueSegment()).append(' '); 230 final Attribute altAttribute=startTag.getAttributes().get("alt"); 232 if (altAttribute!=null) sb.append(' ').append(altAttribute.getValueSegment()).append(' '); 233 final Attribute labelAttribute=startTag.getAttributes().get("label"); 235 if (labelAttribute!=null) sb.append(' ').append(labelAttribute.getValueSegment()).append(' '); 236 final Attribute summaryAttribute=startTag.getAttributes().get("summary"); 238 if (summaryAttribute!=null) sb.append(' ').append(summaryAttribute.getValueSegment()).append(' '); 239 } 241 } 242 if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append('\n'); 244 textBegin=tag.end; 245 } 246 while (textBegin<end) sb.append(source.charAt(textBegin++)); 247 final String decodedText=CharacterReference.decodeCollapseWhiteSpace(sb); 248 return decodedText; 249 } 250 251 258 public List findAllTags() { 259 return findAllTags(null); 260 } 261 262 272 public List findAllTags(final TagType tagType) { 273 Tag tag=checkEnclosure(Tag.findPreviousOrNextTag(source,begin,tagType,false)); 274 if (tag==null) return Collections.EMPTY_LIST; 275 final ArrayList list=new ArrayList(); 276 do { 277 list.add(tag); 278 tag=checkEnclosure(Tag.findPreviousOrNextTag(source,tag.begin+1,tagType,false)); 279 } while (tag!=null); 280 return list; 281 } 282 283 290 public List findAllStartTags() { 291 return findAllStartTags(null); 292 } 293 294 306 public List findAllStartTags(String name) { 307 if (name!=null) name=name.toLowerCase(); 308 final boolean isXMLTagName=Tag.isXMLName(name); 309 StartTag startTag=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,begin,name,isXMLTagName,false)); 310 if (startTag==null) return Collections.EMPTY_LIST; 311 final ArrayList list=new ArrayList(); 312 do { 313 list.add(startTag); 314 startTag=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,startTag.begin+1,name,isXMLTagName,false)); 315 } while (startTag!=null); 316 return list; 317 } 318 319 330 public List findAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) { 331 StartTag startTag=(StartTag)checkEnclosure(source.findNextStartTag(begin,attributeName,value,valueCaseSensitive)); 332 if (startTag==null) return Collections.EMPTY_LIST; 333 final ArrayList list=new ArrayList(); 334 do { 335 list.add(startTag); 336 startTag=(StartTag)checkEnclosure(source.findNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive)); 337 } while (startTag!=null); 338 return list; 339 } 340 341 353 public List getChildElements() { 354 if (childElements==null) { 355 if (length()==0) { 356 childElements=Collections.EMPTY_LIST; 357 } else { 358 childElements=new ArrayList(); 359 int pos=begin; 360 while (true) { 361 final StartTag childStartTag=source.findNextStartTag(pos); 362 if (childStartTag==null || childStartTag.begin>=end) break; 363 if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) { 364 pos=childStartTag.end; 365 continue; 366 } 367 final Element childElement=childStartTag.getElement(); 368 childElements.add(childElement); 369 childElement.getChildElements(); 370 pos=childElement.end; 371 } 372 } 373 } 374 return childElements; 375 } 376 377 384 public List findAllElements() { 385 return findAllElements((String )null); 386 } 387 388 400 public List findAllElements(String name) { 401 if (name!=null) name=name.toLowerCase(); 402 final List startTags=findAllStartTags(name); 403 if (startTags.isEmpty()) return Collections.EMPTY_LIST; 404 final ArrayList elements=new ArrayList(startTags.size()); 405 for (final Iterator i=startTags.iterator(); i.hasNext();) { 406 final StartTag startTag=(StartTag)i.next(); 407 final Element element=startTag.getElement(); 408 if (element.end>end) break; 409 elements.add(element); 410 } 411 return elements; 412 } 413 414 422 public List findAllElements(final StartTagType startTagType) { 423 final List startTags=findAllTags(startTagType); 424 if (startTags.isEmpty()) return Collections.EMPTY_LIST; 425 final ArrayList elements=new ArrayList(startTags.size()); 426 for (final Iterator i=startTags.iterator(); i.hasNext();) { 427 final StartTag startTag=(StartTag)i.next(); 428 final Element element=startTag.getElement(); 429 if (element.end>end) break; 430 elements.add(element); 431 } 432 return elements; 433 } 434 435 439 public List findAllCharacterReferences() { 440 CharacterReference characterReference=findNextCharacterReference(begin); 441 if (characterReference==null) return Collections.EMPTY_LIST; 442 final ArrayList list=new ArrayList(); 443 do { 444 list.add(characterReference); 445 characterReference=findNextCharacterReference(characterReference.end); 446 } while (characterReference!=null); 447 return list; 448 } 449 450 454 public List findFormControls() { 455 return FormControl.findAll(this); 456 } 457 458 466 public FormFields findFormFields() { 467 return new FormFields(findFormControls()); 468 } 469 470 479 public Attributes parseAttributes() { 480 return source.parseAttributes(begin,end); 481 } 482 483 502 public void ignoreWhenParsing() { 503 source.ignoreWhenParsing(begin,end); 504 } 505 506 525 public int compareTo(final Object o) { 526 if (this==o) return 0; 527 final Segment segment=(Segment)o; 528 if (begin<segment.begin) return -1; 529 if (begin>segment.begin) return 1; 530 if (end<segment.end) return -1; 531 if (end>segment.end) return 1; 532 return 0; 533 } 534 535 539 public final boolean isWhiteSpace() { 540 for (int i=begin; i<end; i++) 541 if (!isWhiteSpace(source.charAt(i))) return false; 542 return true; 543 } 544 545 566 public static final boolean isWhiteSpace(final char ch) { 567 for (int i=0; i<WHITESPACE.length; i++) 568 if (ch==WHITESPACE[i]) return true; 569 return false; 570 } 571 572 576 public String getDebugInfo() { 577 final StringBuffer sb=new StringBuffer (50); 578 sb.append('('); 579 source.getRowColumnVector(begin).appendTo(sb); 580 sb.append('-'); 581 source.getRowColumnVector(end).appendTo(sb); 582 sb.append(')'); 583 return sb.toString(); 584 } 585 586 599 public final char charAt(final int index) { 600 return source.string.charAt(begin+index); 601 } 602 603 617 public final CharSequence subSequence(final int beginIndex, final int endIndex) { 618 return source.string.subSequence(begin+beginIndex,begin+endIndex); 619 } 620 621 629 public boolean isComment() { 630 return false; } 632 633 641 public List findAllComments() { 642 return findAllTags(StartTagType.COMMENT); 643 } 644 645 653 public String getSourceText() { 654 return toString(); 655 } 656 657 670 public final String getSourceTextNoWhitespace() { 671 return appendCollapseWhiteSpace(new StringBuffer (length()),this).toString(); 672 } 673 674 683 public final List findWords() { 684 final ArrayList words=new ArrayList(); 685 int wordBegin=-1; 686 for (int i=begin; i<end; i++) { 687 if (isWhiteSpace(source.charAt(i))) { 688 if (wordBegin==-1) continue; 689 words.add(new Segment(source,wordBegin,i)); 690 wordBegin=-1; 691 } else { 692 if (wordBegin==-1) wordBegin=i; 693 } 694 } 695 if (wordBegin!=-1) words.add(new Segment(source, wordBegin,end)); 696 return words; 697 } 698 699 703 static final StringBuffer appendCollapseWhiteSpace(final StringBuffer sb, final CharSequence text) { 704 final int textLength=text.length(); 705 int i=0; 706 boolean lastWasWhiteSpace=false; 707 while (true) { 708 if (i>=textLength) return sb; 709 if (!isWhiteSpace(text.charAt(i))) break; 710 i++; 711 } 712 do { 713 final char ch=text.charAt(i++); 714 if (isWhiteSpace(ch)) { 715 lastWasWhiteSpace=true; 716 } else { 717 if (lastWasWhiteSpace) { 718 sb.append(' '); 719 lastWasWhiteSpace=false; 720 } 721 sb.append(ch); 722 } 723 } while (i<textLength); 724 return sb; 725 } 726 727 private Tag checkEnclosure(final Tag tag) { 728 if (tag==null || tag.end>end) return null; 729 return tag; 730 } 731 732 private CharacterReference findNextCharacterReference(final int pos) { 733 final CharacterReference characterReference=source.findNextCharacterReference(pos); 734 if (characterReference==null || characterReference.end>end) return null; 735 return characterReference; 736 } 737 } 738 739 | Popular Tags |