1 21 package au.id.jericho.lib.html; 22 23 import java.util.*; 24 import java.io.*; 25 import java.net.*; 26 27 54 public class Source extends Segment { 55 final String string; 56 String documentSpecifiedEncoding=UNINITIALISED; 57 String encoding=UNINITIALISED; 58 String encodingSpecificationInfo; 59 private ParseText parseText=null; 60 private OutputDocument parseTextOutputDocument=null; 61 private Writer logWriter=null; 62 private RowColumnVector[] rowColumnVectorCacheArray=null; 63 final Cache cache=new Cache(this); 64 boolean useAllTypesCache=true; 65 boolean useSpecialTypesCache=true; 66 int endOfLastTagIgnoringEnclosedMarkup=-1; Tag[] allTagsArray=null; List allTags=null; 70 List allStartTags=null; 71 private List allElements=null; 72 73 private static final String UNINITIALISED=""; 74 75 80 public Source(final CharSequence text) { 81 super(text.length()); 82 string=text.toString(); 83 } 84 85 private Source(final EncodedSource encodedSource) throws IOException { 86 this(Util.getString(encodedSource.Reader)); 87 encoding=encodedSource.Encoding; 88 encodingSpecificationInfo=encodedSource.EncodingSpecificationInfo; 89 } 91 92 private Source(final Reader reader, final String inputStreamReaderEncoding) throws IOException { 93 this(Util.getString(reader)); 94 if (inputStreamReaderEncoding!=null) { 95 encoding=inputStreamReaderEncoding; 96 encodingSpecificationInfo="InputStreamReader.getEncoding() of constructor argument"; 97 } 98 } 99 100 110 public Source(final Reader reader) throws IOException { 111 this(reader,(reader instanceof InputStreamReader) ? ((InputStreamReader)reader).getEncoding() : null); 112 } 113 114 129 public Source(final InputStream inputStream) throws IOException { 130 this(EncodedSource.construct(inputStream,null)); 131 } 132 133 188 public Source(final URL url) throws IOException { 189 this(EncodedSource.construct(url)); 190 } 191 192 private String setEncoding(final String encoding, final String encodingSpecificationInfo) { 193 if (this.encoding==UNINITIALISED) { 194 this.encoding=encoding; 195 this.encodingSpecificationInfo=encodingSpecificationInfo; 196 } 197 return encoding; 198 } 199 200 229 public String getDocumentSpecifiedEncoding() { 230 if (documentSpecifiedEncoding!=UNINITIALISED) return documentSpecifiedEncoding; 231 final Tag xmlDeclarationTag=getTagAt(0); 232 if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) { 233 documentSpecifiedEncoding=((StartTag)xmlDeclarationTag).getAttributeValue("encoding"); 234 if (documentSpecifiedEncoding!=null) return setEncoding(documentSpecifiedEncoding,xmlDeclarationTag.toString()); 235 } 236 final StartTag contentTypeMetaTag=findNextStartTag(0,"http-equiv","Content-Type",false); 238 if (contentTypeMetaTag!=null) { 239 final String contentValue=contentTypeMetaTag.getAttributeValue("content"); 240 if (contentValue!=null) { 241 documentSpecifiedEncoding=getCharsetParameterFromHttpHeaderValue(contentValue); 242 if (documentSpecifiedEncoding!=null) return setEncoding(documentSpecifiedEncoding,contentTypeMetaTag.toString()); 243 } 244 } 245 return setEncoding(null,"no encoding specified in document"); 246 } 247 248 275 public String getEncoding() { 276 if (encoding==UNINITIALISED) getDocumentSpecifiedEncoding(); 277 return encoding; 278 } 279 280 289 public String getEncodingSpecificationInfo() { 290 if (encoding==UNINITIALISED) getDocumentSpecifiedEncoding(); 291 return encodingSpecificationInfo; 292 } 293 294 315 public boolean isXML() { 316 final Tag xmlDeclarationTag=getTagAt(0); 317 if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) return true; 318 final Tag doctypeTag=findNextTag(0,StartTagType.DOCTYPE_DECLARATION); 319 if (doctypeTag!=null && getParseText().indexOf("xhtml",doctypeTag.begin,doctypeTag.end)!=-1) return true; 321 return findNextStartTag(0,HTMLElementName.HTML)==null; 323 } 324 325 333 public int getRow(final int pos) { 334 return getRowColumnVector(pos).getRow(); 335 } 336 337 345 public int getColumn(final int pos) { 346 return getRowColumnVector(pos).getColumn(); 347 } 348 349 357 public RowColumnVector getRowColumnVector(final int pos) { 358 if (pos>end) throw new IndexOutOfBoundsException (); 359 if (rowColumnVectorCacheArray==null) rowColumnVectorCacheArray=RowColumnVector.getCacheArray(this); 360 return RowColumnVector.get(rowColumnVectorCacheArray,pos); 361 } 362 363 367 public String toString() { 368 return string; 369 } 370 371 412 public Tag[] fullSequentialParse() { 413 final boolean assumeNoNestedTags=false; 421 if (cache.getTagCount()!=0) cache.clear(); 422 final boolean useAllTypesCacheSave=useAllTypesCache; 423 try { 424 useAllTypesCache=false; 425 useSpecialTypesCache=false; 426 return Tag.parseAll(this,assumeNoNestedTags); 427 } finally { 428 useAllTypesCache=useAllTypesCacheSave; 429 useSpecialTypesCache=true; 430 endOfLastTagIgnoringEnclosedMarkup=-1; 431 } 432 } 433 434 466 public List getChildElements() { 467 if (childElements==null) { 468 if (length()==0) { 469 childElements=Collections.EMPTY_LIST; 470 } else { 471 if (allTags==null) log("NOTE: Calling Source.fullSequentialParse() can significantly improve the performance of this operation"); 472 childElements=new ArrayList(); 473 int pos=0; 474 while (true) { 475 final StartTag childStartTag=source.findNextStartTag(pos); 476 if (childStartTag==null) break; 477 if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) { 478 pos=childStartTag.end; 479 continue; 480 } 481 final Element childElement=childStartTag.getElement(); 482 childElement.parentElement=null; 483 childElements.add(childElement); 484 childElement.getChildElements(0); 485 pos=childElement.end; 486 } 487 } 488 } 489 return childElements; 490 } 491 492 501 public List findAllTags() { 502 if (allTags==null) { 503 log("NOTE: Calling Source.fullSequentialParse() can significantly improve the performance of this operation"); 504 allTags=super.findAllTags(); 505 } 506 return allTags; 507 } 508 509 518 public List findAllStartTags() { 519 if (allStartTags==null) { 520 final List allTags=findAllTags(); 521 allStartTags=new ArrayList(allTags.size()); 522 for (final Iterator i=allTags.iterator(); i.hasNext();) { 523 final Object next=i.next(); 524 if (next instanceof StartTag) allStartTags.add(next); 525 } 526 } 527 return allStartTags; 528 } 529 530 539 public List findAllElements() { 540 if (allElements==null) { 541 final List allStartTags=findAllStartTags(); 542 if (allStartTags.isEmpty()) return Collections.EMPTY_LIST; 543 allElements=new ArrayList(allStartTags.size()); 544 for (final Iterator i=allStartTags.iterator(); i.hasNext();) { 545 final StartTag startTag=(StartTag)i.next(); 546 allElements.add(startTag.getElement()); 547 } 548 } 549 return allElements; 550 } 551 552 566 public Element getElementById(final String id) { 567 final StartTag startTag=findNextStartTag(0,Attribute.ID,id,true); 568 return startTag==null ? null : startTag.getElement(); 569 } 570 571 581 public final Tag getTagAt(final int pos) { 582 return Tag.getTagAt(this,pos); 583 } 584 585 593 public Tag findPreviousTag(final int pos) { 594 return Tag.findPreviousOrNextTag(this,pos,true); 595 } 596 597 606 public Tag findPreviousTag(final int pos, final TagType tagType) { 607 return Tag.findPreviousOrNextTag(this,pos,tagType,true); 608 } 609 610 618 public Tag findNextTag(final int pos) { 619 return Tag.findPreviousOrNextTag(this,pos,false); 620 } 621 622 631 public Tag findNextTag(final int pos, final TagType tagType) { 632 return Tag.findPreviousOrNextTag(this,pos,tagType,false); 633 } 634 635 643 public Tag findEnclosingTag(final int pos) { 644 return findEnclosingTag(pos,null); 645 } 646 647 656 public Tag findEnclosingTag(final int pos, final TagType tagType) { 657 final Tag tag=findPreviousTag(pos,tagType); 658 if (tag==null || tag.end<=pos) return null; 659 return tag; 660 } 661 662 671 public Element findNextElement(final int pos) { 672 final StartTag startTag=findNextStartTag(pos); 673 return startTag==null ? null : startTag.getElement(); 674 } 675 676 694 public Element findNextElement(final int pos, String name) { 695 final StartTag startTag=findNextStartTag(pos,name); 696 return startTag==null ? null : startTag.getElement(); 697 } 698 699 707 public StartTag findPreviousStartTag(final int pos) { 708 return StartTag.findPreviousOrNext(this,pos,true); 709 } 710 711 725 public StartTag findPreviousStartTag(final int pos, String name) { 726 if (name!=null) name=name.toLowerCase(); 727 final boolean isXMLTagName=Tag.isXMLName(name); 728 return StartTag.findPreviousOrNext(this,pos,name,isXMLTagName,true); 729 } 730 731 739 public StartTag findNextStartTag(final int pos) { 740 return StartTag.findPreviousOrNext(this,pos,false); 741 } 742 743 760 public StartTag findNextStartTag(final int pos, String name) { 761 if (name!=null) name=name.toLowerCase(); 762 final boolean isXMLTagName=Tag.isXMLName(name); 763 return StartTag.findPreviousOrNext(this,pos,name,isXMLTagName,false); 764 } 765 766 777 public StartTag findNextStartTag(final int pos, final String attributeName, final String value, final boolean valueCaseSensitive) { 778 return StartTag.findNext(this,pos,attributeName,value,valueCaseSensitive); 779 } 780 781 789 public EndTag findPreviousEndTag(final int pos) { 790 return EndTag.findPreviousOrNext(this,pos,true); 791 } 792 793 802 public EndTag findPreviousEndTag(final int pos, final String name) { 803 if (name==null) throw new IllegalArgumentException ("name argument must not be null"); 804 return EndTag.findPreviousOrNext(this,pos,name.toLowerCase(),EndTagType.NORMAL,true); 805 } 806 807 815 public EndTag findNextEndTag(final int pos) { 816 return EndTag.findPreviousOrNext(this,pos,false); 817 } 818 819 828 public EndTag findNextEndTag(final int pos, final String name) { 829 return findNextEndTag(pos,name,EndTagType.NORMAL); 830 } 831 832 842 public EndTag findNextEndTag(final int pos, final String name, final EndTagType endTagType) { 843 if (name==null) throw new IllegalArgumentException ("name argument must not be null"); 844 return EndTag.findPreviousOrNext(this,pos,name.toLowerCase(),endTagType,false); 845 } 846 847 859 public Element findEnclosingElement(final int pos) { 860 return findEnclosingElement(pos,null); 861 } 862 863 878 public Element findEnclosingElement(final int pos, String name) { 879 int startBefore=pos; 880 if (name!=null) name=name.toLowerCase(); 881 final boolean isXMLTagName=Tag.isXMLName(name); 882 while (true) { 883 StartTag startTag=StartTag.findPreviousOrNext(this,startBefore,name,isXMLTagName,true); 884 if (startTag==null) return null; 885 Element element=startTag.getElement(); 886 if (pos < element.end) return element; 887 startBefore=startTag.begin-1; 888 } 889 } 890 891 899 public CharacterReference findPreviousCharacterReference(final int pos) { 900 return CharacterReference.findPreviousOrNext(this,pos,true); 901 } 902 903 911 public CharacterReference findNextCharacterReference(final int pos) { 912 return CharacterReference.findPreviousOrNext(this,pos,false); 913 } 914 915 929 public int findNameEnd(int pos) { 930 if (!Tag.isXMLNameStartChar(string.charAt(pos++))) return -1; 931 while (pos<string.length() && Tag.isXMLNameChar(string.charAt(pos))) pos++; 932 return pos; 933 } 934 935 957 public Attributes parseAttributes(final int pos, final int maxEnd) { 958 return parseAttributes(pos,maxEnd,Attributes.getDefaultMaxErrorCount()); 959 } 960 961 980 public Attributes parseAttributes(final int pos, final int maxEnd, final int maxErrorCount) { 981 return Attributes.construct(this,pos,maxEnd,maxErrorCount); 982 } 983 984 992 public void ignoreWhenParsing(final int begin, final int end) { 993 if (parseTextOutputDocument==null) { 994 parseTextOutputDocument=new OutputDocument(getParseText()); 995 parseText=null; 996 } 997 parseTextOutputDocument.replaceWithSpaces(begin,end); 998 } 999 1000 1005 public void ignoreWhenParsing(final Collection segments) { 1006 for (final Iterator i=segments.iterator(); i.hasNext();) { 1007 ((Segment)i.next()).ignoreWhenParsing(); 1008 } 1009 } 1010 1011 1062 public CharStreamSource indent(final String indentText, final boolean tidyTags, final boolean collapseWhiteSpace, final boolean indentAllElements) { 1063 return new Indent(this,indentText,tidyTags,collapseWhiteSpace,indentAllElements); 1064 } 1065 1066 1073 public Writer getLogWriter() { 1074 return logWriter; 1075 } 1076 1077 1085 public void setLogWriter(final Writer writer) { 1086 logWriter=writer; 1087 } 1088 1089 1099 public boolean isLoggingEnabled() { 1100 return logWriter!=null; 1101 } 1102 1103 1116 public void log(final String message) { 1117 if (logWriter==null) return; 1118 try { 1119 logWriter.write(message); 1120 logWriter.write('\n'); 1121 logWriter.flush(); 1122 } catch (IOException ex) { 1123 throw new RuntimeException (ex); 1124 } 1125 } 1126 1127 1133 public void clearCache() { 1134 cache.clear(); 1135 allTagsArray=null; 1136 allTags=null; 1137 allStartTags=null; 1138 allElements=null; 1139 } 1140 1141 1145 public String getCacheDebugInfo() { 1146 return cache.toString(); 1147 } 1148 1149 1158 List getParsedTags() { 1159 final ArrayList list=new ArrayList(); 1160 for (final Iterator i=cache.getTagIterator(); i.hasNext();) list.add(i.next()); 1161 return list; 1162 } 1163 1164 1174 public final ParseText getParseText() { 1175 if (parseText==null) { 1176 if (parseTextOutputDocument!=null) { 1177 parseText=new ParseText(parseTextOutputDocument); 1178 parseTextOutputDocument=null; 1179 } else { 1180 parseText=new ParseText(this); 1181 } 1182 } 1183 return parseText; 1184 } 1185 1186 1203 public StartTag findEnclosingStartTag(final int pos) { 1204 final StartTag startTag=findPreviousStartTag(pos); 1205 if (startTag==null || startTag.end<=pos) return null; 1206 return startTag; 1207 } 1208 1209 1218 public StartTag findNextComment(final int pos) { 1219 return (StartTag)findNextTag(pos,StartTagType.COMMENT); 1220 } 1221 1222 1231 public Segment findEnclosingComment(final int pos) { 1232 return findEnclosingTag(pos,StartTagType.COMMENT); 1233 } 1234 1235 1248 public Iterator getNextTagIterator(final int pos) { 1249 return Tag.getNextTagIterator(this,pos); 1250 } 1251 1252 static String getCharsetParameterFromHttpHeaderValue(final String httpHeaderValue) { 1253 final int charsetParameterPos=httpHeaderValue.toLowerCase().indexOf("charset="); 1254 if (charsetParameterPos==-1) return null; 1255 final int charsetBegin=charsetParameterPos+8; 1256 int charsetEnd=httpHeaderValue.indexOf(';',charsetBegin); 1257 final String charset=(charsetEnd==-1) ? httpHeaderValue.substring(charsetBegin) : httpHeaderValue.substring(charsetBegin,charsetEnd); 1258 return charset.trim(); 1259 } 1260} 1261 | Popular Tags |