1 27 package org.htmlparser.lexer; 28 29 import java.io.InputStream ; 30 import java.io.IOException ; 31 import java.io.ObjectInputStream ; 32 import java.io.ObjectOutputStream ; 33 import java.io.Serializable ; 34 import java.io.UnsupportedEncodingException ; 35 import java.lang.reflect.InvocationTargetException ; 36 import java.lang.reflect.Method ; 37 import java.net.MalformedURLException ; 38 import java.net.URL ; 39 import java.net.URLConnection ; 40 import java.net.UnknownHostException ; 41 import java.util.zip.GZIPInputStream ; 42 import java.util.zip.InflaterInputStream ; 43 44 import org.htmlparser.http.ConnectionManager; 45 import org.htmlparser.util.ParserException; 46 47 52 public class Page 53 implements 54 Serializable 55 { 56 62 public static final String DEFAULT_CHARSET = "ISO-8859-1"; 63 64 68 public static final String DEFAULT_CONTENT_TYPE = "text/html"; 69 70 74 public static final char EOF = (char)Source.EOF; 75 76 81 protected String mUrl; 82 83 86 protected String mBaseUrl; 87 88 91 protected Source mSource; 92 93 96 protected PageIndex mIndex; 97 98 101 protected transient URLConnection mConnection; 102 103 106 public static ConnectionManager mConnectionManager = new ConnectionManager (); 107 108 111 public Page () 112 { 113 this (""); 114 } 115 116 127 public Page (URLConnection connection) throws ParserException 128 { 129 if (null == connection) 130 throw new IllegalArgumentException ("connection cannot be null"); 131 setConnection (connection); 132 mBaseUrl = null; 133 } 134 135 142 public Page (InputStream stream, String charset) 143 throws 144 UnsupportedEncodingException 145 { 146 if (null == stream) 147 throw new IllegalArgumentException ("stream cannot be null"); 148 if (null == charset) 149 charset = DEFAULT_CHARSET; 150 mSource = new InputStreamSource (stream, charset); 151 mIndex = new PageIndex (this); 152 mConnection = null; 153 mUrl = null; 154 mBaseUrl = null; 155 } 156 157 164 public Page (String text, String charset) 165 { 166 if (null == text) 167 throw new IllegalArgumentException ("text cannot be null"); 168 if (null == charset) 169 charset = DEFAULT_CHARSET; 170 mSource = new StringSource (text, charset); 171 mIndex = new PageIndex (this); 172 mConnection = null; 173 mUrl = null; 174 mBaseUrl = null; 175 } 176 177 183 public Page (String text) 184 { 185 this (text, null); 186 } 187 188 192 196 public static ConnectionManager getConnectionManager () 197 { 198 return (mConnectionManager); 199 } 200 201 205 public static void setConnectionManager (ConnectionManager manager) 206 { 207 mConnectionManager = manager; 208 } 209 210 234 public static String getCharset (String content) 235 { 236 final String CHARSET_STRING = "charset"; 237 int index; 238 String ret; 239 240 ret = DEFAULT_CHARSET; 241 if (null != content) 242 { 243 index = content.indexOf (CHARSET_STRING); 244 245 if (index != -1) 246 { 247 content = content.substring (index + CHARSET_STRING.length ()).trim (); 248 if (content.startsWith ("=")) 249 { 250 content = content.substring (1).trim (); 251 index = content.indexOf (";"); 252 if (index != -1) 253 content = content.substring (0, index); 254 255 if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ())) 257 content = content.substring (1, content.length () - 1); 258 259 if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ())) 261 content = content.substring (1, content.length () - 1); 262 263 ret = findCharset (content, ret); 264 265 } 277 } 278 } 279 280 return (ret); 281 } 282 283 291 public static String findCharset (String name, String _default) 292 { 293 String ret; 294 295 try 296 { 297 Class cls; 298 Method method; 299 Object object; 300 301 cls = Class.forName ("java.nio.charset.Charset"); 302 method = cls.getMethod ("forName", new Class [] { String .class }); 303 object = method.invoke (null, new Object [] { name }); 304 method = cls.getMethod ("name", new Class [] { }); 305 object = method.invoke (object, new Object [] { }); 306 ret = (String )object; 307 } 308 catch (ClassNotFoundException cnfe) 309 { 310 ret = name; 312 } 313 catch (NoSuchMethodException nsme) 314 { 315 ret = name; 317 } 318 catch (IllegalAccessException ia) 319 { 320 ret = name; 322 } 323 catch (InvocationTargetException ita) 324 { 325 ret = _default; 329 System.out.println ( 330 "unable to determine cannonical charset name for " 331 + name 332 + " - using " 333 + _default); 334 } 335 336 return (ret); 337 } 338 339 343 350 private void writeObject (ObjectOutputStream out) 351 throws 352 IOException 353 { 354 String href; 355 Source source; 356 PageIndex index; 357 358 if (null != getConnection ()) 360 { 361 out.writeBoolean (true); 362 out.writeInt (mSource.offset ()); href = getUrl (); 364 out.writeObject (href); 365 setUrl (getConnection ().getURL ().toExternalForm ()); 366 source = getSource (); 367 mSource = null; index = mIndex; 369 mIndex = null; out.defaultWriteObject (); 371 mSource = source; 372 mIndex = index; 373 } 374 else 375 { 376 out.writeBoolean (false); 377 href = getUrl (); 378 out.writeObject (href); 379 setUrl (null); out.defaultWriteObject (); 381 setUrl (href); 382 } 383 } 384 385 390 private void readObject (ObjectInputStream in) 391 throws 392 IOException , 393 ClassNotFoundException 394 { 395 boolean fromurl; 396 int offset; 397 String href; 398 URL url; 399 Cursor cursor; 400 401 fromurl = in.readBoolean (); 402 if (fromurl) 403 { 404 offset = in.readInt (); 405 href = (String )in.readObject (); 406 in.defaultReadObject (); 407 if (null != getUrl ()) 409 { 410 url = new URL (getUrl ()); 411 try 412 { 413 setConnection (url.openConnection ()); 414 } 415 catch (ParserException pe) 416 { 417 throw new IOException (pe.getMessage ()); 418 } 419 } 420 cursor = new Cursor (this, 0); 421 for (int i = 0; i < offset; i++) 422 try 423 { 424 getCharacter (cursor); 425 } 426 catch (ParserException pe) 427 { 428 throw new IOException (pe.getMessage ()); 429 } 430 setUrl (href); 431 } 432 else 433 { 434 href = (String )in.readObject (); 435 in.defaultReadObject (); 436 setUrl (href); 437 } 438 } 439 440 443 public void reset () 444 { 445 getSource ().reset (); 446 mIndex = new PageIndex (this); } 448 449 452 public void close () throws IOException 453 { 454 if (null != getSource ()) 455 getSource ().destroy (); 456 } 457 458 463 protected void finalize () throws Throwable 464 { 465 close (); 466 } 467 468 473 public URLConnection getConnection () 474 { 475 return (mConnection); 476 } 477 478 488 public void setConnection (URLConnection connection) 489 throws 490 ParserException 491 { 492 Stream stream; 493 String type; 494 String charset; 495 String contentEncoding; 496 497 mConnection = connection; 498 try 499 { 500 getConnection ().connect (); 501 } 502 catch (UnknownHostException uhe) 503 { 504 throw new ParserException ("Connect to " + mConnection.getURL ().toExternalForm () + " failed.", uhe); 505 } 506 catch (IOException ioe) 507 { 508 throw new ParserException ("Exception connecting to " + mConnection.getURL ().toExternalForm () + " (" + ioe.getMessage () + ").", ioe); 509 } 510 type = getContentType (); 511 charset = getCharset (type); 512 try 513 { 514 contentEncoding = connection.getContentEncoding(); 515 if ((null != contentEncoding) && (-1 != contentEncoding.indexOf ("gzip"))) 516 { 517 stream = new Stream (new GZIPInputStream (getConnection ().getInputStream ())); 518 } 519 else if ((null != contentEncoding) && (-1 != contentEncoding.indexOf ("deflate"))) 520 { 521 stream = new Stream (new InflaterInputStream (getConnection ().getInputStream ())); 522 } 523 else 524 { 525 stream = new Stream (getConnection ().getInputStream ()); 526 } 527 528 try 529 { 530 mSource = new InputStreamSource (stream, charset); 531 } 532 catch (UnsupportedEncodingException uee) 533 { 534 charset = DEFAULT_CHARSET; 544 mSource = new InputStreamSource (stream, charset); 545 } 546 } 547 catch (IOException ioe) 548 { 549 throw new ParserException ("Exception getting input stream from " + mConnection.getURL ().toExternalForm () + " (" + ioe.getMessage () + ").", ioe); 550 } 551 mUrl = connection.getURL ().toExternalForm (); 552 mIndex = new PageIndex (this); 553 } 554 555 563 public String getUrl () 564 { 565 return (mUrl); 566 } 567 568 574 public void setUrl (String url) 575 { 576 mUrl = url; 577 } 578 579 583 public String getBaseUrl () 584 { 585 return (mBaseUrl); 586 } 587 588 592 public void setBaseUrl (String url) 593 { 594 mBaseUrl = url; 595 } 596 597 600 public Source getSource () 601 { 602 return (mSource); 603 } 604 605 609 public String getContentType () 610 { 611 URLConnection connection; 612 String content; 613 String ret; 614 615 ret = DEFAULT_CONTENT_TYPE; 616 connection = getConnection (); 617 if (null != connection) 618 { 619 content = connection.getContentType (); 620 if (null != content) 621 ret = content; 622 } 623 624 return (ret); 625 } 626 627 640 public char getCharacter (Cursor cursor) 641 throws 642 ParserException 643 { 644 int i; 645 char ret; 646 647 i = cursor.getPosition (); 648 if (mSource.offset () < i) 649 throw new ParserException ("attempt to read future characters from source"); 651 else if (mSource.offset () == i) 652 try 653 { 654 i = mSource.read (); 655 if (Source.EOF == i) 656 ret = EOF; 657 else 658 { 659 ret = (char)i; 660 cursor.advance (); 661 } 662 } 663 catch (IOException ioe) 664 { 665 throw new ParserException ( 666 "problem reading a character at position " 667 + cursor.getPosition (), ioe); 668 } 669 else 670 { 671 try 673 { 674 ret = mSource.getCharacter (i); 675 } 676 catch (IOException ioe) 677 { 678 throw new ParserException ( 679 "can't read a character at position " 680 + i, ioe); 681 } 682 cursor.advance (); 683 } 684 685 if ('\r' == ret) 687 { ret = '\n'; 689 690 if (mSource.offset () == cursor.getPosition ()) 692 try 693 { 694 i = mSource.read (); 695 if (Source.EOF == i) 696 { 697 } 699 else if ('\n' == (char)i) 700 cursor.advance (); 701 else 702 try 703 { 704 mSource.unread (); 705 } 706 catch (IOException ioe) 707 { 708 throw new ParserException ( 709 "can't unread a character at position " 710 + cursor.getPosition (), ioe); 711 } 712 } 713 catch (IOException ioe) 714 { 715 throw new ParserException ( 716 "problem reading a character at position " 717 + cursor.getPosition (), ioe); 718 } 719 else 720 try 721 { 722 if ('\n' == mSource.getCharacter (cursor.getPosition ())) 723 cursor.advance (); 724 } 725 catch (IOException ioe) 726 { 727 throw new ParserException ( 728 "can't read a character at position " 729 + cursor.getPosition (), ioe); 730 } 731 } 732 if ('\n' == ret) 733 mIndex.add (cursor); 735 736 return (ret); 737 } 738 739 743 public String getEncoding () 744 { 745 return (getSource ().getEncoding ()); 746 } 747 748 769 public void setEncoding (String character_set) 770 throws 771 ParserException 772 { 773 getSource ().setEncoding (character_set); 774 } 775 776 783 public URL constructUrl (String link, String base) 784 throws MalformedURLException 785 { 786 String path; 787 boolean modified; 788 boolean absolute; 789 int index; 790 URL url; 792 url = new URL (new URL (base), link); 793 path = url.getFile (); 794 modified = false; 795 absolute = link.startsWith ("/"); 796 if (!absolute) 797 { while (path.startsWith ("/.")) 800 { 801 if (path.startsWith ("/../")) 802 { 803 path = path.substring (3); 804 modified = true; 805 } 806 else if (path.startsWith ("/./") || path.startsWith("/.")) 807 { 808 path = path.substring (2); 809 modified = true; 810 } 811 else 812 break; 813 } 814 } 815 while (-1 != (index = path.indexOf ("/\\"))) 817 { 818 path = path.substring (0, index + 1) + path.substring (index + 2); 819 modified = true; 820 } 821 if (modified) 822 url = new URL (url, path); 823 824 return (url); 825 } 826 827 833 public String getAbsoluteURL (String link) 834 { 835 String base; 836 URL url; 837 String ret; 838 839 if ((null == link) || ("".equals (link))) 840 ret = ""; 841 else 842 try 843 { 844 base = getBaseUrl (); 845 if (null == base) 846 base = getUrl (); 847 if (null == base) 848 ret = link; 849 else 850 { 851 url = constructUrl (link, base); 852 ret = url.toExternalForm (); 853 } 854 } 855 catch (MalformedURLException murle) 856 { 857 ret = link; 858 } 859 860 return (ret); 861 } 862 863 868 public int row (Cursor cursor) 869 { 870 return (mIndex.row (cursor)); 871 } 872 873 878 public int row (int position) 879 { 880 return (mIndex.row (position)); 881 } 882 883 888 public int column (Cursor cursor) 889 { 890 return (mIndex.column (cursor)); 891 } 892 893 898 public int column (int position) 899 { 900 return (mIndex.column (position)); 901 } 902 903 914 public String getText (int start, int end) 915 { 916 String ret; 917 918 try 919 { 920 ret = mSource.getString (start, end - start); 921 } 922 catch (IOException ioe) 923 { 924 throw new IllegalArgumentException ( 925 "can't get the " 926 + (end - start) 927 + "characters at position " 928 + start 929 + " - " 930 + ioe.getMessage ()); 931 } 932 933 return (ret); 934 } 935 936 946 public void getText (StringBuffer buffer, int start, int end) 947 { 948 int length; 949 950 if ((mSource.offset () < start) || (mSource.offset () < end)) 951 throw new IllegalArgumentException ("attempt to extract future characters from source"); 952 if (end < start) 953 { 954 length = end; 955 end = start; 956 start = length; 957 } 958 length = end - start; 959 try 960 { 961 mSource.getCharacters (buffer, start, length); 962 } 963 catch (IOException ioe) 964 { 965 throw new IllegalArgumentException ( 966 "can't get the " 967 + (end - start) 968 + "characters at position " 969 + start 970 + " - " 971 + ioe.getMessage ()); 972 } 973 } 974 975 980 public String getText () 981 { 982 return (getText (0, mSource.offset ())); 983 } 984 985 990 public void getText (StringBuffer buffer) 991 { 992 getText (buffer, 0, mSource.offset ()); 993 } 994 995 1006 public void getText (char[] array, int offset, int start, int end) 1007 { 1008 int length; 1009 1010 if ((mSource.offset () < start) || (mSource.offset () < end)) 1011 throw new IllegalArgumentException ("attempt to extract future characters from source"); 1012 if (end < start) 1013 { length = end; 1015 end = start; 1016 start = length; 1017 } 1018 length = end - start; 1019 try 1020 { 1021 mSource.getCharacters (array, offset, start, end); 1022 } 1023 catch (IOException ioe) 1024 { 1025 throw new IllegalArgumentException ( 1026 "can't get the " 1027 + (end - start) 1028 + "characters at position " 1029 + start 1030 + " - " 1031 + ioe.getMessage ()); 1032 } 1033 } 1034 1035 1041 public String getLine (Cursor cursor) 1042 { 1043 int line; 1044 int size; 1045 int start; 1046 int end; 1047 1048 line = row (cursor); 1049 size = mIndex.size (); 1050 if (line < size) 1051 { 1052 start = mIndex.elementAt (line); 1053 line++; 1054 if (line <= size) 1055 end = mIndex.elementAt (line); 1056 else 1057 end = mSource.offset (); 1058 } 1059 else { 1061 start = mIndex.elementAt (line - 1); 1062 end = mSource.offset (); 1063 } 1064 1065 1066 return (getText (start, end)); 1067 } 1068 1069 1075 public String getLine (int position) 1076 { 1077 return (getLine (new Cursor (this, position))); 1078 } 1079 1080 1084 public String toString () 1085 { 1086 StringBuffer buffer; 1087 int start; 1088 String ret; 1089 1090 if (mSource.offset () > 0) 1091 { 1092 buffer = new StringBuffer (43); 1093 start = mSource.offset () - 40; 1094 if (0 > start) 1095 start = 0; 1096 else 1097 buffer.append ("..."); 1098 getText (buffer, start, mSource.offset ()); 1099 ret = buffer.toString (); 1100 } 1101 else 1102 ret = super.toString (); 1103 1104 return (ret); 1105 } 1106} 1107 | Popular Tags |