1 32 33 package websphinx; 34 35 import java.net.URL ; 36 import java.net.URLConnection ; 37 import java.net.HttpURLConnection ; 39 import java.io.IOException ; 41 import java.io.InputStream ; 42 import rcm.util.Str; 43 44 49 public class Page extends Region { 50 51 static final int TYPICAL_LENGTH = 20240; 53 54 Link origin; 56 long lastModified = 0; 57 long expiration = 0; 58 String contentType; 59 String contentEncoding; 60 int responseCode = -1; 61 String responseMessage = null; 62 URL base; 63 String title; 64 Link[] links; 65 66 int contentLock; 67 71 byte[] contentBytes; 73 String content; 74 Region[] tokens; 75 Text[] words; 76 Tag[] tags; 77 Element[] elements; 78 Element root; 79 String canonicalTags; 80 81 85 public Page (Link link) throws IOException { 86 this (link, DownloadParameters.NO_LIMITS, new HTMLParser ()); 87 } 88 89 94 public Page (Link link, DownloadParameters dp) throws IOException { 95 this (link, dp, new HTMLParser ()); 96 } 97 98 103 public Page (Link link, DownloadParameters dp, HTMLParser parser) throws IOException { 104 super (null, 0, 0); 105 source = this; 106 origin = link; 107 base = getURL (); 108 download (dp, parser); 109 link.setPage (this); 110 } 111 112 118 public Page (URL url, String html) { 119 this (url, html, new HTMLParser ()); 120 } 121 122 129 public Page (URL url, String html, HTMLParser parser) { 130 super (null, 0, html.length()); 131 source = this; 132 base = url; 133 this.content = html; 134 this.contentBytes = html.getBytes (); 135 contentLock = -1; 136 parse (parser); 137 } 138 139 143 public Page (String content) { 144 super (null, 0, content.length()); 145 source = this; 147 this.content = content; 148 this.contentBytes = content.getBytes (); 149 contentLock = -1; 150 } 151 152 156 public Page (byte[] content) { 157 super (null, 0, content.length); 158 source = this; 160 this.contentBytes = new byte[content.length]; 161 System.arraycopy (content, 0, this.contentBytes, 0, content.length); 162 this.content = new String (content); 163 contentLock = -1; 164 } 165 166 170 175 181 187 public void download (DownloadParameters dp, HTMLParser parser) throws IOException { 188 URLConnection conn = 189 Access.getAccess ().openConnection (origin); 190 191 InputStream in = conn.getInputStream (); 193 base = conn.getURL (); 194 lastModified = conn.getLastModified (); 195 expiration = conn.getExpiration (); 196 contentType = conn.getContentType (); 197 contentEncoding = conn.getContentEncoding (); 198 199 if (conn instanceof HttpURLConnection ) { 202 HttpURLConnection httpconn = (HttpURLConnection )conn; 203 204 responseCode = httpconn.getResponseCode (); 205 responseMessage = httpconn.getResponseMessage (); 206 if (responseMessage == null) 207 responseMessage = "unknown error"; 208 209 if (responseCode >= 300) 210 throw new IOException (responseCode + " " + responseMessage); 212 } 213 215 218 int maxKB = dp.getMaxPageSize (); 220 int maxBytes = (maxKB > 0) ? maxKB * 1024 : Integer.MAX_VALUE; 221 int expectedLength = conn.getContentLength (); 222 if (expectedLength > maxBytes) 223 throw new IOException ("Page greater than " 224 + maxBytes + " bytes"); 225 if (expectedLength == -1) 226 expectedLength = TYPICAL_LENGTH; 227 byte[] buf = new byte[expectedLength]; 228 int n; 229 int total = 0; 230 231 while ((n = in.read (buf, total, buf.length - total)) != -1) { 232 total += n; 233 if (total > maxBytes) 234 throw new IOException ("Page greater than " 235 + maxBytes + " bytes"); 236 if (total == buf.length) { 237 int c = in.read (); 239 if (c == -1) 240 break; else { 242 byte[] newbuf = new byte[Math.min (buf.length * 2, maxBytes)]; 245 System.arraycopy (buf, 0, newbuf, 0, buf.length); 246 buf = newbuf; 247 buf[total++] = (byte) c; 248 } 249 } 250 } 251 in.close (); 252 253 if (total != buf.length) { 254 byte[] newbuf = new byte[total]; 256 System.arraycopy (buf, 0, newbuf, 0, total); 257 buf = newbuf; 258 } 259 260 contentBytes = buf; 261 content = new String (buf); 262 start = 0; 263 end = total; 264 contentLock = 1; 265 266 if (contentType == null 268 || contentType.startsWith ("text/html") 269 || contentType.startsWith ("content/unknown")) 270 parse (parser); 271 } 272 273 void downloadSafely () { 274 try { 275 download (new DownloadParameters (), new HTMLParser ()); 276 } catch (Throwable e) { 277 } 278 } 279 280 284 289 public void parse (HTMLParser parser) { 290 if (!hasContent()) 291 downloadSafely (); 292 try { 293 parser.parse (this); 294 } catch (IOException e) { 295 throw new RuntimeException (e.toString()); 296 } 297 } 298 299 304 public boolean isParsed () { 305 return tokens != null; 306 } 307 308 312 public boolean isHTML () { 313 return root != null; 314 } 315 316 320 public boolean isImage () { 321 byte[] bytes = getContentBytes (); 322 return startsWith (bytes, GIF_MAGIC) || startsWith (bytes, JPG_MAGIC); 323 } 324 325 private static final byte[] GIF_MAGIC = { 326 (byte) 'G', (byte)'I', (byte)'F', (byte)'8' 327 }; 328 private static final byte[] JPG_MAGIC = { 329 (byte) 0377, (byte) 0330, (byte) 0377, 330 (byte) 0340, (byte) 0, (byte) 020, 331 (byte) 'J', (byte) 'F', (byte) 'I', (byte) 'F' 332 }; 333 334 private boolean startsWith (byte[] bytes, byte[] prefix) { 335 if (prefix.length > bytes.length) 336 return false; 337 for (int i = 0, n = prefix.length; i < n; ++i) 338 if (bytes[i] != prefix[i]) 339 return false; 340 return true; 341 } 342 343 347 353 public void keepContent () { 354 if (contentLock > 0) 355 ++contentLock; 356 } 357 358 373 public void discardContent () { 374 if (contentLock == 0) return; 376 377 if (--contentLock > 0) return; 379 380 if (origin == null) 381 return; 383 contentBytes = null; 385 content = null; 386 tokens = null; 387 tags = null; 388 words = null; 389 elements = null; 390 root = null; 391 canonicalTags = null; 392 393 if (links != null) { 395 for (int i=0; i<links.length; ++i) 396 if (links[i] instanceof Link) 397 ((Link)links[i]).discardContent (); 398 } 399 400 404 contentLock = 0; 405 } 406 407 412 public final boolean hasContent () { 413 return contentLock != 0; 414 } 415 416 420 424 public int getDepth () { 425 return origin != null ? origin.getDepth () : 0; 426 } 427 428 432 public Link getOrigin () { 433 return origin; 434 } 435 436 445 public URL getBase () { 446 return base; 447 } 448 449 453 public URL getURL () { 454 return origin != null ? origin.getURL() : null; 455 } 456 457 461 public String getTitle () { 462 return title; 463 } 464 465 470 public String getContent () { 471 if (!hasContent()) 472 downloadSafely (); 473 return content; 474 } 475 476 480 public byte[] getContentBytes () { 481 if (!hasContent()) 482 downloadSafely (); 483 return contentBytes; 484 } 485 486 490 public Region[] getTokens() { 491 if (!hasContent ()) 492 downloadSafely (); 493 return tokens; 494 } 495 496 500 public Tag[] getTags () { 501 if (!hasContent ()) 502 downloadSafely (); 503 return tags; 504 } 505 506 510 public Text[] getWords () { 511 if (!hasContent ()) 512 downloadSafely (); 513 return words; 514 } 515 516 523 public Element[] getElements () { 524 if (!hasContent ()) 525 downloadSafely (); 526 return elements; 527 } 528 529 534 public Element getRootElement () { 535 if (!hasContent ()) 536 downloadSafely (); 537 return root; 538 } 539 540 545 public Link[] getLinks() { 546 return links; 547 } 548 549 553 public String toURL () { 554 return origin != null ? origin.toURL () : null; 555 } 556 557 561 public String toDescription () { 562 return (title != null && title.length() > 0 ? title + " " : "") + "[" + getURL() + "]"; 563 } 564 565 569 public String toString () { 570 return getContent (); 571 } 572 573 578 public long getLastModified () { 579 return lastModified; 580 } 581 586 public void setLastModified (long last) { 587 lastModified = last; 588 } 589 590 595 public long getExpiration () { 596 return expiration; 597 } 598 603 public void setExpiration (long expire) { 604 expiration = expire; 605 } 606 607 611 public String getContentType () { 612 return contentType; 613 } 614 618 public void setContentType (String type) { 619 contentType = type; 620 } 621 622 626 public String getContentEncoding () { 627 return contentEncoding; 628 } 629 633 public void setContentEncoding (String encoding) { 634 contentEncoding = encoding; 635 } 636 637 644 public int getResponseCode () { 645 return responseCode; 646 } 647 648 652 public String getResponseMessage () { 653 return responseMessage; 654 } 655 656 662 public String substringContent (int start, int end) { 663 return getContent ().substring (start, end); 664 } 665 666 672 public String substringHTML (int start, int end) { 673 String s = getContent ().substring (start, end); 674 if (!isHTML ()) { 675 s = Str.replace (s, "&", "&"); 676 s = Str.replace (s, "<", "<"); 677 s = Str.replace (s, ">", ">"); 678 s = "<PRE>" + s + "</PRE>"; 679 } 680 return s; 681 } 682 683 690 public String substringText (int start, int end) { 691 if (words == null) 692 return ""; 694 StringBuffer buf = new StringBuffer (); 696 for (int j = findStart (words, start); j<words.length; ++j) { 697 if (words[j].end > end) 698 break; 699 else { 700 if (buf.length() > 0) 701 buf.append (' '); 702 buf.append (words[j].text); 703 } 704 } 705 return buf.toString(); 706 } 707 708 715 public String substringTags (int start, int end) { 716 if (tags == null) 717 return ""; 719 StringBuffer buf = new StringBuffer (); 721 for (int j = findStart (tags, start); j<tags.length; ++j) { 722 if (tags[j].end > end) 723 break; 724 else { 725 if (buf.length() > 0) 726 buf.append (' '); 727 buf.append (getContent ().substring (tags[j].start, tags[j].end)); 728 } 729 } 730 return buf.toString(); 731 } 732 733 764 public String substringCanonicalTags (int start, int end) { 765 if (tokens == null) 766 return ""; 768 boolean all = (start == this.start && end == this.end); 769 770 if (all && canonicalTags != null) 771 return canonicalTags; 772 773 StringBuffer buf = new StringBuffer (); 775 for (int j = findStart (tokens, start); j<tokens.length; ++j) { 776 if (tokens[j].end > end) 777 break; 778 else if (tokens[j] instanceof Tag) 779 Tagexp.canonicalizeTag (buf, (Tag)tokens[j], j); 780 } 781 782 String result = buf.toString (); 783 if (all) 784 canonicalTags = result; 785 return result; 786 } 787 788 public static void main (String [] args) throws Exception { 789 int method = Link.GET; 790 791 for (int i=0; i<args.length; ++i) { 792 if (args[i].equals ("-post")) 793 method = Link.POST; 794 else if (args[i].equals ("-get")) 795 method = Link.GET; 796 else { 797 Link link = method == Link.GET 798 ? new Link (args[i]) 799 : new Link (args[i]); try { 801 Page p = new Page (link); 802 System.out.write (p.getContentBytes ()); 803 } catch (IOException e) { 804 System.out.println (e); 805 } 806 } 807 } 808 } 809 810 } 811 | Popular Tags |