ExtractorUniversal


1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Jan 15, 2004
20   *
21   */
22  package org.archive.crawler.extractor;
23  
24  import java.io.IOException  ;
25  import java.io.InputStream  ;
26  import java.util.regex.Matcher  ;
27  
28  import javax.management.AttributeNotFoundException  ;
29  
30  import org.archive.crawler.datamodel.CoreAttributeConstants;
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.settings.SimpleType;
33  import org.archive.crawler.settings.Type;
34  import org.archive.net.UURI;
35  import org.archive.util.TextUtils;
36  
37  /**
38   * A last ditch extractor that will look at the raw byte code and try to extract
39   * anything that <i>looks</i> like a link.
40   *
41   * If used, it should always be specified as the last link extractor in the
42   * order file.
43   * <p>
44   * To accomplish this it will scan through the bytecode and try and build up
45   * strings of consecutive bytes that all represent characters that are valid
46   * in a URL (see #isURLableChar(int) for details).
47   * Once it hits the end of such a string (i.e. finds a character that
48   * should not be in a URL) it will try to determine if it has found a URL.
49   * This is done by seeing if the string is an IP address prefixed with
50   * http(s):// or contains a dot followed by a Top Level Domain and end of
51   * string or a slash.
52   *
53   * @author Kristinn Sigurdsson
54   */
55  public class ExtractorUniversal extends Extractor
56  implements CoreAttributeConstants {
57  
58      private static final long serialVersionUID = -7593380118857156939L;
59  
60  //    private static final Logger logger =
61  //        Logger.getLogger(ExtractorUniversal.class.getName());
62      
63      private static String   ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";
64  
65      /** Default value for how far into an unknown document we should scan
66       * - 10k. A value of 0 or lower will disable this.
67       */
68      private static long DEFAULT_MAX_DEPTH_BYTES = 10240;
69  
70      private static String   ATTR_MAX_URL_LENGTH = "max-url-length";
71  
72      /** Maximum length for a URI that we try to match.*/
73      private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;
74  
75      /**
76       * Matches any string that begins with http:// or https:// followed by
77       * something that looks like an ip address (four numbers, none longer then
78       * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are
79       * each in the range 0-255.
80       */
81      static final String   IP_ADDRESS =
82          "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)";
83  
84      /**
85       * Matches a string that starts with a TLD (without the leading dot) and
86       * then either ends or continues with a '/' slash. Anything after the
87       * slash is of no consequence.
88       */
89      public static final String   TLDs =
90            "(ac(/.*)?)"  // ac  Ascension Island
91          + "|(ad(/.*)?)" // ad  Andorra
92          + "|(ae(/.*)?)" // ae  United Arab Emirates
93          + "|(af(/.*)?)" // af  Afghanistan
94          + "|(ag(/.*)?)" // ag  Antigua and Barbuda
95          + "|(ai(/.*)?)" // ai  Anguilla
96          + "|(al(/.*)?)" // al  Albania
97          + "|(am(/.*)?)" // am  Armenia
98          + "|(an(/.*)?)" // an  Netherlands Antilles
99          + "|(ao(/.*)?)" // ao  Angola
100         + "|(aero(/.*)?)" // aero Air-transport industry
101         + "|(aq(/.*)?)" // aq  Antarctica
102         + "|(ar(/.*)?)" // ar  Argentina
103         + "|(as(/.*)?)" // as  American Samoa
104         + "|(at(/.*)?)" // at  Austria
105         + "|(au(/.*)?)" // au  Australia
106         + "|(aw(/.*)?)" // aw  Aruba
107         + "|(az(/.*)?)" // az  Azerbaijan
108         + "|(ba(/.*)?)" // ba  Bosnia Hercegovina
109         + "|(bb(/.*)?)" // bb  Barbados
110         + "|(bd(/.*)?)" // bd  Bangladesh
111         + "|(be(/.*)?)" // be  Belgium
112         + "|(bf(/.*)?)" // bf  Burkina Faso
113         + "|(bg(/.*)?)" // bg  Bulgaria
114         + "|(bh(/.*)?)" // bh  Bahrain
115         + "|(bi(/.*)?)" // bi  Burundi
116         + "|(biz(/.*)?)" // biz Businesses
117         + "|(bj(/.*)?)" // bj  Benin
118         + "|(bm(/.*)?)" // bm  Bermuda
119         + "|(bn(/.*)?)" // bn  Brunei Darussalam
120         + "|(bo(/.*)?)" // bo  Bolivia
121         + "|(br(/.*)?)" // br  Brazil
122         + "|(bs(/.*)?)" // bs  Bahamas
123         + "|(bt(/.*)?)" // bt  Bhutan
124         + "|(bv(/.*)?)" // bv  Bouvet Island
125         + "|(bw(/.*)?)" // bw  Botswana
126         + "|(by(/.*)?)" // by  Belarus (Byelorussia)
127         + "|(bz(/.*)?)" // bz  Belize
128         + "|(ca(/.*)?)" // ca  Canada
129         + "|(cc(/.*)?)" // cc  Cocos Islands (Keeling)
130         + "|(cd(/.*)?)" // cd  Congo, Democratic Republic of the
131         + "|(cf(/.*)?)" // cf  Central African Republic
132         + "|(cg(/.*)?)" // cg  Congo, Republic of
133         + "|(ch(/.*)?)" // ch  Switzerland
134         + "|(ci(/.*)?)" // ci  Cote d'Ivoire (Ivory Coast)
135         + "|(ck(/.*)?)" // ck  Cook Islands
136         + "|(cl(/.*)?)" // cl  Chile
137         + "|(cm(/.*)?)" // cm  Cameroon
138         + "|(cn(/.*)?)" // cn  China
139         + "|(co(/.*)?)" // co  Colombia
140         + "|(com(/.*)?)" // com Commercial
141         + "|(coop(/.*)?)" // coop Cooperatives
142         + "|(cr(/.*)?)" // cr  Costa Rica
143         + "|(cs(/.*)?)" // cs  Czechoslovakia
144         + "|(cu(/.*)?)" // cu  Cuba
145         + "|(cv(/.*)?)" // cv  Cap Verde
146         + "|(cx(/.*)?)" // cx  Christmas Island
147         + "|(cy(/.*)?)" // cy  Cyprus
148         + "|(cz(/.*)?)" // cz  Czech Republic
149         + "|(de(/.*)?)" // de  Germany
150         + "|(dj(/.*)?)" // dj  Djibouti
151         + "|(dk(/.*)?)" // dk  Denmark
152         + "|(dm(/.*)?)" // dm  Dominica
153         + "|(do(/.*)?)" // do  Dominican Republic
154         + "|(dz(/.*)?)" // dz  Algeria
155         + "|(ec(/.*)?)" // ec  Ecuador
156         + "|(edu(/.*)?)" // edu Educational Institution
157         + "|(ee(/.*)?)" // ee  Estonia
158         + "|(eg(/.*)?)" // eg  Egypt
159         + "|(eh(/.*)?)" // eh  Western Sahara
160         + "|(er(/.*)?)" // er  Eritrea
161         + "|(es(/.*)?)" // es  Spain
162         + "|(et(/.*)?)" // et  Ethiopia
163         + "|(fi(/.*)?)" // fi  Finland
164         + "|(fj(/.*)?)" // fj  Fiji
165         + "|(fk(/.*)?)" // fk  Falkland Islands
166         + "|(fm(/.*)?)" // fm  Micronesia, Federal State of
167         + "|(fo(/.*)?)" // fo  Faroe Islands
168         + "|(fr(/.*)?)" // fr  France
169         + "|(ga(/.*)?)" // ga  Gabon
170         + "|(gd(/.*)?)" // gd  Grenada
171         + "|(ge(/.*)?)" // ge  Georgia
172         + "|(gf(/.*)?)" // gf  French Guiana
173         + "|(gg(/.*)?)" // gg  Guernsey
174         + "|(gh(/.*)?)" // gh  Ghana
175         + "|(gi(/.*)?)" // gi  Gibraltar
176         + "|(gl(/.*)?)" // gl  Greenland
177         + "|(gm(/.*)?)" // gm  Gambia
178         + "|(gn(/.*)?)" // gn  Guinea
179         + "|(gov(/.*)?)" // gov Government (US)
180         + "|(gp(/.*)?)" // gp  Guadeloupe
181         + "|(gq(/.*)?)" // gq  Equatorial Guinea
182         + "|(gr(/.*)?)" // gr  Greece
183         + "|(gs(/.*)?)" // gs  South Georgia and the South Sandwich Islands
184         + "|(gt(/.*)?)" // gt  Guatemala
185         + "|(gu(/.*)?)" // gu  Guam
186         + "|(gw(/.*)?)" // gw  Guinea-Bissau
187         + "|(gy(/.*)?)" // gy  Guyana
188         + "|(hk(/.*)?)" // hk  Hong Kong
189         + "|(hm(/.*)?)" // hm  Heard and McDonald Islands
190         + "|(hn(/.*)?)" // hn  Honduras
191         + "|(hr(/.*)?)" // hr  Croatia/Hrvatska
192         + "|(ht(/.*)?)" // ht  Haiti
193         + "|(hu(/.*)?)" // hu  Hungary
194         + "|(id(/.*)?)" // id  Indonesia
195         + "|(ie(/.*)?)" // ie  Ireland
196         + "|(il(/.*)?)" // il  Israel
197         + "|(im(/.*)?)" // im  Isle of Man
198         + "|(in(/.*)?)" // in  India
199         + "|(info(/.*)?)" // info
200         + "|(int(/.*)?)" // int Int. Organizations
201         + "|(io(/.*)?)" // io  British Indian Ocean Territory
202         + "|(iq(/.*)?)" // iq  Iraq
203         + "|(ir(/.*)?)" // ir  Iran, Islamic Republic of
204         + "|(is(/.*)?)" // is  Iceland
205         + "|(it(/.*)?)" // it  Italy
206         + "|(je(/.*)?)" // je  Jersey
207         + "|(jm(/.*)?)" // jm  Jamaica
208         + "|(jo(/.*)?)" // jo  Jordan
209         + "|(jp(/.*)?)" // jp  Japan
210         + "|(ke(/.*)?)" // ke  Kenya
211         + "|(kg(/.*)?)" // kg  Kyrgyzstan
212         + "|(kh(/.*)?)" // kh  Cambodia
213         + "|(ki(/.*)?)" // ki  Kiribati
214         + "|(km(/.*)?)" // km  Comoros
215         + "|(kn(/.*)?)" // kn  Saint Kitts and Nevis
216         + "|(kp(/.*)?)" // kp  Korea, Democratic People's Republic
217         + "|(kr(/.*)?)" // kr  Korea, Republic of
218         + "|(kw(/.*)?)" // kw  Kuwait
219         + "|(ky(/.*)?)" // ky  Cayman Islands
220         + "|(kz(/.*)?)" // kz  Kazakhstan
221         + "|(la(/.*)?)" // la  Lao People's Democratic Republic
222         + "|(lb(/.*)?)" // lb  Lebanon
223         + "|(lc(/.*)?)" // lc  Saint Lucia
224         + "|(li(/.*)?)" // li  Liechtenstein
225         + "|(lk(/.*)?)" // lk  Sri Lanka
226         + "|(lr(/.*)?)" // lr  Liberia
227         + "|(ls(/.*)?)" // ls  Lesotho
228         + "|(lt(/.*)?)" // lt  Lithuania
229         + "|(lu(/.*)?)" // lu  Luxembourg
230         + "|(lv(/.*)?)" // lv  Latvia
231         + "|(ly(/.*)?)" // ly  Libyan Arab Jamahiriya
232         + "|(ma(/.*)?)" // ma  Morocco
233         + "|(mc(/.*)?)" // mc  Monaco
234         + "|(md(/.*)?)" // md  Moldova, Republic of
235         + "|(mg(/.*)?)" // mg  Madagascar
236         + "|(mh(/.*)?)" // mh  Marshall Islands
237         + "|(mil(/.*)?)" // mil Military (US Dept of Defense)
238         + "|(mk(/.*)?)" // mk  Macedonia, Former Yugoslav Republic
239         + "|(ml(/.*)?)" // ml  Mali
240         + "|(mm(/.*)?)" // mm  Myanmar
241         + "|(mn(/.*)?)" // mn  Mongolia
242         + "|(mo(/.*)?)" // mo  Macau
243         + "|(mp(/.*)?)" // mp  Northern Mariana Islands
244         + "|(mq(/.*)?)" // mq  Martinique
245         + "|(mr(/.*)?)" // mr  Mauritani
246         + "|(ms(/.*)?)" // ms  Montserrat
247         + "|(mt(/.*)?)" // mt  Malta
248         + "|(mu(/.*)?)" // mu  Mauritius
249         + "|(museum(/.*)?)" // museum Museums
250         + "|(mv(/.*)?)" // mv  Maldives
251         + "|(mw(/.*)?)" // mw  Malawi
252         + "|(mx(/.*)?)" // mx  Mexico
253         + "|(my(/.*)?)" // my  Malaysia
254         + "|(mz(/.*)?)" // mz  Mozambique
255         + "|(na(/.*)?)" // na  Namibia
256         + "|(name(/.*)?)" // name Individuals
257         + "|(nc(/.*)?)" // nc  New Caledonia
258         + "|(ne(/.*)?)" // ne  Niger
259         + "|(net(/.*)?)" // net networks
260         + "|(nf(/.*)?)" // nf  Norfolk Island
261         + "|(ng(/.*)?)" // ng  Nigeria
262         + "|(ni(/.*)?)" // ni  Nicaragua
263         + "|(nl(/.*)?)" // nl  Netherlands
264         + "|(no(/.*)?)" // no  Norway
265         + "|(np(/.*)?)" // np  Nepal
266         + "|(nr(/.*)?)" // nr  Nauru
267         + "|(nt(/.*)?)" // nt  Neutral Zone
268         + "|(nu(/.*)?)" // nu  Niue
269         + "|(nz(/.*)?)" // nz  New Zealand
270         + "|(om(/.*)?)" // om  Oman
271         + "|(org(/.*)?)" // org Organization (non-profit)
272         + "|(pa(/.*)?)" // pa  Panama
273         + "|(pe(/.*)?)" // pe  Peru
274         + "|(pf(/.*)?)" // pf  French Polynesia
275         + "|(pg(/.*)?)" // pg  Papua New Guinea
276         + "|(ph(/.*)?)" // ph  Philippines
277         + "|(pk(/.*)?)" // pk  Pakistan
278         + "|(pl(/.*)?)" // pl  Poland
279         + "|(pm(/.*)?)" // pm  St. Pierre and Miquelon
280         + "|(pn(/.*)?)" // pn  Pitcairn Island
281         + "|(pr(/.*)?)" // pr  Puerto Rico
282         + "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians
283         + "|(ps(/.*)?)" // ps  Palestinian Territories
284         + "|(pt(/.*)?)" // pt  Portugal
285         + "|(pw(/.*)?)" // pw  Palau
286         + "|(py(/.*)?)" // py  Paraguay
287         + "|(qa(/.*)?)" // qa  Qatar
288         + "|(re(/.*)?)" // re  Reunion Island
289         + "|(ro(/.*)?)" // ro  Romania
290         + "|(ru(/.*)?)" // ru  Russian Federation
291         + "|(rw(/.*)?)" // rw  Rwanda
292         + "|(sa(/.*)?)" // sa  Saudi Arabia
293         + "|(sb(/.*)?)" // sb  Solomon Islands
294         + "|(sc(/.*)?)" // sc  Seychelles
295         + "|(sd(/.*)?)" // sd  Sudan
296         + "|(se(/.*)?)" // se  Sweden
297         + "|(sg(/.*)?)" // sg  Singapore
298         + "|(sh(/.*)?)" // sh  St. Helena
299         + "|(si(/.*)?)" // si  Slovenia
300         + "|(sj(/.*)?)" // sj  Svalbard and Jan Mayen Islands
301         + "|(sk(/.*)?)" // sk  Slovak Republic
302         + "|(sl(/.*)?)" // sl  Sierra Leone
303         + "|(sm(/.*)?)" // sm  San Marino
304         + "|(sn(/.*)?)" // sn  Senegal
305         + "|(so(/.*)?)" // so  Somalia
306         + "|(sr(/.*)?)" // sr  Suriname
307         + "|(sv(/.*)?)" // sv  El Salvador
308         + "|(st(/.*)?)" // st  Sao Tome and Principe
309         + "|(sy(/.*)?)" // sy  Syrian Arab Republic
310         + "|(sz(/.*)?)" // sz  Swaziland
311         + "|(tc(/.*)?)" // tc  Turks and Caicos Islands
312         + "|(td(/.*)?)" // td  Chad
313         + "|(tf(/.*)?)" // tf  French Southern Territories
314         + "|(tg(/.*)?)" // tg  Togo
315         + "|(th(/.*)?)" // th  Thailand
316         + "|(tj(/.*)?)" // tj  Tajikistan
317         + "|(tk(/.*)?)" // tk  Tokelau
318         + "|(tm(/.*)?)" // tm  Turkmenistan
319         + "|(tn(/.*)?)" // tn  Tunisia
320         + "|(to(/.*)?)" // to  Tonga
321         + "|(tp(/.*)?)" // tp  East Timor
322         + "|(tr(/.*)?)" // tr  Turkey
323         + "|(tt(/.*)?)" // tt  Trinidad and Tobago
324         + "|(tv(/.*)?)" // tv  Tuvalu
325         + "|(tw(/.*)?)" // tw  Taiwan
326         + "|(tz(/.*)?)" // tz  Tanzania
327         + "|(ua(/.*)?)" // ua  Ukraine
328         + "|(ug(/.*)?)" // ug  Uganda
329         + "|(uk(/.*)?)" // uk  United Kingdom
330         + "|(um(/.*)?)" // um  US Minor Outlying Islands
331         + "|(us(/.*)?)" // us  United States
332         + "|(uy(/.*)?)" // uy  Uruguay
333         + "|(uz(/.*)?)" // uz  Uzbekistan
334         + "|(va(/.*)?)" // va  Holy See (City Vatican State)
335         + "|(vc(/.*)?)" // vc  Saint Vincent and the Grenadines
336         + "|(ve(/.*)?)" // ve  Venezuela
337         + "|(vg(/.*)?)" // vg  Virgin Islands (British)
338         + "|(vi(/.*)?)" // vi  Virgin Islands (USA)
339         + "|(vn(/.*)?)" // vn  Vietnam
340         + "|(vu(/.*)?)" // vu  Vanuatu
341         + "|(wf(/.*)?)" // wf  Wallis and Futuna Islands
342         + "|(ws(/.*)?)" // ws  Western Samoa
343         + "|(ye(/.*)?)" // ye  Yemen
344         + "|(yt(/.*)?)" // yt  Mayotte
345         + "|(yu(/.*)?)" // yu  Yugoslavia
346         + "|(za(/.*)?)" // za  South Africa
347         + "|(zm(/.*)?)" // zm  Zambia
348         + "|(zw(/.*)?)" // zw  Zimbabwe
349         ;
350 
351     protected long numberOfCURIsHandled = 0;
352     protected long numberOfLinksExtracted= 0;
353 
354     /**
355      * Constructor
356      * @param name The name of the module.
357      */
358     public ExtractorUniversal(String   name) {
359         super(name, "Link extraction on unknown file types. A best effort" +
360                 " extractor that looks at the raw byte code of any file " +
361                 "that has not been handled by another extractor and tries" +
362                 " to find URIs. Will only match absolute URIs.");
363         Type e;
364         e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES,
365             "How deep to look into files for URI strings, in bytes",
366             new Long  (DEFAULT_MAX_DEPTH_BYTES)));
367         e.setExpertSetting(true);
368         e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
369             "Max length of URIs in bytes", new Long  (DEFAULT_MAX_URL_LENGTH)));
370         e.setExpertSetting(true);
371     }
372 
373     protected void extract(CrawlURI curi) {
374         if (!isHttpTransactionContentToProcess(curi)) {
375             return;
376         }
377 
378         numberOfCURIsHandled++;
379 
380         try {
381             InputStream   instream = curi.getHttpRecorder().getRecordedInput().
382                 getContentReplayInputStream();
383             int ch = instream.read();
384             StringBuffer   lookat = new StringBuffer  ();
385             long counter = 0;
386             long maxdepth = ((Long  )getAttribute(ATTR_MAX_DEPTH_BYTES,curi)).
387                 longValue();
388             if(maxdepth<=0){
389                 maxdepth = Long.MAX_VALUE;
390             }
391             long maxURLLength = ((Long  )getAttribute(ATTR_MAX_URL_LENGTH,curi)).
392                 longValue();
393             boolean foundDot = false;
394             while(ch != -1 && ++counter <= maxdepth) {
395                 if(lookat.length()>maxURLLength){
396                     //Exceeded maximum length of a URL. Start fresh.
397                     lookat = new StringBuffer  ();
398                     foundDot = false;
399                 }
400                 else if(isURLableChar(ch)){
401                     //Add to buffer.
402                     if(ch == 46){
403                         // Current character is a dot '.'
404                         foundDot = true;
405                     }
406                     lookat.append((char)ch);
407                 } else if(lookat.length() > 3 && foundDot) {
408                     // It takes a bare mininum of 4 characters to form a URL
409                     // Since we have at least that many let's try link
410                     // extraction.
411                     String   newURL = lookat.toString();
412                     if(looksLikeAnURL(newURL))
413                     {
414                         // Looks like we found something.
415 
416                         // Let's start with a little cleanup as we may have
417                         // junk in front or at the end.
418                         if(newURL.toLowerCase().indexOf("http") > 0){
419                             // Got garbage in front of the protocol. Remove.
420                             newURL = newURL.substring(newURL.toLowerCase().
421                                 indexOf("http"));
422                         }
423                         while(newURL.substring(newURL.length()-1).equals("."))
424                         {
425                             // URLs can't end with a dot. Strip it off.
426                             newURL = newURL.substring(0,newURL.length()-1);
427                         }
428 
429                         // And add the URL to speculative embeds.
430                         numberOfLinksExtracted++;
431                         curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP);
432                     }
433                     // Reset lookat for next string.
434                     lookat = new StringBuffer  ();
435                     foundDot = false;
436                 } else if(lookat.length()>0) {
437                     // Didn't get enough chars. Reset lookat for next string.
438                     lookat = new StringBuffer  ();
439                     foundDot = false;
440                 }
441                 ch = instream.read();
442             }
443         } catch(IOException   e){
444             //TODO: Handle this exception.
445             e.printStackTrace();
446         } catch (AttributeNotFoundException   e) {
447             // TODO Auto-generated catch block
448             e.printStackTrace();
449         }
450         // Set flag to indicate that link extraction is completed.
451         curi.linkExtractorFinished();
452     }
453 
454     /**
455      * This method takes a look at a string and determines if it could be a URL.
456      * To qualify the string must either begin with "http://" (https would also
457      * work) followed by something that looks like an IP address or contain
458      * within the string (possible at the end but not at the beginning) a TLD
459      * (Top Level Domain) preceded by a dot.
460      *
461      * @param lookat The string to examine in an effort to determine if it
462      * could be a URL
463      * @return True if the string matches the above criteria for a URL.
464      */
465     private boolean looksLikeAnURL(String   lookat) {
466         if(lookat.indexOf("http://")==0 || lookat.indexOf("https://")==0){
467             //Check if the rest of the string looks like an IP address.
468             //if so return true. Otherwise continue on.
469             Matcher   ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
470             boolean testVal = ip.matches();
471             TextUtils.recycleMatcher(ip);
472             if(testVal){
473                 return true;
474             }
475         }
476 
477         int dot = lookat.indexOf(".");
478         if(dot!=0){//An URL can't start with a .tld.
479             while(dot != -1 && dot < lookat.length()){
480                 lookat = lookat.substring(dot+1);
481                 if (isTLD(lookat.substring(0, lookat.length() <= 6?
482                     lookat.length(): 6)))
483                 {
484                     return true;
485                 }
486                 dot = lookat.indexOf(".");
487             }
488         }
489 
490         return false;
491     }
492 
493     /**
494      * Checks if a string is equal to known Top Level Domain. The string may
495      * contain additional characters <i>after</i> the TLD but not before.
496      * @param potentialTLD The string (usually 2-6 chars) to check if it starts
497      * with a TLD.
498      * @return True if the given string starts with the name of a known TLD
499      *
500      * @see #TLDs
501      */
502     private boolean isTLD(String   potentialTLD) {
503         if(potentialTLD.length()<2){
504             return false;
505         }
506 
507         potentialTLD.toLowerCase();
508         Matcher   uri = TextUtils.getMatcher(TLDs, potentialTLD);
509         boolean ret = uri.matches();
510         TextUtils.recycleMatcher(uri);
511         return ret;
512     }
513 
514     /**
515      * Determines if a char (as represented by an int in the range of 0-255) is
516      * a character (in the Ansi character set) that can be present in a URL.
517      * This method takes a <b>strict</b> approach to what characters can be in
518      * a URL.
519      * <p>
520      * The following are considered to be 'URLable'<br>
521      * <ul>
522      *  <li> <code># $ % & + , - . /</code> values 35-38,43-47
523      *  <li> <code>[0-9]</code> values 48-57
524      *  <li> <code>: ; = ? @</code> value 58-59,61,63-64
525      *  <li> <code>[A-Z]</code> values 65-90
526      *  <li> <code>_</code> value 95
527      *  <li> <code>[a-z]</code> values 97-122
528      *  <li> <code>~</code> value 126
529      * </ul>
530      * <p>
531      * To summerize, the following ranges are considered URLable:<br>
532      * 35-38,43-59,61,63-90,95,97-122,126
533      *
534      * @param ch The character (represented by an int) to test.
535      * @return True if it is a URLable character, false otherwise.
536      */
537     private boolean isURLableChar(int ch) {
538         return (ch>=35 && ch<=38)
539             || (ch>=43 && ch<=59)
540             || (ch==61)
541             || (ch>=63 && ch<=90)
542             || (ch==95)
543             || (ch>=97 && ch<=122)
544             || (ch==126);
545     }
546 
547     /* (non-Javadoc)
548      * @see org.archive.crawler.framework.Processor#report()
549      */
550     public String   report() {
551         StringBuffer   ret = new StringBuffer  ();
552         ret.append("Processor: org.archive.crawler.extractor." +
553             "ExtractorUniversal\n");
554         ret.append("  Function:          Link extraction on unknown file" +
555             " types.\n");
556         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
557         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
558 
559         return ret.toString();
560     }
561 }
562
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags