KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ExtractorUniversal


/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jan 15, 2004
 *
 */

22 package org.archive.crawler.extractor;
23
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;
import org.archive.util.TextUtils;
36
37 /**
38  * A last ditch extractor that will look at the raw byte code and try to extract
39  * anything that <i>looks</i> like a link.
40  *
41  * If used, it should always be specified as the last link extractor in the
42  * order file.
43  * <p>
44  * To accomplish this it will scan through the bytecode and try and build up
45  * strings of consecutive bytes that all represent characters that are valid
46  * in a URL (see #isURLableChar(int) for details).
47  * Once it hits the end of such a string (i.e. finds a character that
48  * should not be in a URL) it will try to determine if it has found a URL.
49  * This is done be seeing if the string is an IP address prefixed with
50  * http(s):// or contains a dot followed by a Top Level Domain and end of
51  * string or a slash.
52  *
53  * @author Kristinn Sigurdsson
54  */

55 public class ExtractorUniversal extends Extractor
56 implements CoreAttributeConstants {
57
58     private static final long serialVersionUID = -7593380118857156939L;
59
60 // private static final Logger logger =
61
// Logger.getLogger(ExtractorUniversal.class.getName());
62

63     private static String JavaDoc ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";
64
65     /** Default value for how far into an unknown document we should scan
66      * - 10k. A value of 0 or lower will disable this.
67      */

68     private static long DEFAULT_MAX_DEPTH_BYTES = 10240;
69
70     private static String JavaDoc ATTR_MAX_URL_LENGTH = "max-url-length";
71
72     /** Maximum length for a URI that we try to match.*/
73     private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;
74
75     /**
76      * Matches any string that begins with http:// or https:// followed by
77      * something that looks like an ip address (four numbers, none longer then
78      * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are
79      * each in the range 0-255.
80      */

81     static final String JavaDoc IP_ADDRESS =
82         "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)";
83
84     /**
85      * Matches any string that begins with a TLD (no .) followed by a '/' slash
86      * or end of string. If followed by slash then nothing after the slash is
87      * of consequence.
88      */

89     public static final String JavaDoc TLDs =
90           "(ac(/.*)?)" // ac Ascension Island
91
+ "|(ad(/.*)?)" // ad Andorra
92
+ "|(ae(/.*)?)" // ae United Arab Emirates
93
+ "|(af(/.*)?)" // af Afghanistan
94
+ "|(ag(/.*)?)" // ag Antigua and Barbuda
95
+ "|(ai(/.*)?)" // ai Anguilla
96
+ "|(al(/.*)?)" // al Albania
97
+ "|(am(/.*)?)" // am Armenia
98
+ "|(an(/.*)?)" // an Netherlands Antilles
99
+ "|(ao(/.*)?)" // ao Angola
100
+ "|(aero(/.*)?)" // aero Air-transport industry
101
+ "|(aq(/.*)?)" // aq Antarctica
102
+ "|(ar(/.*)?)" // ar Argentina
103
+ "|(as(/.*)?)" // as American Samoa
104
+ "|(at(/.*)?)" // at Austria
105
+ "|(au(/.*)?)" // au Australia
106
+ "|(aw(/.*)?)" // aw Aruba
107
+ "|(az(/.*)?)" // az Azerbaijan
108
+ "|(ba(/.*)?)" // ba Bosnia Hercegovina
109
+ "|(bb(/.*)?)" // bb Barbados
110
+ "|(bd(/.*)?)" // bd Bangladesh
111
+ "|(be(/.*)?)" // be Belgium
112
+ "|(bf(/.*)?)" // bf Burkina Faso
113
+ "|(bg(/.*)?)" // bg Bulgaria
114
+ "|(bh(/.*)?)" // bh Bahrain
115
+ "|(bi(/.*)?)" // bi Burundi
116
+ "|(biz(/.*)?)" // biz Businesses
117
+ "|(bj(/.*)?)" // bj Benin
118
+ "|(bm(/.*)?)" // bm Bermuda
119
+ "|(bn(/.*)?)" // bn Brunei Darussalam
120
+ "|(bo(/.*)?)" // bo Bolivia
121
+ "|(br(/.*)?)" // br Brazil
122
+ "|(bs(/.*)?)" // bs Bahamas
123
+ "|(bt(/.*)?)" // bt Bhutan
124
+ "|(bv(/.*)?)" // bv Bouvet Island
125
+ "|(bw(/.*)?)" // bw Botswana
126
+ "|(by(/.*)?)" // by Belarus (Byelorussia)
127
+ "|(bz(/.*)?)" // bz Belize
128
+ "|(ca(/.*)?)" // ca Canada
129
+ "|(cc(/.*)?)" // cc Cocos Islands (Keeling)
130
+ "|(cd(/.*)?)" // cd Congo, Democratic Republic of the
131
+ "|(cf(/.*)?)" // cf Central African Republic
132
+ "|(cg(/.*)?)" // cg Congo, Republic of
133
+ "|(ch(/.*)?)" // ch Switzerland
134
+ "|(ci(/.*)?)" // ci Cote d'Ivoire (Ivory Coast)
135
+ "|(ck(/.*)?)" // ck Cook Islands
136
+ "|(cl(/.*)?)" // cl Chile
137
+ "|(cm(/.*)?)" // cm Cameroon
138
+ "|(cn(/.*)?)" // cn China
139
+ "|(co(/.*)?)" // co Colombia
140
+ "|(com(/.*)?)" // com Commercial
141
+ "|(coop(/.*)?)" // coop Cooperatives
142
+ "|(cr(/.*)?)" // cr Costa Rica
143
+ "|(cs(/.*)?)" // cs Czechoslovakia
144
+ "|(cu(/.*)?)" // cu Cuba
145
+ "|(cv(/.*)?)" // cv Cap Verde
146
+ "|(cx(/.*)?)" // cx Christmas Island
147
+ "|(cy(/.*)?)" // cy Cyprus
148
+ "|(cz(/.*)?)" // cz Czech Republic
149
+ "|(de(/.*)?)" // de Germany
150
+ "|(dj(/.*)?)" // dj Djibouti
151
+ "|(dk(/.*)?)" // dk Denmark
152
+ "|(dm(/.*)?)" // dm Dominica
153
+ "|(do(/.*)?)" // do Dominican Republic
154
+ "|(dz(/.*)?)" // dz Algeria
155
+ "|(ec(/.*)?)" // ec Ecuador
156
+ "|(edu(/.*)?)" // edu Educational Institution
157
+ "|(ee(/.*)?)" // ee Estonia
158
+ "|(eg(/.*)?)" // eg Egypt
159
+ "|(eh(/.*)?)" // eh Western Sahara
160
+ "|(er(/.*)?)" // er Eritrea
161
+ "|(es(/.*)?)" // es Spain
162
+ "|(et(/.*)?)" // et Ethiopia
163
+ "|(fi(/.*)?)" // fi Finland
164
+ "|(fj(/.*)?)" // fj Fiji
165
+ "|(fk(/.*)?)" // fk Falkland Islands
166
+ "|(fm(/.*)?)" // fm Micronesia, Federal State of
167
+ "|(fo(/.*)?)" // fo Faroe Islands
168
+ "|(fr(/.*)?)" // fr France
169
+ "|(ga(/.*)?)" // ga Gabon
170
+ "|(gd(/.*)?)" // gd Grenada
171
+ "|(ge(/.*)?)" // ge Georgia
172
+ "|(gf(/.*)?)" // gf French Guiana
173
+ "|(gg(/.*)?)" // gg Guernsey
174
+ "|(gh(/.*)?)" // gh Ghana
175
+ "|(gi(/.*)?)" // gi Gibraltar
176
+ "|(gl(/.*)?)" // gl Greenland
177
+ "|(gm(/.*)?)" // gm Gambia
178
+ "|(gn(/.*)?)" // gn Guinea
179
+ "|(gov(/.*)?)" // gov Government (US)
180
+ "|(gp(/.*)?)" // gp Guadeloupe
181
+ "|(gq(/.*)?)" // gq Equatorial Guinea
182
+ "|(gr(/.*)?)" // gr Greece
183
+ "|(gs(/.*)?)" // gs South Georgia and the South Sandwich Islands
184
+ "|(gt(/.*)?)" // gt Guatemala
185
+ "|(gu(/.*)?)" // gu Guam
186
+ "|(gw(/.*)?)" // gw Guinea-Bissau
187
+ "|(gy(/.*)?)" // gy Guyana
188
+ "|(hk(/.*)?)" // hk Hong Kong
189
+ "|(hm(/.*)?)" // hm Heard and McDonald Islands
190
+ "|(hn(/.*)?)" // hn Honduras
191
+ "|(hr(/.*)?)" // hr Croatia/Hrvatska
192
+ "|(ht(/.*)?)" // ht Haiti
193
+ "|(hu(/.*)?)" // hu Hungary
194
+ "|(id(/.*)?)" // id Indonesia
195
+ "|(ie(/.*)?)" // ie Ireland
196
+ "|(il(/.*)?)" // il Israel
197
+ "|(im(/.*)?)" // im Isle of Man
198
+ "|(in(/.*)?)" // in India
199
+ "|(info(/.*)?)" // info
200
+ "|(int(/.*)?)" // int Int. Organizations
201
+ "|(io(/.*)?)" // io British Indian Ocean Territory
202
+ "|(iq(/.*)?)" // iq Iraq
203
+ "|(ir(/.*)?)" // ir Iran, Islamic Republic of
204
+ "|(is(/.*)?)" // is Iceland
205
+ "|(it(/.*)?)" // it Italy
206
+ "|(je(/.*)?)" // je Jersey
207
+ "|(jm(/.*)?)" // jm Jamaica
208
+ "|(jo(/.*)?)" // jo Jordan
209
+ "|(jp(/.*)?)" // jp Japan
210
+ "|(ke(/.*)?)" // ke Kenya
211
+ "|(kg(/.*)?)" // kg Kyrgyzstan
212
+ "|(kh(/.*)?)" // kh Cambodia
213
+ "|(ki(/.*)?)" // ki Kiribati
214
+ "|(km(/.*)?)" // km Comoros
215
+ "|(kn(/.*)?)" // kn Saint Kitts and Nevis
216
+ "|(kp(/.*)?)" // kp Korea, Democratic People's Republic
217
+ "|(kr(/.*)?)" // kr Korea, Republic of
218
+ "|(kw(/.*)?)" // kw Kuwait
219
+ "|(ky(/.*)?)" // ky Cayman Islands
220
+ "|(kz(/.*)?)" // kz Kazakhstan
221
+ "|(la(/.*)?)" // la Lao People's Democratic Republic
222
+ "|(lb(/.*)?)" // lb Lebanon
223
+ "|(lc(/.*)?)" // lc Saint Lucia
224
+ "|(li(/.*)?)" // li Liechtenstein
225
+ "|(lk(/.*)?)" // lk Sri Lanka
226
+ "|(lr(/.*)?)" // lr Liberia
227
+ "|(ls(/.*)?)" // ls Lesotho
228
+ "|(lt(/.*)?)" // lt Lithuania
229
+ "|(lu(/.*)?)" // lu Luxembourg
230
+ "|(lv(/.*)?)" // lv Latvia
231
+ "|(ly(/.*)?)" // ly Libyan Arab Jamahiriya
232
+ "|(ma(/.*)?)" // ma Morocco
233
+ "|(mc(/.*)?)" // mc Monaco
234
+ "|(md(/.*)?)" // md Moldova, Republic of
235
+ "|(mg(/.*)?)" // mg Madagascar
236
+ "|(mh(/.*)?)" // mh Marshall Islands
237
+ "|(mil(/.*)?)" // mil Military (US Dept of Defense)
238
+ "|(mk(/.*)?)" // mk Macedonia, Former Yugoslav Republic
239
+ "|(ml(/.*)?)" // ml Mali
240
+ "|(mm(/.*)?)" // mm Myanmar
241
+ "|(mn(/.*)?)" // mn Mongolia
242
+ "|(mo(/.*)?)" // mo Macau
243
+ "|(mp(/.*)?)" // mp Northern Mariana Islands
244
+ "|(mq(/.*)?)" // mq Martinique
245
+ "|(mr(/.*)?)" // mr Mauritani
246
+ "|(ms(/.*)?)" // ms Montserrat
247
+ "|(mt(/.*)?)" // mt Malta
248
+ "|(mu(/.*)?)" // mu Mauritius
249
+ "|(museum(/.*)?)" // museum Museums
250
+ "|(mv(/.*)?)" // mv Maldives
251
+ "|(mw(/.*)?)" // mw Malawi
252
+ "|(mx(/.*)?)" // mx Mexico
253
+ "|(my(/.*)?)" // my Malaysia
254
+ "|(mz(/.*)?)" // mz Mozambique
255
+ "|(na(/.*)?)" // na Namibia
256
+ "|(name(/.*)?)" // name Individuals
257
+ "|(nc(/.*)?)" // nc New Caledonia
258
+ "|(ne(/.*)?)" // ne Niger
259
+ "|(net(/.*)?)" // net networks
260
+ "|(nf(/.*)?)" // nf Norfolk Island
261
+ "|(ng(/.*)?)" // ng Nigeria
262
+ "|(ni(/.*)?)" // ni Nicaragua
263
+ "|(nl(/.*)?)" // nl Netherlands
264
+ "|(no(/.*)?)" // no Norway
265
+ "|(np(/.*)?)" // np Nepal
266
+ "|(nr(/.*)?)" // nr Nauru
267
+ "|(nt(/.*)?)" // nt Neutral Zone
268
+ "|(nu(/.*)?)" // nu Niue
269
+ "|(nz(/.*)?)" // nz New Zealand
270
+ "|(om(/.*)?)" // om Oman
271
+ "|(org(/.*)?)" // org Organization (non-profit)
272
+ "|(pa(/.*)?)" // pa Panama
273
+ "|(pe(/.*)?)" // pe Peru
274
+ "|(pf(/.*)?)" // pf French Polynesia
275
+ "|(pg(/.*)?)" // pg Papua New Guinea
276
+ "|(ph(/.*)?)" // ph Philippines
277
+ "|(pk(/.*)?)" // pk Pakistan
278
+ "|(pl(/.*)?)" // pl Poland
279
+ "|(pm(/.*)?)" // pm St. Pierre and Miquelon
280
+ "|(pn(/.*)?)" // pn Pitcairn Island
281
+ "|(pr(/.*)?)" // pr Puerto Rico
282
+ "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians
283
+ "|(ps(/.*)?)" // ps Palestinian Territories
284
+ "|(pt(/.*)?)" // pt Portugal
285
+ "|(pw(/.*)?)" // pw Palau
286
+ "|(py(/.*)?)" // py Paraguay
287
+ "|(qa(/.*)?)" // qa Qatar
288
+ "|(re(/.*)?)" // re Reunion Island
289
+ "|(ro(/.*)?)" // ro Romania
290
+ "|(ru(/.*)?)" // ru Russian Federation
291
+ "|(rw(/.*)?)" // rw Rwanda
292
+ "|(sa(/.*)?)" // sa Saudi Arabia
293
+ "|(sb(/.*)?)" // sb Solomon Islands
294
+ "|(sc(/.*)?)" // sc Seychelles
295
+ "|(sd(/.*)?)" // sd Sudan
296
+ "|(se(/.*)?)" // se Sweden
297
+ "|(sg(/.*)?)" // sg Singapore
298
+ "|(sh(/.*)?)" // sh St. Helena
299
+ "|(si(/.*)?)" // si Slovenia
300
+ "|(sj(/.*)?)" // sj Svalbard and Jan Mayen Islands
301
+ "|(sk(/.*)?)" // sk Slovak Republic
302
+ "|(sl(/.*)?)" // sl Sierra Leone
303
+ "|(sm(/.*)?)" // sm San Marino
304
+ "|(sn(/.*)?)" // sn Senegal
305
+ "|(so(/.*)?)" // so Somalia
306
+ "|(sr(/.*)?)" // sr Suriname
307
+ "|(sv(/.*)?)" // sv El Salvador
308
+ "|(st(/.*)?)" // st Sao Tome and Principe
309
+ "|(sy(/.*)?)" // sy Syrian Arab Republic
310
+ "|(sz(/.*)?)" // sz Swaziland
311
+ "|(tc(/.*)?)" // tc Turks and Caicos Islands
312
+ "|(td(/.*)?)" // td Chad
313
+ "|(tf(/.*)?)" // tf French Southern Territories
314
+ "|(tg(/.*)?)" // tg Togo
315
+ "|(th(/.*)?)" // th Thailand
316
+ "|(tj(/.*)?)" // tj Tajikistan
317
+ "|(tk(/.*)?)" // tk Tokelau
318
+ "|(tm(/.*)?)" // tm Turkmenistan
319
+ "|(tn(/.*)?)" // tn Tunisia
320
+ "|(to(/.*)?)" // to Tonga
321
+ "|(tp(/.*)?)" // tp East Timor
322
+ "|(tr(/.*)?)" // tr Turkey
323
+ "|(tt(/.*)?)" // tt Trinidad and Tobago
324
+ "|(tv(/.*)?)" // tv Tuvalu
325
+ "|(tw(/.*)?)" // tw Taiwan
326
+ "|(tz(/.*)?)" // tz Tanzania
327
+ "|(ua(/.*)?)" // ua Ukraine
328
+ "|(ug(/.*)?)" // ug Uganda
329
+ "|(uk(/.*)?)" // uk United Kingdom
330
+ "|(um(/.*)?)" // um US Minor Outlying Islands
331
+ "|(us(/.*)?)" // us United States
332
+ "|(uy(/.*)?)" // uy Uruguay
333
+ "|(uz(/.*)?)" // uz Uzbekistan
334
+ "|(va(/.*)?)" // va Holy See (City Vatican State)
335
+ "|(vc(/.*)?)" // vc Saint Vincent and the Grenadines
336
+ "|(ve(/.*)?)" // ve Venezuela
337
+ "|(vg(/.*)?)" // vg Virgin Islands (British)
338
+ "|(vi(/.*)?)" // vi Virgin Islands (USA)
339
+ "|(vn(/.*)?)" // vn Vietnam
340
+ "|(vu(/.*)?)" // vu Vanuatu
341
+ "|(wf(/.*)?)" // wf Wallis and Futuna Islands
342
+ "|(ws(/.*)?)" // ws Western Samoa
343
+ "|(ye(/.*)?)" // ye Yemen
344
+ "|(yt(/.*)?)" // yt Mayotte
345
+ "|(yu(/.*)?)" // yu Yugoslavia
346
+ "|(za(/.*)?)" // za South Africa
347
+ "|(zm(/.*)?)" // zm Zambia
348
+ "|(zw(/.*)?)" // zw Zimbabwe
349
;
350
351     protected long numberOfCURIsHandled = 0;
352     protected long numberOfLinksExtracted= 0;
353
354     /**
355      * Constructor
356      * @param name The name of the module.
357      */

358     public ExtractorUniversal(String JavaDoc name) {
359         super(name, "Link extraction on unknown file types. A best effort" +
360                 " extractor that looks at the raw byte code of any file " +
361                 "that has not been handled by another extractor and tries" +
362                 " to find URIs. Will only match absolute URIs.");
363         Type e;
364         e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES,
365             "How deep to look into files for URI strings, in bytes",
366             new Long JavaDoc(DEFAULT_MAX_DEPTH_BYTES)));
367         e.setExpertSetting(true);
368         e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
369             "Max length of URIs in bytes", new Long JavaDoc(DEFAULT_MAX_URL_LENGTH)));
370         e.setExpertSetting(true);
371     }
372
373     protected void extract(CrawlURI curi) {
374         if (!isHttpTransactionContentToProcess(curi)) {
375             return;
376         }
377
378         numberOfCURIsHandled++;
379
380         try {
381             InputStream JavaDoc instream = curi.getHttpRecorder().getRecordedInput().
382                 getContentReplayInputStream();
383             int ch = instream.read();
384             StringBuffer JavaDoc lookat = new StringBuffer JavaDoc();
385             long counter = 0;
386             long maxdepth = ((Long JavaDoc)getAttribute(ATTR_MAX_DEPTH_BYTES,curi)).
387                 longValue();
388             if(maxdepth<=0){
389                 maxdepth = Long.MAX_VALUE;
390             }
391             long maxURLLength = ((Long JavaDoc)getAttribute(ATTR_MAX_URL_LENGTH,curi)).
392                 longValue();
393             boolean foundDot = false;
394             while(ch != -1 && ++counter <= maxdepth) {
395                 if(lookat.length()>maxURLLength){
396                     //Exceeded maximum length of a URL. Start fresh.
397
lookat = new StringBuffer JavaDoc();
398                     foundDot = false;
399                 }
400                 else if(isURLableChar(ch)){
401                     //Add to buffer.
402
if(ch == 46){
403                         // Current character is a dot '.'
404
foundDot = true;
405                     }
406                     lookat.append((char)ch);
407                 } else if(lookat.length() > 3 && foundDot) {
408                     // It takes a bare mininum of 4 characters to form a URL
409
// Since we have at least that many let's try link
410
// extraction.
411
String JavaDoc newURL = lookat.toString();
412                     if(looksLikeAnURL(newURL))
413                     {
414                         // Looks like we found something.
415

416                         // Let's start with a little cleanup as we may have
417
// junk in front or at the end.
418
if(newURL.toLowerCase().indexOf("http") > 0){
419                             // Got garbage in front of the protocol. Remove.
420
newURL = newURL.substring(newURL.toLowerCase().
421                                 indexOf("http"));
422                         }
423                         while(newURL.substring(newURL.length()-1).equals("."))
424                         {
425                             // URLs can't end with a dot. Strip it off.
426
newURL = newURL.substring(0,newURL.length()-1);
427                         }
428
429                         // And add the URL to speculative embeds.
430
numberOfLinksExtracted++;
431                         curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP);
432                     }
433                     // Reset lookat for next string.
434
lookat = new StringBuffer JavaDoc();
435                     foundDot = false;
436                 } else if(lookat.length()>0) {
437                     // Didn't get enough chars. Reset lookat for next string.
438
lookat = new StringBuffer JavaDoc();
439                     foundDot = false;
440                 }
441                 ch = instream.read();
442             }
443         } catch(IOException JavaDoc e){
444             //TODO: Handle this exception.
445
e.printStackTrace();
446         } catch (AttributeNotFoundException JavaDoc e) {
447             // TODO Auto-generated catch block
448
e.printStackTrace();
449         }
450         // Set flag to indicate that link extraction is completed.
451
curi.linkExtractorFinished();
452     }
453
454     /**
455      * This method takes a look at a string and determines if it could be a URL.
456      * To qualify the string must either begin with "http://" (https would also
457      * work) followed by something that looks like an IP address or contain
458      * within the string (possible at the end but not at the beginning) a TLD
459      * (Top Level Domain) preceded by a dot.
460      *
461      * @param lookat The string to examine in an effort to determine if it
462      * could be a URL
463      * @return True if the string matches the above criteria for a URL.
464      */

465     private boolean looksLikeAnURL(String JavaDoc lookat) {
466         if(lookat.indexOf("http://")==0 || lookat.indexOf("https://")==0){
467             //Check if the rest of the string looks like an IP address.
468
//if so return true. Otherwise continue on.
469
Matcher JavaDoc ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
470             boolean testVal = ip.matches();
471             TextUtils.recycleMatcher(ip);
472             if(testVal){
473                 return true;
474             }
475         }
476
477         int dot = lookat.indexOf(".");
478         if(dot!=0){//An URL can't start with a .tld.
479
while(dot != -1 && dot < lookat.length()){
480                 lookat = lookat.substring(dot+1);
481                 if (isTLD(lookat.substring(0, lookat.length() <= 6?
482                     lookat.length(): 6)))
483                 {
484                     return true;
485                 }
486                 dot = lookat.indexOf(".");
487             }
488         }
489
490         return false;
491     }
492
493     /**
494      * Checks if a string is equal to known Top Level Domain. The string may
495      * contain additional characters <i>after</i> the TLD but not before.
496      * @param potentialTLD The string (usually 2-6 chars) to check if it starts
497      * with a TLD.
498      * @return True if the given string starts with the name of a known TLD
499      *
500      * @see #TLDs
501      */

502     private boolean isTLD(String JavaDoc potentialTLD) {
503         if(potentialTLD.length()<2){
504             return false;
505         }
506
507         potentialTLD.toLowerCase();
508         Matcher JavaDoc uri = TextUtils.getMatcher(TLDs, potentialTLD);
509         boolean ret = uri.matches();
510         TextUtils.recycleMatcher(uri);
511         return ret;
512     }
513
514     /**
515      * Determines if a char (as represented by an int in the range of 0-255) is
516      * a character (in the Ansi character set) that can be present in a URL.
517      * This method takes a <b>strict</b> approach to what characters can be in
518      * a URL.
519      * <p>
520      * The following are considered to be 'URLable'<br>
521      * <ul>
522      * <li> <code># $ % & + , - . /</code> values 35-38,43-47
523      * <li> <code>[0-9]</code> values 48-57
524      * <li> <code>: ; = ? @</code> value 58-59,61,63-64
525      * <li> <code>[A-Z]</code> values 65-90
526      * <li> <code>_</code> value 95
527      * <li> <code>[a-z]</code> values 97-122
528      * <li> <code>~</code> value 126
529      * </ul>
530      * <p>
531      * To summerize, the following ranges are considered URLable:<br>
532      * 35-38,43-59,61,63-90,95,97-122,126
533      *
534      * @param ch The character (represented by an int) to test.
535      * @return True if it is a URLable character, false otherwise.
536      */

537     private boolean isURLableChar(int ch) {
538         return (ch>=35 && ch<=38)
539             || (ch>=43 && ch<=59)
540             || (ch==61)
541             || (ch>=63 && ch<=90)
542             || (ch==95)
543             || (ch>=97 && ch<=122)
544             || (ch==126);
545     }
546
547     /* (non-Javadoc)
548      * @see org.archive.crawler.framework.Processor#report()
549      */

550     public String JavaDoc report() {
551         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
552         ret.append("Processor: org.archive.crawler.extractor." +
553             "ExtractorUniversal\n");
554         ret.append(" Function: Link extraction on unknown file" +
555             " types.\n");
556         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
557         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
558
559         return ret.toString();
560     }
561 }
562
Popular Tags