// Listing captured from KickJava ("Java API By Example, From Geeks To Geeks"):
// Java > Open Source Codes > net > sf > jftp > tools > FileSearch

/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

package net.sf.jftp.tools;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.Vector;

import net.sf.jftp.system.LocalIO;
import net.sf.jftp.system.logging.Log;


37 public class FileSearch
38 {
39
40     private int currentDepth = 0;
41     private Hashtable JavaDoc checked = new Hashtable JavaDoc();
42     public static boolean quiet = true;
43     public static boolean ultraquiet = false;
44     
45     String JavaDoc localDir = ".";
46     int MAX = 999999;
47     int MIN_TERM = 1;
48     int MIN_FACTOR = 1;
49     boolean LOAD = false;
50     String JavaDoc[] typeArray = { "" };
51     String JavaDoc[] termArray = { "" };
52     String JavaDoc[] optArray = { "" };
53     String JavaDoc[] ignoreArray = { "" };
54     String JavaDoc[] scanArray = { "" };
55     
56
57     public static void main(String JavaDoc argv[]) {
58         String JavaDoc[] typeArray = { ".gz", ".bz2", ".zip", ".rar" };
59         String JavaDoc[] termArray = { "linux", "kernel" };
60         String JavaDoc[] optArray = { "download", "file", "mirror", "location" };
61         String JavaDoc[] ignoreArray = { ".gif", ".jpg", ".png", ".swf", ".jar", ".class", ".google." };
62         String JavaDoc[] scanArray = { ".html", ".htm", "/", ".jsp", ".jhtml", ".phtml", ".asp", ".xml", ".js", ".cgi" };
63         String JavaDoc url = "http://www.google.de/search?hl=de&q=";
64         
65         for(int i=0; i<termArray.length; i++) {
66             url += termArray[i]+"+";
67         }
68         
69         FileSearch search = new FileSearch();
70         
71         search.typeArray = typeArray;
72         search.termArray = termArray;
73         search.optArray = optArray;
74         search.ignoreArray = ignoreArray;
75         search.scanArray = scanArray;
76         search.MIN_TERM = 1;
77         
78         search.spider(url);
79         
80     }
81
82     private void spider(String JavaDoc url)
83     {
84         try
85         {
86             if(url.indexOf("/") < 0)
87             {
88                 url = url + "/";
89             }
90             
91             url = clear(url);
92             
93             Log.out(">>> URL: "+url);
94             Log.out(">>> Scanning for ");
95             
96             for(int i = 0; i < typeArray.length; i++)
97             {
98                 Log.out(typeArray[i] + " ");
99             }
100             
101             Log.out("");
102             
103
104             Log.out("Fetching initial HTML file...");
105
106             Getter urlGetter = new Getter(localDir);
107             urlGetter.fetch(url, true);
108
109             Log.out("Searching for links...");
110             LocalIO.pause(500);
111
112             crawl(url);
113         }
114         catch(Exception JavaDoc ex)
115         {
116             ex.printStackTrace();
117         }
118     }
119
120     private String JavaDoc clear(String JavaDoc url)
121     {
122         int idx = url.indexOf("http://");
123
124         if(idx >= 0)
125         {
126             url = url.substring(7);
127         }
128
129         return url;
130     }
131
132     private Vector JavaDoc addVector(Vector JavaDoc v, Vector JavaDoc x)
133     {
134         Enumeration JavaDoc e = x.elements();
135
136         while(e.hasMoreElements())
137         {
138             String JavaDoc next = (String JavaDoc) e.nextElement();
139             v.add(next);
140         }
141
142         return v;
143     }
144
145     private int rate(String JavaDoc content) {
146         int score = 0;
147         
148         for(int i=0; i<termArray.length; i++) {
149             if(content.indexOf(termArray[i]) >= 0) score += 3;
150         }
151         
152         if(score < MIN_TERM) return 0;
153     
154         for(int i=0; i<optArray.length; i++) {
155             if(content.indexOf(optArray[i]) >= 0) score++;
156         }
157         
158         return score;
159     }
160     
161     private int checkForResult(String JavaDoc url) {
162         //for(int i=0; i<typeArray.length; i++) {
163
// if(url.indexOf(typeArray[i]) >= 0) return 2;
164
//}
165

166         for(int i=0; i<ignoreArray.length; i++) {
167             if(url.indexOf(ignoreArray[i]) >= 0) return -1;
168         }
169         
170         if(!checkForScanableUrl(url)) return -1;
171         
172         return 1;
173     }
174     
175     private boolean checkForScanableUrl(String JavaDoc url) {
176         
177         if(checked.containsKey(url)) {
178             return false;
179         }
180         else {
181             checked.put(url, "");
182         }
183         
184         if(url.indexOf("/") > 0) {
185             String JavaDoc tmp = url.substring(0, url.indexOf("/"));
186         }
187         
188         for(int i=0; i<scanArray.length; i++) {
189             if(url.endsWith(scanArray[i])) return true;
190         }
191
192         return false;
193     }
194     
195     private void crawl(String JavaDoc url) throws Exception JavaDoc
196     {
197         url = clear(url);
198
199         int urlRating = checkForResult(url);
200         if(!quiet) Log.out("URL-Rating: "+url+" -> "+urlRating+" @"+currentDepth);
201         
202         if(urlRating > 0) {
203             //System.out.println("!!!");
204
//Getter.chill(1000);
205
//System.exit(0);
206
} else if(urlRating < 0 && currentDepth > 0) {
207             if(!quiet) Log.out("SKIP "+url);
208             return;
209         }
210
211
212         Getter urlGetter = new Getter(localDir);
213         String JavaDoc content = urlGetter.fetch(url);
214         
215         int factor = rate(content);
216         if(!quiet) Log.out("Content-Rating: "+url+" -> "+factor+" @"+currentDepth);
217         
218         if(factor < MIN_FACTOR) {
219             if(!quiet) Log.out("DROP: "+url);
220             return;
221         }
222         
223         if(!ultraquiet) Log.out("Url: "+url+" -> "+urlRating+":"+factor+"@"+currentDepth);
224
225         Vector JavaDoc m = sort(content, url.substring(0, url.lastIndexOf("/")),
226                               "href=\"");
227         m = addVector(m,
228                       sort(content, url.substring(0, url.lastIndexOf("/")),
229                                  "src=\""));
230         m = addVector(m,
231                       sort(content, url.substring(0, url.lastIndexOf("/")),
232                                  "HREF=\""));
233         m = addVector(m,
234                       sort(content, url.substring(0, url.lastIndexOf("/")),
235                                  "SRC=\""));
236
237         Enumeration JavaDoc links = m.elements();
238
239         while(links.hasMoreElements())
240         {
241
242             String JavaDoc next = (String JavaDoc) links.nextElement();
243             
244             if(!quiet) Log.out("PROCESS: " + next);
245             boolean skip = false;
246             
247             while(!skip) {
248                 for(int i = 0; i < typeArray.length; i++)
249                 {
250                     if(next.endsWith(typeArray[i]) ||
251                             typeArray[i].trim().equals("*"))
252                     {
253                         Log.out("HIT: "+url+" -> "+next);
254                         //Getter.chill(2000);
255

256                         if(!LOAD || !checkForScanableUrl(url)) continue;
257                         
258                         int x = next.indexOf("/");
259                         
260                         if((x > 0) && (next.substring(0, x).indexOf(".") > 0))
261                         {
262                             Getter urlGetter2 = new Getter(localDir);
263                             urlGetter2.fetch(next, false);
264                             
265                             continue;
266                         }
267                     }
268                 }
269                 
270                 skip = true;
271             }
272
273             if(currentDepth < MAX)
274             {
275
276                 int x = next.indexOf("/");
277
278                 if((x > 0) && (next.substring(0, x).indexOf(".") > 0))
279                 {
280                     currentDepth++;
281                     crawl(next);
282                     currentDepth--;
283                 }
284             }
285         }
286     }
287
288     private Vector JavaDoc sort(String JavaDoc content, String JavaDoc url, String JavaDoc index)
289     {
290         Vector JavaDoc res = new Vector JavaDoc();
291         int wo = 0;
292
293         while(true)
294         {
295             wo = content.indexOf(index);
296
297             if(wo < 0)
298             {
299                 return res;
300             }
301
302             content = content.substring(wo + index.length());
303
304             String JavaDoc was = content.substring(0, content.indexOf("\""));
305
306             was = createAbsoluteUrl(was, url);
307             res.add(was);
308             if(!quiet) Log.out("ADD: " + was);
309         }
310     }
311
312     private String JavaDoc[] check(String JavaDoc auswahl)
313     {
314         StringTokenizer JavaDoc tokenizer = new StringTokenizer JavaDoc(auswahl, "-", false);
315         String JavaDoc[] strArr = new String JavaDoc[tokenizer.countTokens()];
316         int tmp = 0;
317
318         while(tokenizer.hasMoreElements())
319         {
320             strArr[tmp] = (String JavaDoc) tokenizer.nextElement();
321             tmp++;
322         }
323
324         return strArr;
325     }
326
327     private String JavaDoc createAbsoluteUrl(String JavaDoc newLink, String JavaDoc baseUrl)
328     {
329         newLink = clear(newLink);
330
331         if(newLink.startsWith(baseUrl))
332         {
333             return newLink;
334         }
335
336         if(newLink.startsWith("/") && (baseUrl.indexOf("/") > 0))
337         {
338             newLink = baseUrl.substring(0, baseUrl.indexOf("/")) + newLink;
339         }
340         else if(newLink.startsWith("/") && (baseUrl.indexOf("/") < 0))
341         {
342             newLink = baseUrl + newLink;
343         }
344         else if((newLink.indexOf(".") > 0))
345         {
346             int idx = newLink.indexOf("/");
347             String JavaDoc tmp = "";
348
349             if(idx >= 0)
350             {
351                 tmp = newLink.substring(0, idx);
352             }
353
354             if((tmp.indexOf(".") > 0))
355             {
356                 return clear(newLink);
357             }
358
359             if(baseUrl.endsWith("/"))
360             {
361                 newLink = baseUrl + newLink;
362             }
363             else
364             {
365                 newLink = baseUrl + "/" + newLink;
366             }
367         }
368
369         //Log.out("-> " + newLink);
370

371         return newLink;
372     }
373
374 }
375
376
377 class Getter
378 {
379     private String JavaDoc localDir = null;
380
381     public Getter(String JavaDoc localDir)
382     {
383         this.localDir = localDir;
384     }
385
386     public String JavaDoc fetch(String JavaDoc url)
387     {
388         try
389         {
390             String JavaDoc host = url.substring(0, url.indexOf("/"));
391             String JavaDoc wo = url.substring(url.indexOf("/"));
392             String JavaDoc result = "";
393
394             //Log.out(">> " + host + wo);
395

396             Socket JavaDoc deal = new Socket JavaDoc(host, 80);
397             deal.setSoTimeout(5000);
398
399             BufferedWriter JavaDoc out = new BufferedWriter JavaDoc(new OutputStreamWriter JavaDoc(deal.getOutputStream()));
400             BufferedReader JavaDoc in = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(deal.getInputStream()));
401
402             out.write("GET http://" + url + " HTTP/1.0\n\n");
403             out.flush();
404
405             int len = 0;
406
407             while(!in.ready() && (len < 5000))
408             {
409                 chill(100);
410                 len += 100;
411             }
412
413             while(in.ready())
414             {
415                 result = result + in.readLine();
416             }
417
418             out.close();
419             in.close();
420
421             return result;
422         }
423         catch(Exception JavaDoc ex)
424         {
425             if(!FileSearch.quiet) ex.printStackTrace();
426         }
427
428         return "";
429     }
430
431     public void fetch(String JavaDoc url, boolean force)
432     {
433         try
434         {
435             String JavaDoc host = url.substring(0, url.indexOf("/"));
436             String JavaDoc wo = url.substring(url.indexOf("/"));
437             String JavaDoc result = "";
438
439             if(!FileSearch.quiet) Log.debug(">>> " + host + wo);
440
441             //JFtp.statusP.jftp.ensureLogging();
442
File JavaDoc d = new File JavaDoc(localDir);
443             d.mkdir();
444
445             File JavaDoc f = new File JavaDoc(localDir + wo.substring(wo.lastIndexOf("/") + 1));
446
447             if(f.exists() && !force)
448             {
449                 if(!FileSearch.quiet) Log.debug(">>> file already exists...");
450
451                 return;
452             }
453             else
454             {
455                 f.delete();
456             }
457
458             Socket JavaDoc deal = new Socket JavaDoc(host, 80);
459             BufferedWriter JavaDoc out = new BufferedWriter JavaDoc(new OutputStreamWriter JavaDoc(deal.getOutputStream()));
460             DataInputStream JavaDoc in = new DataInputStream JavaDoc(new BufferedInputStream JavaDoc(deal.getInputStream()));
461
462             BufferedOutputStream JavaDoc localOut = new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(localDir +
463                                                                                         wo.substring(wo.lastIndexOf("/") +
464                                                                                                      1)));
465
466             byte[] alu = new byte[2048];
467
468             out.write("GET http://" + url + " HTTP/1.0\n\n");
469             out.flush();
470
471             boolean line = true;
472             boolean bin = false;
473
474             while(true)
475             {
476                 chill(10);
477
478                 String JavaDoc tmp = "";
479
480                 while(line)
481                 {
482                     String JavaDoc x = in.readLine();
483
484                     if(x == null)
485                     {
486                         break;
487                     }
488
489                     tmp += (x + "\n");
490
491                     if(x.equals(""))
492                     {
493                         line = false;
494                     }
495                 }
496
497                 int x = in.read(alu);
498
499                 if(x == -1)
500                 {
501                     if(line)
502                     {
503                         localOut.write(tmp.getBytes(), 0, tmp.length());
504                     }
505
506                     out.close();
507                     in.close();
508                     localOut.flush();
509                     localOut.close();
510
511                     return;
512                 }
513                 else
514                 {
515                     localOut.write(alu, 0, x);
516                 }
517             }
518         }
519         catch(Exception JavaDoc ex)
520         {
521             if(!FileSearch.quiet) ex.printStackTrace();
522         }
523     }
524
525     public static void chill(int time)
526     {
527         try
528         {
529             Thread.sleep(time);
530         }
531         catch(Exception JavaDoc ex)
532         {
533         }
534     }
535 }
536
// End of listing (the "Popular Tags" footer of the KickJava page followed here).