KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > datamodel > Robotstxt


1 /* Robots.java
2  *
3  * $Id: Robotstxt.java,v 1.1.14.1 2007/01/13 01:31:12 stack-sf Exp $
4  *
5  * Created Sep 1, 2005
6  *
7  * Copyright (C) 2005 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.datamodel;
26
27 import java.io.BufferedReader JavaDoc;
28 import java.io.IOException JavaDoc;
29 import java.util.ArrayList JavaDoc;
30 import java.util.LinkedList JavaDoc;
31 import java.util.List JavaDoc;
32 import java.util.Map JavaDoc;
33
/**
 * Minimal robots.txt parser.
 *
 * <p>Only {@code User-agent} and {@code Disallow} directives are interpreted;
 * every other line is ignored.
 */
public class Robotstxt {
    /** Directive that starts (or extends) a group of user-agents. */
    private static final String USER_AGENT_PREFIX = "User-agent:";

    /** Directive that adds a path to the current group's rule list. */
    private static final String DISALLOW_PREFIX = "Disallow:";

    // Compiled once instead of on every line. Fully qualified so that the
    // file's import block does not have to change.
    private static final java.util.regex.Pattern USER_AGENT_LINE =
        java.util.regex.Pattern.compile("(?i)^User-agent:.*");
    private static final java.util.regex.Pattern DISALLOW_LINE =
        java.util.regex.Pattern.compile("(?i)Disallow:.*");

    /**
     * Parses robots.txt content, collecting user-agents and their disallowed
     * paths.
     *
     * <p>Blank lines and lines starting with {@code #} are skipped, and a
     * trailing {@code # comment} is stripped from every other line.
     * Directive names are matched case-insensitively.
     *
     * <p>Each {@code User-agent} value is lower-cased, appended to
     * {@code userAgents} and mapped in {@code disallows} to its rule list.
     * Consecutive {@code User-agent} lines with no {@code Disallow} line
     * between them share one list instance. The wildcard agent {@code *} is
     * stored under the empty-string key and, if present, is appended to
     * {@code userAgents} once, after all other agents.
     *
     * <p>The text after each {@code Disallow:} is trimmed and added verbatim
     * (possibly as an empty string) to the current rule list.
     *
     * <p>The reader is always closed before this method returns, whether it
     * returns normally or throws.
     *
     * @param reader source of the robots.txt text; if {@code null}, nothing
     *        is parsed and {@code false} is returned
     * @param userAgents receives the lower-cased user-agent names in the
     *        order encountered, with the catch-all entry ({@code ""}) last
     * @param disallows receives a mapping from each user-agent name to its
     *        list of disallowed paths
     * @return {@code true} if a {@code Disallow} line was found before any
     *         {@code User-agent} line; {@code false} otherwise
     * @throws IOException if reading from or closing the reader fails
     */
    public static boolean parse(BufferedReader reader,
            final LinkedList<String> userAgents,
            final Map<String,List<String>> disallows)
    throws IOException {
        if (reader == null) {
            return false;
        }
        boolean hasErrors = false;
        List<String> current = null;
        String catchall = null;
        try {
            String read;
            while ((read = reader.readLine()) != null) {
                read = read.trim();
                // Skip comments & blanks
                if (read.length() == 0 || read.startsWith("#")) {
                    continue;
                }
                int commentIndex = read.indexOf('#');
                if (commentIndex > -1) {
                    // Strip trailing comment
                    read = read.substring(0, commentIndex).trim();
                }
                if (USER_AGENT_LINE.matcher(read).matches()) {
                    // NOTE(review): lower-casing uses the default locale;
                    // confirm callers normalise their user-agent the same way.
                    String ua = read.substring(USER_AGENT_PREFIX.length())
                        .trim().toLowerCase();
                    if (current == null || current.size() != 0) {
                        // only create new rules-list if necessary
                        // otherwise share with previous user-agent
                        current = new ArrayList<String>();
                    }
                    if (ua.equals("*")) {
                        // Catch-all is keyed by "" and appended to
                        // userAgents only after the whole file is read.
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    disallows.put(ua, current);
                    continue;
                }
                if (DISALLOW_LINE.matcher(read).matches()) {
                    if (current == null) {
                        // buggy robots.txt: rule before any User-agent line
                        hasErrors = true;
                        continue;
                    }
                    current.add(read.substring(DISALLOW_PREFIX.length()).trim());
                    continue;
                }
                // unknown line; do nothing for now
            }
        } finally {
            // Close on failure as well as on normal end of input.
            reader.close();
        }

        if (catchall != null) {
            userAgents.addLast(catchall);
        }
        return hasErrors;
    }

    /**
     * Command-line entry point; currently does nothing.
     *
     * @param args Command-line arguments (unused).
     */
    public static void main(String[] args) {
        // Intentionally empty.
    }
}
102
Popular Tags