KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > html > TestRobotsMetaProcessor


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.html;
5
6 import junit.framework.TestCase;
7
8 import net.nutch.parse.html.RobotsMetaProcessor.*;
9
10 import java.io.ByteArrayInputStream JavaDoc;
11 import java.net.URL JavaDoc;
12
13 import org.cyberneko.html.parsers.*;
14 import org.xml.sax.*;
15 import org.w3c.dom.*;
16 import org.apache.html.dom.*;
17
18 /** Unit tests for RobotsMetaProcessor. */
19 public class TestRobotsMetaProcessor extends TestCase {
20   public TestRobotsMetaProcessor(String JavaDoc name) {
21     super(name);
22   }
23
24   /*
25
26   some sample tags:
27
28   <meta name="robots" content="index,follow">
29   <meta name="robots" content="noindex,follow">
30   <meta name="robots" content="index,nofollow">
31   <meta name="robots" content="noindex,nofollow">
32
33   <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
34
35   */

36
37
38   public static String JavaDoc[] tests=
39   {
40     "<html><head><title>test page</title>"
41     + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
42     + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
43     + "</head><body>"
44     + " some text"
45     + "</body></html>",
46
47     "<html><head><title>test page</title>"
48     + "<meta name=\"robots\" content=\"all\"> "
49     + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
50     + "</head><body>"
51     + " some text"
52     + "</body></html>",
53
54     "<html><head><title>test page</title>"
55     + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
56     + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
57     + "</head><body>"
58     + " some text"
59     + "</body></html>",
60
61     "<html><head><title>test page</title>"
62     + "<meta name=\"robots\" content=\"none\"> "
63     + "</head><body>"
64     + " some text"
65     + "</body></html>",
66
67     "<html><head><title>test page</title>"
68     + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
69     + "</head><body>"
70     + " some text"
71     + "</body></html>",
72
73     "<html><head><title>test page</title>"
74     + "<meta name=\"robots\" content=\"noindex,follow\"> "
75     + "</head><body>"
76     + " some text"
77     + "</body></html>",
78
79     "<html><head><title>test page</title>"
80     + "<meta name=\"robots\" content=\"index,nofollow\"> "
81     + "</head><body>"
82     + " some text"
83     + "</body></html>",
84
85     "<html><head><title>test page</title>"
86     + "<meta name=\"robots\" content=\"index,follow\"> "
87     + "<base HREF=\"http://www.nutch.org/\">"
88     + "</head><body>"
89     + " some text"
90     + "</body></html>",
91
92     "<html><head><title>test page</title>"
93     + "<meta name=\"robots\"> "
94     + "<base HREF=\"http://www.nutch.org/base/\">"
95     + "</head><body>"
96     + " some text"
97     + "</body></html>",
98
99   };
100
101   public static final boolean[][] answers= {
102     {true, true, true}, // NONE
103
{false, false, true}, // all
104
{true, true, true}, // nOnE
105
{true, true, false}, // none
106
{true, true, false}, // noindex,nofollow
107
{true, false, false}, // noindex,follow
108
{false, true, false}, // index,nofollow
109
{false, false, false}, // index,follow
110
{false, false, false}, // missing!
111
};
112
113   private URL JavaDoc[][] currURLsAndAnswers;
114
115   public void testRobotsMetaProcessor() {
116     DOMFragmentParser parser= new DOMFragmentParser();;
117
118     try {
119       currURLsAndAnswers= new URL JavaDoc[][] {
120         {new URL JavaDoc("http://www.nutch.org"), null},
121         {new URL JavaDoc("http://www.nutch.org"), null},
122         {new URL JavaDoc("http://www.nutch.org"), null},
123         {new URL JavaDoc("http://www.nutch.org"), null},
124         {new URL JavaDoc("http://www.nutch.org"), null},
125         {new URL JavaDoc("http://www.nutch.org"), null},
126         {new URL JavaDoc("http://www.nutch.org"), null},
127         {new URL JavaDoc("http://www.nutch.org/foo/"),
128          new URL JavaDoc("http://www.nutch.org/")},
129         {new URL JavaDoc("http://www.nutch.org"),
130          new URL JavaDoc("http://www.nutch.org/base/")}
131       };
132     } catch (Exception JavaDoc e) {
133       assertTrue("couldn't make test URLs!", false);
134     }
135
136     for (int i= 0; i < tests.length; i++) {
137       byte[] bytes= tests[i].getBytes();
138
139       DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
140
141       try {
142         parser.parse(new InputSource(new ByteArrayInputStream JavaDoc(bytes)), node);
143       } catch (Exception JavaDoc e) {
144         e.printStackTrace();
145       }
146
147       RobotsMetaIndicator robotsMeta= new RobotsMetaIndicator();
148       RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node,
149                                                   currURLsAndAnswers[i][0]);
150
151       assertTrue("got index wrong on test " + i,
152                  robotsMeta.getNoIndex() == answers[i][0]);
153       assertTrue("got follow wrong on test " + i,
154                  robotsMeta.getNoFollow() == answers[i][1]);
155       assertTrue("got cache wrong on test " + i,
156                  robotsMeta.getNoCache() == answers[i][2]);
157       assertTrue("got base href wrong on test " + i + " (got "
158                  + robotsMeta.getBaseHref() + ")",
159                  ( (robotsMeta.getBaseHref() == null)
160                     && (currURLsAndAnswers[i][1] == null) )
161                  || ( (robotsMeta.getBaseHref() != null)
162                       && robotsMeta.getBaseHref().equals(
163                         currURLsAndAnswers[i][1]) ) );
164       
165     }
166   }
167
168 }
169
Popular Tags