KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > html > TestDOMContentUtils


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.html;
5
6 import junit.framework.TestCase;
7
8 import net.nutch.parse.Outlink;
9
10 import java.io.ByteArrayInputStream JavaDoc;
11 import java.net.MalformedURLException JavaDoc;
12 import java.net.URL JavaDoc;
13 import java.util.ArrayList JavaDoc;
14 import java.util.StringTokenizer JavaDoc;
15
16 import org.cyberneko.html.parsers.*;
17 import org.xml.sax.*;
18 import org.w3c.dom.*;
19 import org.apache.html.dom.*;
20
21 /**
22  * Unit tests for DOMContentUtils.
23  */

24 public class TestDOMContentUtils extends TestCase {
25
26   private static final String JavaDoc[] testPages= {
27     new String JavaDoc("<html><head><title> title </title><script> script </script>"
28                + "</head><body> body <a HREF=\"http://www.nutch.org\">"
29                + " anchor </a><!--comment-->"
30                + "</body></html>"),
31     new String JavaDoc("<html><head><title> title </title><script> script </script>"
32                + "</head><body> body <a HREF=\"/\">"
33                + " home </a><!--comment-->"
34                + "<style> style </style>"
35                + " <a HREF=\"bot.html\">"
36                + " bots </a>"
37                + "</body></html>"),
38     new String JavaDoc("<html><head><title> </title>"
39                + "</head><body> "
40                + "<a HREF=\"/\"> separate this "
41                + "<a HREF=\"ok\"> from this"
42                + "</a></a>"
43                + "</body></html>"),
44     // this one relies on certain neko fixup behavior, possibly
45
// distributing the anchors into the LI's-but not the other
46
// anchors (outside of them, instead)! So you get a tree that
47
// looks like:
48
// ... <li> <a HREF=/> home </a> </li>
49
// <li> <a HREF=/> <a HREF="1"> 1 </a> </a> </li>
50
// <li> <a HREF=/> <a HREF="1"> <a HREF="2"> 2 </a> </a> </a> </li>
51
new String JavaDoc("<html><head><title> my title </title>"
52                + "</head><body> body "
53                + "<ul>"
54                + "<li> <a HREF=\"/\"> home"
55                + "<li> <a HREF=\"1\"> 1"
56                + "<li> <a HREF=\"2\"> 2"
57                + "</ul>"
58                + "</body></html>"),
59     // test frameset link extraction. The invalid frame in the middle will be
60
// fixed to a third standalone frame.
61
new String JavaDoc("<html><head><title> my title </title>"
62                + "</head><frameset rows=\"20,*\"> "
63                + "<frame SRC=\"top.html\">"
64                + "</frame>"
65                + "<frameset cols=\"20,*\">"
66                + "<frame SRC=\"left.html\">"
67                + "<frame SRC=\"invalid.html\"/>"
68                + "</frame>"
69                + "<frame SRC=\"right.html\">"
70                + "</frame>"
71                + "</frameset>"
72                + "</frameset>"
73                + "</body></html>"),
74     // test <area> and <iframe> link extraction + url normalization
75
new String JavaDoc("<html><head><title> my title </title>"
76                + "</head><body>"
77                + "<img SRC=\"logo.gif\" usemap=\"#green\" border=\"0\">"
78                + "<map name=\"green\">"
79                + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" HREF=\"../index.html\">"
80                + "<area shape=\"rect\" coords=\"128,132,241,179\" HREF=\"#bottom\">"
81                + "<area shape=\"circle\" coords=\"68,211,35\" HREF=\"../bot.html\">"
82                + "</map>"
83                + "<a name=\"bottom\"/><h1> the bottom </h1> "
84                + "<iframe SRC=\"../docs/index.html\"/>"
85                + "</body></html>"),
86     // test whitespace processing for plain text extraction
87
new String JavaDoc("<html><head>\n <title> my\t\n title\r\n </title>\n"
88                + " </head>\n"
89                + " <body>\n"
90                + " <h1> Whitespace\ttest </h1> \n"
91                + "\t<a HREF=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
92                + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
93                + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
94                + " This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
95                + "<table>"
96                + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
97                + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
98                + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
99                + "</table>put some text here<Br>and there."
100                + "<h2>End\tthis\rmadness\n!</h2>\r\n"
101                + " . . . ."
102                + "</body> </html>"),
103   };
104
105   private static String JavaDoc[] testBaseHrefs= {
106     "http://www.nutch.org",
107     "http://www.nutch.org/docs/foo.html",
108     "http://www.nutch.org/docs/",
109     "http://www.nutch.org/docs/",
110     "http://www.nutch.org/frames/",
111     "http://www.nutch.org/maps/",
112     "http://www.nutch.org/whitespace/",
113   };
114   
115   private static final DocumentFragment testDOMs[]=
116     new DocumentFragment[testPages.length];
117
118   private static URL JavaDoc[] testBaseHrefURLs=
119     new URL JavaDoc[testPages.length];
120
121
122   private static final String JavaDoc[] answerText= {
123     "title body anchor",
124     "title body home bots",
125     "separate this from this",
126     "my title body home 1 2",
127     "my title",
128     "my title the bottom",
129     "my title Whitespace test whitespace test "
130         + "This is a whitespace test . Newlines should appear as space too. "
131         + "Tabs are spaces too. This is a break -> and the line after break . "
132         + "one two three space here space there no space "
133         + "one two two three three four put some text here and there. "
134         + "End this madness ! . . . .",
135   };
136
137   private static final String JavaDoc[] answerTitle= {
138     "title",
139     "title",
140     "",
141     "my title",
142     "my title",
143     "my title",
144     "my title",
145   };
146
147   // note: should be in page-order
148
private static Outlink[][] answerOutlinks;
149   
150   public TestDOMContentUtils(String JavaDoc name) {
151     super(name);
152   }
153
154   private static void setup() {
155     DOMFragmentParser parser= new DOMFragmentParser();
156     for (int i= 0; i < testPages.length; i++) {
157         DocumentFragment node=
158           new HTMLDocumentImpl().createDocumentFragment();
159         try {
160           parser.parse(
161             new InputSource(
162               new ByteArrayInputStream JavaDoc(testPages[i].getBytes()) ),
163             node);
164           testBaseHrefURLs[i]= new URL JavaDoc(testBaseHrefs[i]);
165         } catch (Exception JavaDoc e) {
166           assertTrue("caught exception: " + e, false);
167         }
168       testDOMs[i]= node;
169     }
170     try {
171      answerOutlinks = new Outlink[][]{
172          {
173            new Outlink("http://www.nutch.org", "anchor"),
174          },
175          {
176            new Outlink("http://www.nutch.org/", "home"),
177            new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
178          },
179          {
180            new Outlink("http://www.nutch.org/", "separate this"),
181            new Outlink("http://www.nutch.org/docs/ok", "from this"),
182          },
183          {
184            new Outlink("http://www.nutch.org/", "home"),
185            new Outlink("http://www.nutch.org/docs/1", "1"),
186            new Outlink("http://www.nutch.org/docs/2", "2"),
187          },
188          {
189            new Outlink("http://www.nutch.org/frames/top.html", ""),
190            new Outlink("http://www.nutch.org/frames/left.html", ""),
191            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
192            new Outlink("http://www.nutch.org/frames/right.html", ""),
193          },
194          {
195            new Outlink("http://www.nutch.org/index.html", ""),
196            new Outlink("http://www.nutch.org/maps/#bottom", ""),
197            new Outlink("http://www.nutch.org/bot.html", ""),
198            new Outlink("http://www.nutch.org/docs/index.html", ""),
199          },
200          {
201              new Outlink("http://www.nutch.org/index.html", "whitespace test"),
202          },
203       };
204    
205     } catch (MalformedURLException JavaDoc e) {
206         
207     }
208   }
209
210   private static boolean equalsIgnoreWhitespace(String JavaDoc s1, String JavaDoc s2) {
211     StringTokenizer JavaDoc st1= new StringTokenizer JavaDoc(s1);
212     StringTokenizer JavaDoc st2= new StringTokenizer JavaDoc(s2);
213
214     while (st1.hasMoreTokens()) {
215       if (!st2.hasMoreTokens())
216         return false;
217       if ( ! st1.nextToken().equals(st2.nextToken()) )
218         return false;
219     }
220     if (st2.hasMoreTokens())
221       return false;
222     return true;
223   }
224
225   public void testGetText() {
226     if (testDOMs[0] == null)
227       setup();
228     for (int i= 0; i < testPages.length; i++) {
229       StringBuffer JavaDoc sb= new StringBuffer JavaDoc();
230       DOMContentUtils.getText(sb, testDOMs[i]);
231       String JavaDoc text= sb.toString();
232       assertTrue("expecting text: " + answerText[i]
233                  + System.getProperty("line.separator")
234                  + System.getProperty("line.separator")
235                  + "got text: "+ text,
236                  equalsIgnoreWhitespace(answerText[i], text));
237     }
238   }
239
240   public void testGetTitle() {
241     if (testDOMs[0] == null)
242       setup();
243     for (int i= 0; i < testPages.length; i++) {
244       StringBuffer JavaDoc sb= new StringBuffer JavaDoc();
245       DOMContentUtils.getTitle(sb, testDOMs[i]);
246       String JavaDoc text= sb.toString();
247       assertTrue("expecting text: " + answerText[i]
248                  + System.getProperty("line.separator")
249                  + System.getProperty("line.separator")
250                  + "got text: "+ text,
251                  equalsIgnoreWhitespace(answerTitle[i], text));
252     }
253   }
254
255   public void testGetOutlinks() {
256     if (testDOMs[0] == null)
257       setup();
258     for (int i= 0; i < testPages.length; i++) {
259       ArrayList JavaDoc outlinks= new ArrayList JavaDoc();
260       DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
261       Outlink[] outlinkArr= new Outlink[outlinks.size()];
262       outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
263       compareOutlinks(answerOutlinks[i], outlinkArr);
264     }
265   }
266
267   private static final void appendOutlinks(StringBuffer JavaDoc sb, Outlink[] o) {
268     for (int i= 0; i < o.length; i++) {
269       sb.append(o[i].toString());
270       sb.append(System.getProperty("line.separator"));
271     }
272   }
273
274   private static final String JavaDoc outlinksString(Outlink[] o) {
275     StringBuffer JavaDoc sb= new StringBuffer JavaDoc();
276     appendOutlinks(sb, o);
277     return sb.toString();
278   }
279
280   private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
281     if (o1.length != o2.length) {
282       assertTrue("got wrong number of outlinks (expecting " + o1.length
283                  + ", got " + o2.length + ")"
284                  + System.getProperty("line.separator")
285                  + "answer: " + System.getProperty("line.separator")
286                  + outlinksString(o1)
287                  + System.getProperty("line.separator")
288                  + "got: " + System.getProperty("line.separator")
289                  + outlinksString(o2)
290                  + System.getProperty("line.separator"),
291                  false
292         );
293     }
294
295     for (int i= 0; i < o1.length; i++) {
296       if (!o1[i].equals(o2[i])) {
297         assertTrue("got wrong outlinks at position " + i
298                    + System.getProperty("line.separator")
299                    + "answer: " + System.getProperty("line.separator")
300                    + o1[i].toString()
301                    + System.getProperty("line.separator")
302                    + "got: " + System.getProperty("line.separator")
303                    + o2[i].toString(),
304                    false
305           );
306         
307       }
308     }
309   }
310 }
311
Popular Tags