1 2 3 4 package net.nutch.parse.html; 5 6 import junit.framework.TestCase; 7 8 import net.nutch.parse.Outlink; 9 10 import java.io.ByteArrayInputStream ; 11 import java.net.MalformedURLException ; 12 import java.net.URL ; 13 import java.util.ArrayList ; 14 import java.util.StringTokenizer ; 15 16 import org.cyberneko.html.parsers.*; 17 import org.xml.sax.*; 18 import org.w3c.dom.*; 19 import org.apache.html.dom.*; 20 21 24 public class TestDOMContentUtils extends TestCase { 25 26 private static final String [] testPages= { 27 new String ("<html><head><title> title </title><script> script </script>" 28 + "</head><body> body <a HREF=\"http://www.nutch.org\">" 29 + " anchor </a><!--comment-->" 30 + "</body></html>"), 31 new String ("<html><head><title> title </title><script> script </script>" 32 + "</head><body> body <a HREF=\"/\">" 33 + " home </a><!--comment-->" 34 + "<style> style </style>" 35 + " <a HREF=\"bot.html\">" 36 + " bots </a>" 37 + "</body></html>"), 38 new String ("<html><head><title> </title>" 39 + "</head><body> " 40 + "<a HREF=\"/\"> separate this " 41 + "<a HREF=\"ok\"> from this" 42 + "</a></a>" 43 + "</body></html>"), 44 new String ("<html><head><title> my title </title>" 52 + "</head><body> body " 53 + "<ul>" 54 + "<li> <a HREF=\"/\"> home" 55 + "<li> <a HREF=\"1\"> 1" 56 + "<li> <a HREF=\"2\"> 2" 57 + "</ul>" 58 + "</body></html>"), 59 new String ("<html><head><title> my title </title>" 62 + "</head><frameset rows=\"20,*\"> " 63 + "<frame SRC=\"top.html\">" 64 + "</frame>" 65 + "<frameset cols=\"20,*\">" 66 + "<frame SRC=\"left.html\">" 67 + "<frame SRC=\"invalid.html\"/>" 68 + "</frame>" 69 + "<frame SRC=\"right.html\">" 70 + "</frame>" 71 + "</frameset>" 72 + "</frameset>" 73 + "</body></html>"), 74 new String ("<html><head><title> my title </title>" 76 + "</head><body>" 77 + "<img SRC=\"logo.gif\" usemap=\"#green\" border=\"0\">" 78 + "<map name=\"green\">" 79 + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" HREF=\"../index.html\">" 80 + "<area shape=\"rect\" coords=\"128,132,241,179\" HREF=\"#bottom\">" 81 + "<area shape=\"circle\" coords=\"68,211,35\" HREF=\"../bot.html\">" 82 + "</map>" 83 + "<a name=\"bottom\"/><h1> the bottom </h1> " 84 + "<iframe SRC=\"../docs/index.html\"/>" 85 + "</body></html>"), 86 new String ("<html><head>\n <title> my\t\n title\r\n </title>\n" 88 + " </head>\n" 89 + " <body>\n" 90 + " <h1> Whitespace\ttest </h1> \n" 91 + "\t<a HREF=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" 92 + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" 93 + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" 94 + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" 95 + "<table>" 96 + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" 97 + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" 98 + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" 99 + "</table>put some text here<Br>and there." 100 + "<h2>End\tthis\rmadness\n!</h2>\r\n" 101 + " . . . ." 102 + "</body> </html>"), 103 }; 104 105 private static String [] testBaseHrefs= { 106 "http://www.nutch.org", 107 "http://www.nutch.org/docs/foo.html", 108 "http://www.nutch.org/docs/", 109 "http://www.nutch.org/docs/", 110 "http://www.nutch.org/frames/", 111 "http://www.nutch.org/maps/", 112 "http://www.nutch.org/whitespace/", 113 }; 114 115 private static final DocumentFragment testDOMs[]= 116 new DocumentFragment[testPages.length]; 117 118 private static URL [] testBaseHrefURLs= 119 new URL [testPages.length]; 120 121 122 private static final String [] answerText= { 123 "title body anchor", 124 "title body home bots", 125 "separate this from this", 126 "my title body home 1 2", 127 "my title", 128 "my title the bottom", 129 "my title Whitespace test whitespace test " 130 + "This is a whitespace test . Newlines should appear as space too. " 131 + "Tabs are spaces too. This is a break -> and the line after break . " 132 + "one two three space here space there no space " 133 + "one two two three three four put some text here and there. " 134 + "End this madness ! . . . .", 135 }; 136 137 private static final String [] answerTitle= { 138 "title", 139 "title", 140 "", 141 "my title", 142 "my title", 143 "my title", 144 "my title", 145 }; 146 147 private static Outlink[][] answerOutlinks; 149 150 public TestDOMContentUtils(String name) { 151 super(name); 152 } 153 154 private static void setup() { 155 DOMFragmentParser parser= new DOMFragmentParser(); 156 for (int i= 0; i < testPages.length; i++) { 157 DocumentFragment node= 158 new HTMLDocumentImpl().createDocumentFragment(); 159 try { 160 parser.parse( 161 new InputSource( 162 new ByteArrayInputStream (testPages[i].getBytes()) ), 163 node); 164 testBaseHrefURLs[i]= new URL (testBaseHrefs[i]); 165 } catch (Exception e) { 166 assertTrue("caught exception: " + e, false); 167 } 168 testDOMs[i]= node; 169 } 170 try { 171 answerOutlinks = new Outlink[][]{ 172 { 173 new Outlink("http://www.nutch.org", "anchor"), 174 }, 175 { 176 new Outlink("http://www.nutch.org/", "home"), 177 new Outlink("http://www.nutch.org/docs/bot.html", "bots"), 178 }, 179 { 180 new Outlink("http://www.nutch.org/", "separate this"), 181 new Outlink("http://www.nutch.org/docs/ok", "from this"), 182 }, 183 { 184 new Outlink("http://www.nutch.org/", "home"), 185 new Outlink("http://www.nutch.org/docs/1", "1"), 186 new Outlink("http://www.nutch.org/docs/2", "2"), 187 }, 188 { 189 new Outlink("http://www.nutch.org/frames/top.html", ""), 190 new Outlink("http://www.nutch.org/frames/left.html", ""), 191 new Outlink("http://www.nutch.org/frames/invalid.html", ""), 192 new Outlink("http://www.nutch.org/frames/right.html", ""), 193 }, 194 { 195 new Outlink("http://www.nutch.org/index.html", ""), 196 new Outlink("http://www.nutch.org/maps/#bottom", ""), 197 new Outlink("http://www.nutch.org/bot.html", ""), 198 new Outlink("http://www.nutch.org/docs/index.html", ""), 199 }, 200 { 201 new Outlink("http://www.nutch.org/index.html", "whitespace test"), 202 }, 203 }; 204 205 } catch (MalformedURLException e) { 206 207 } 208 } 209 210 private static boolean equalsIgnoreWhitespace(String s1, String s2) { 211 StringTokenizer st1= new StringTokenizer (s1); 212 StringTokenizer st2= new StringTokenizer (s2); 213 214 while (st1.hasMoreTokens()) { 215 if (!st2.hasMoreTokens()) 216 return false; 217 if ( ! st1.nextToken().equals(st2.nextToken()) ) 218 return false; 219 } 220 if (st2.hasMoreTokens()) 221 return false; 222 return true; 223 } 224 225 public void testGetText() { 226 if (testDOMs[0] == null) 227 setup(); 228 for (int i= 0; i < testPages.length; i++) { 229 StringBuffer sb= new StringBuffer (); 230 DOMContentUtils.getText(sb, testDOMs[i]); 231 String text= sb.toString(); 232 assertTrue("expecting text: " + answerText[i] 233 + System.getProperty("line.separator") 234 + System.getProperty("line.separator") 235 + "got text: "+ text, 236 equalsIgnoreWhitespace(answerText[i], text)); 237 } 238 } 239 240 public void testGetTitle() { 241 if (testDOMs[0] == null) 242 setup(); 243 for (int i= 0; i < testPages.length; i++) { 244 StringBuffer sb= new StringBuffer (); 245 DOMContentUtils.getTitle(sb, testDOMs[i]); 246 String text= sb.toString(); 247 assertTrue("expecting text: " + answerText[i] 248 + System.getProperty("line.separator") 249 + System.getProperty("line.separator") 250 + "got text: "+ text, 251 equalsIgnoreWhitespace(answerTitle[i], text)); 252 } 253 } 254 255 public void testGetOutlinks() { 256 if (testDOMs[0] == null) 257 setup(); 258 for (int i= 0; i < testPages.length; i++) { 259 ArrayList outlinks= new ArrayList (); 260 DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); 261 Outlink[] outlinkArr= new Outlink[outlinks.size()]; 262 outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr); 263 compareOutlinks(answerOutlinks[i], outlinkArr); 264 } 265 } 266 267 private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { 268 for (int i= 0; i < o.length; i++) { 269 sb.append(o[i].toString()); 270 sb.append(System.getProperty("line.separator")); 271 } 272 } 273 274 private static final String outlinksString(Outlink[] o) { 275 StringBuffer sb= new StringBuffer (); 276 appendOutlinks(sb, o); 277 return sb.toString(); 278 } 279 280 private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { 281 if (o1.length != o2.length) { 282 assertTrue("got wrong number of outlinks (expecting " + o1.length 283 + ", got " + o2.length + ")" 284 + System.getProperty("line.separator") 285 + "answer: " + System.getProperty("line.separator") 286 + outlinksString(o1) 287 + System.getProperty("line.separator") 288 + "got: " + System.getProperty("line.separator") 289 + outlinksString(o2) 290 + System.getProperty("line.separator"), 291 false 292 ); 293 } 294 295 for (int i= 0; i < o1.length; i++) { 296 if (!o1[i].equals(o2[i])) { 297 assertTrue("got wrong outlinks at position " + i 298 + System.getProperty("line.separator") 299 + "answer: " + System.getProperty("line.separator") 300 + o1[i].toString() 301 + System.getProperty("line.separator") 302 + "got: " + System.getProperty("line.separator") 303 + o2[i].toString(), 304 false 305 ); 306 307 } 308 } 309 } 310 } 311 | Popular Tags |