KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > analysis > lang > TestHTMLLanguageParser


/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.analysis.lang;

import java.util.Properties;

import junit.framework.TestCase;
import net.nutch.parse.Parse;
import net.nutch.parse.Parser;
import net.nutch.parse.ParserFactory;
import net.nutch.protocol.Content;

13 public class TestHTMLLanguageParser extends TestCase {
14
15   private static String JavaDoc URL = "http://foo.bar/";
16
17   private static String JavaDoc BASE = "http://foo.bar/";
18
19   String JavaDoc docs[] = {
20       "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>",
21       "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
22       "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" };
23
24   String JavaDoc metalanguages[] = { "fi", "en", "en" };
25
26   /**
27    * Test parsing of language identifiers from html
28    **/

29   public void testMetaHTMLParsing() {
30
31     try {
32
33       /* loop through the test documents and validate result */
34       for (int t = 0; t < docs.length; t++) {
35
36         Content content = getContent(docs[t]);
37         Parser parser = ParserFactory.getParser("text/html", URL);
38         Parse parse = parser.getParse(content);
39
40         assertEquals(metalanguages[t], (String JavaDoc) parse.getData().get(
41             HTMLLanguageParser.META_LANG_NAME));
42
43       }
44     } catch (Exception JavaDoc e) {
45       e.printStackTrace(System.out);
46       fail(e.toString());
47     }
48
49   }
50
51   private Content getContent(String JavaDoc text) {
52     Properties JavaDoc p = new Properties JavaDoc();
53     p.put("Content-Type", "text/html");
54
55     Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
56     return content;
57   }
58 }
Popular Tags