1 2 3 package net.nutch.analysis.lang; 4 5 import java.util.Properties ; 6 7 import junit.framework.TestCase; 8 import net.nutch.parse.Parse; 9 import net.nutch.parse.Parser; 10 import net.nutch.parse.ParserFactory; 11 import net.nutch.protocol.Content; 12 13 public class TestHTMLLanguageParser extends TestCase { 14 15 private static String URL = "http://foo.bar/"; 16 17 private static String BASE = "http://foo.bar/"; 18 19 String docs[] = { 20 "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>", 21 "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>", 22 "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" }; 23 24 String metalanguages[] = { "fi", "en", "en" }; 25 26 29 public void testMetaHTMLParsing() { 30 31 try { 32 33 34 for (int t = 0; t < docs.length; t++) { 35 36 Content content = getContent(docs[t]); 37 Parser parser = ParserFactory.getParser("text/html", URL); 38 Parse parse = parser.getParse(content); 39 40 assertEquals(metalanguages[t], (String ) parse.getData().get( 41 HTMLLanguageParser.META_LANG_NAME)); 42 43 } 44 } catch (Exception e) { 45 e.printStackTrace(System.out); 46 fail(e.toString()); 47 } 48 49 } 50 51 private Content getContent(String text) { 52 Properties p = new Properties (); 53 p.put("Content-Type", "text/html"); 54 55 Content content = new Content(URL, BASE, text.getBytes(), "text/html", p); 56 return content; 57 } 58 } | Popular Tags |