1 2 3 4 package net.nutch.parse.msword; 5 6 import net.nutch.protocol.ProtocolFactory; 7 import net.nutch.protocol.Protocol; 8 import net.nutch.protocol.Content; 9 import net.nutch.protocol.ProtocolException; 10 11 import net.nutch.parse.ParserFactory; 12 import net.nutch.parse.Parser; 13 import net.nutch.parse.Parse; 14 import net.nutch.parse.ParseException; 15 16 import junit.framework.TestCase; 17 18 23 public class TestMSWordParser extends TestCase { 24 25 private String fileSeparator = System.getProperty("file.separator"); 26 private String sampleDir = System.getProperty("test.data","."); 28 private String [] sampleFiles = {"word95.doc","word97.doc"}; 32 33 private String expectedText = "This is a sample doc file prepared for nutch."; 34 35 public TestMSWordParser(String name) { 36 super(name); 37 } 38 39 protected void setUp() {} 40 41 protected void tearDown() {} 42 43 public void testIt() throws ProtocolException, ParseException { 44 String urlString; 45 Protocol protocol; 46 Content content; 47 Parser parser; 48 Parse parse; 49 50 for (int i=0; i<sampleFiles.length; i++) { 51 urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; 52 53 protocol = ProtocolFactory.getProtocol(urlString); 54 content = protocol.getContent(urlString); 55 56 parser = ParserFactory.getParser(content.getContentType(), urlString); 57 parse = parser.getParse(content); 58 59 assertTrue(parse.getText().startsWith(expectedText)); 60 } 61 } 62 63 } 64 | Popular Tags |