|                                                                                                              1
 2
 3
 4   package net.nutch.parse.msword;
 5
 6   import net.nutch.protocol.ProtocolFactory;
 7   import net.nutch.protocol.Protocol;
 8   import net.nutch.protocol.Content;
 9   import net.nutch.protocol.ProtocolException;
 10
 11  import net.nutch.parse.ParserFactory;
 12  import net.nutch.parse.Parser;
 13  import net.nutch.parse.Parse;
 14  import net.nutch.parse.ParseException;
 15
 16  import junit.framework.TestCase;
 17
 18
 23  public class TestMSWordParser extends TestCase {
 24
 25    private String
  fileSeparator = System.getProperty("file.separator"); 26      private String
  sampleDir = System.getProperty("test.data","."); 28          private String
  [] sampleFiles = {"word95.doc","word97.doc"}; 32
 33    private String
  expectedText = "This is a sample doc file prepared for nutch."; 34
 35    public TestMSWordParser(String
  name) { 36      super(name);
 37    }
 38
 39    protected void setUp() {}
 40
 41    protected void tearDown() {}
 42
 43    public void testIt() throws ProtocolException, ParseException {
 44      String
  urlString; 45      Protocol protocol;
 46      Content content;
 47      Parser parser;
 48      Parse parse;
 49
 50      for (int i=0; i<sampleFiles.length; i++) {
 51        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 52
 53        protocol = ProtocolFactory.getProtocol(urlString);
 54        content = protocol.getContent(urlString);
 55
 56        parser = ParserFactory.getParser(content.getContentType(), urlString);
 57        parse = parser.getParse(content);
 58
 59        assertTrue(parse.getText().startsWith(expectedText));
 60      }
 61    }
 62
 63  }
 64
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |