1 2 3 4 package net.nutch.parse; 5 6 import net.nutch.util.LogFormatter; 7 8 import net.nutch.protocol.ProtocolFactory; 9 import net.nutch.protocol.Protocol; 10 import net.nutch.protocol.Content; 11 12 import java.util.logging.Logger ; 13 14 19 20 public class ParserChecker { 21 22 public static final Logger LOG = 23 LogFormatter.getLogger("net.nutch.parse.ParserChecker"); 24 25 public ParserChecker() {} 26 27 public static void main(String [] args) throws Exception { 28 boolean dumpText = false; 29 boolean force = false; 30 String contentType = null; 31 String url = null; 32 33 String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url"; 34 35 if (args.length == 0) { 36 System.err.println(usage); 37 System.exit(-1); 38 } 39 40 for (int i = 0; i < args.length; i++) { 41 if (args[i].equals("-forceAs")) { 42 force = true; 43 contentType = args[++i]; 44 } else if (args[i].equals("-dumpText")) { 45 dumpText = true; 46 } else if (i != args.length-1) { 47 System.err.println(usage); 48 System.exit(-1); 49 } else { 50 url = args[i]; 51 } 52 } 53 54 LOG.info("fetching: "+url); 55 56 Protocol protocol = ProtocolFactory.getProtocol(url); 57 Content content = protocol.getContent(url); 58 59 if (force) { 60 content.setContentType(contentType); 61 } else { 62 contentType = content.getContentType(); 63 } 64 65 if (contentType == null) { 66 System.err.println(""); 67 System.exit(-1); 68 } 69 70 LOG.info("parsing: "+url); 71 LOG.info("contentType: "+contentType); 72 73 Parser parser = ParserFactory.getParser(contentType, url); 74 Parse parse = parser.getParse(content); 75 76 System.out.print("---------\nParseData\n---------\n"); 77 System.out.print(parse.getData().toString()); 78 if (dumpText) { 79 System.out.print("---------\nParseText\n---------\n"); 80 System.out.print(parse.getText()); 81 } 82 83 System.exit(0); 84 } 85 } 86 | Popular Tags |