1 2 3 4 package net.nutch.clustering.carrot2; 5 6 import java.io.StringReader ; 7 import java.util.Arrays ; 8 import java.util.HashSet ; 9 import java.util.Set ; 10 import java.util.Map ; 11 12 import org.apache.xerces.parsers.AbstractSAXParser; 13 import org.cyberneko.html.HTMLConfiguration; 14 import org.xml.sax.InputSource ; 15 import org.xml.sax.SAXException ; 16 import org.xml.sax.helpers.DefaultHandler ; 17 18 import net.nutch.searcher.HitDetails; 19 20 import com.dawidweiss.carrot.core.local.LocalInputComponentBase; 21 import com.dawidweiss.carrot.core.local.ProcessingException; 22 import com.dawidweiss.carrot.core.local.RequestContext; 23 import com.dawidweiss.carrot.core.local.clustering.*; 24 25 34 public class LocalNutchInputComponent extends LocalInputComponentBase { 35 public final static String NUTCH_INPUT_HIT_DETAILS_ARRAY 36 = "NUTCH_INPUT_HIT_DETAILS_ARRAY"; 37 38 public final static String NUTCH_INPUT_SUMMARIES_ARRAY 39 = "NUTCH_INPUT_SUMMARIES_ARRAY"; 40 41 42 private final static Set SUCCESSOR_CAPABILITIES 43 = new HashSet (Arrays.asList(new Object [] { RawDocumentsConsumer.class })); 44 45 46 private final static Set COMPONENT_CAPABILITIES 47 = new HashSet (Arrays.asList(new Object [] { RawDocumentsProducer.class })); 48 49 52 public void setQuery(String query) { 53 } 55 56 57 public void startProcessing(RequestContext context) throws ProcessingException { 58 super.startProcessing(context); 60 61 Map params = context.getRequestParameters(); 63 HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY); 64 String [] summaries = (String []) params.get(NUTCH_INPUT_SUMMARIES_ARRAY); 65 66 if (details == null) 67 throw new ProcessingException("Details array must not be null."); 68 69 if (summaries == null) 70 throw new ProcessingException("Summaries array must not be null."); 71 72 if (summaries.length != details.length) 73 throw new ProcessingException("Summaries and details must be of the same length."); 74 75 RawDocumentsConsumer consumer = (RawDocumentsConsumer) next; 76 77 for (int i=0;i<summaries.length;i++) { 79 consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i]))); 80 } 81 } 82 83 86 public Set getComponentCapabilities() { 87 return COMPONENT_CAPABILITIES; 88 } 89 90 93 public Set getRequiredSuccessorCapabilities() { 94 return SUCCESSOR_CAPABILITIES; 95 } 96 97 104 107 private StringBuffer textBuffer = new StringBuffer (); 108 109 112 private AbstractSAXParser parser; 113 114 119 { 120 try { 121 parser = new AbstractSAXParser(new HTMLConfiguration()){}; 122 parser.setContentHandler(new DefaultHandler () { 123 public void characters(char[] chars, int start, int length) 124 throws SAXException { 125 textBuffer.append(chars, start, length); 126 } 127 }); 128 } catch (Exception e) { 129 throw new RuntimeException (e.toString(), e); 130 } 131 } 132 133 136 private final String htmlToText(String html) { 137 textBuffer.setLength(0); 138 try { 139 parser.parse(new InputSource (new StringReader (html))); 140 } catch (Exception e) { throw new RuntimeException (e.toString(), e); 142 } 143 return textBuffer.toString(); 144 } 145 } 146 | Popular Tags |