KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > clustering > carrot2 > LocalNutchInputComponent


/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.clustering.carrot2;

import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import net.nutch.searcher.HitDetails;

import com.dawidweiss.carrot.core.local.LocalInputComponentBase;
import com.dawidweiss.carrot.core.local.ProcessingException;
import com.dawidweiss.carrot.core.local.RequestContext;
import com.dawidweiss.carrot.core.local.clustering.*;
25 /**
26  * A local input component that ignores the query passed from the
27  * controller and instead looks for data stored in the request context.
28  * This enables us to reuse the same physical component implementation
29  * for data that has already been acquired from Nutch.
30  *
31  * @author Dawid Weiss
32  * @version $Id: LocalNutchInputComponent.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
33  */

34 public class LocalNutchInputComponent extends LocalInputComponentBase {
35   public final static String JavaDoc NUTCH_INPUT_HIT_DETAILS_ARRAY
36     = "NUTCH_INPUT_HIT_DETAILS_ARRAY";
37
38   public final static String JavaDoc NUTCH_INPUT_SUMMARIES_ARRAY
39     = "NUTCH_INPUT_SUMMARIES_ARRAY";
40
41   /** Capabilities required from the next component in the chain */
42   private final static Set JavaDoc SUCCESSOR_CAPABILITIES
43     = new HashSet JavaDoc(Arrays.asList(new Object JavaDoc [] { RawDocumentsConsumer.class }));
44
45   /** This component's capabilities */
46   private final static Set JavaDoc COMPONENT_CAPABILITIES
47     = new HashSet JavaDoc(Arrays.asList(new Object JavaDoc [] { RawDocumentsProducer.class }));
48     
49   /*
50    * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
51    */

52   public void setQuery(String JavaDoc query) {
53       // ignore the query; data will be provided from the request context.
54
}
55
56   /** A callback hook that starts the processing. */
57   public void startProcessing(RequestContext context) throws ProcessingException {
58     // let successor components know that the processing has started.
59
super.startProcessing(context);
60     
61     // get the information about documents from the context.
62
Map JavaDoc params = context.getRequestParameters();
63     HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
64     String JavaDoc [] summaries = (String JavaDoc[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
65     
66     if (details == null)
67       throw new ProcessingException("Details array must not be null.");
68
69     if (summaries == null)
70       throw new ProcessingException("Summaries array must not be null.");
71     
72     if (summaries.length != details.length)
73       throw new ProcessingException("Summaries and details must be of the same length.");
74     
75     RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
76     
77     // produce 'documents' for successor components.
78
for (int i=0;i<summaries.length;i++) {
79       consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i])));
80     }
81   }
82
83   /**
84    * Returns the capabilities provided by this component.
85    */

86   public Set JavaDoc getComponentCapabilities() {
87     return COMPONENT_CAPABILITIES;
88   }
89     
90   /**
91    * Returns the capabilities required from the successor component.
92    */

93   public Set JavaDoc getRequiredSuccessorCapabilities() {
94     return SUCCESSOR_CAPABILITIES;
95   }
96
97   // --- The methods below, plus dependency on the Nekohtml parser
98
// are only required because Nutch's summaries are in HTML by default.
99
// I guess it would be possible to get rid of the code below by
100
// adding patches/ methods to Nutch that return plain text summaries.
101
//
102
// The temporary quick-and-dirty solution below has been provided by Doug, thanks.
103

104   /**
105    * The text buffer for plain text.
106    */

107   private StringBuffer JavaDoc textBuffer = new StringBuffer JavaDoc();
108     
109   /**
110    * A parser that will convert html to plain text.
111    */

112   private AbstractSAXParser parser;
113
114   /*
115    * Anonymous initialization of the parser. Since we declared
116    * the current solution to be quick and dirty, it doesn't have
117    * to be in the constructor :)
118    */

119   {
120     try {
121       parser = new AbstractSAXParser(new HTMLConfiguration()){};
122       parser.setContentHandler(new DefaultHandler JavaDoc() {
123           public void characters(char[] chars, int start, int length)
124             throws SAXException JavaDoc {
125             textBuffer.append(chars, start, length);
126           }
127         });
128     } catch (Exception JavaDoc e) {
129       throw new RuntimeException JavaDoc(e.toString(), e);
130     }
131   }
132
133   /**
134    * Converts a html chunk to plain text.
135    */

136   private final String JavaDoc htmlToText(String JavaDoc html) {
137     textBuffer.setLength(0);
138     try {
139       parser.parse(new InputSource JavaDoc(new StringReader JavaDoc(html)));
140     } catch (Exception JavaDoc e) { // shouldn't happen
141
throw new RuntimeException JavaDoc(e.toString(), e);
142     }
143     return textBuffer.toString();
144   }
145 }
146
Popular Tags