LocalNutchInputComponent


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.clustering.carrot2;
5   
6   import java.io.StringReader  ;
7   import java.util.Arrays  ;
8   import java.util.HashSet  ;
9   import java.util.Set  ;
10  import java.util.Map  ;
11  
12  import org.apache.xerces.parsers.AbstractSAXParser;
13  import org.cyberneko.html.HTMLConfiguration;
14  import org.xml.sax.InputSource  ;
15  import org.xml.sax.SAXException  ;
16  import org.xml.sax.helpers.DefaultHandler  ;
17  
18  import net.nutch.searcher.HitDetails;
19  
20  import com.dawidweiss.carrot.core.local.LocalInputComponentBase;
21  import com.dawidweiss.carrot.core.local.ProcessingException;
22  import com.dawidweiss.carrot.core.local.RequestContext;
23  import com.dawidweiss.carrot.core.local.clustering.*;
24  
25  /**
26   * A local input component that ignores the query passed from the
27   * controller and instead looks for data stored in the request context.
28   * This enables us to reuse the same physical component implementation
29   * for data that has already been acquired from Nutch.    
30   *
31   * @author Dawid Weiss
32   * @version $Id: LocalNutchInputComponent.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
33   */
34  public class LocalNutchInputComponent extends LocalInputComponentBase {
35    public final static String   NUTCH_INPUT_HIT_DETAILS_ARRAY
36      = "NUTCH_INPUT_HIT_DETAILS_ARRAY";
37  
38    public final static String   NUTCH_INPUT_SUMMARIES_ARRAY 
39      = "NUTCH_INPUT_SUMMARIES_ARRAY";
40  
41    /** Capabilities required from the next component in the chain */
42    private final static Set   SUCCESSOR_CAPABILITIES 
43      = new HashSet  (Arrays.asList(new Object   [] { RawDocumentsConsumer.class }));
44  
45    /** This component's capabilities */
46    private final static Set   COMPONENT_CAPABILITIES 
47      = new HashSet  (Arrays.asList(new Object   [] { RawDocumentsProducer.class }));
48      
49    /*
50     * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
51     */
52    public void setQuery(String   query) {
53        // ignore the query; data will be provided from the request context.
54    }
55  
56    /** A callback hook that starts the processing. */
57    public void startProcessing(RequestContext context) throws ProcessingException {
58      // let successor components know that the processing has started.
59      super.startProcessing(context);
60      
61      // get the information about documents from the context.
62      Map   params = context.getRequestParameters();
63      HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
64      String   [] summaries = (String  []) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
65      
66      if (details == null)
67        throw new ProcessingException("Details array must not be null.");
68  
69      if (summaries == null)
70        throw new ProcessingException("Summaries array must not be null.");
71      
72      if (summaries.length != details.length)
73        throw new ProcessingException("Summaries and details must be of the same length.");
74      
75      RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
76      
77      // produce 'documents' for successor components.
78      for (int i=0;i<summaries.length;i++) {
79        consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i])));
80      }
81    }
82  
83    /**
84     * Returns the capabilities provided by this component.
85     */
86    public Set   getComponentCapabilities() {
87      return COMPONENT_CAPABILITIES;
88    }
89      
90    /**
91     * Returns the capabilities required from the successor component.
92     */
93    public Set   getRequiredSuccessorCapabilities() {
94      return SUCCESSOR_CAPABILITIES;
95    }
96  
97    // --- The methods below, plus dependency on the Nekohtml parser
98    // are only required because Nutch's summaries are in HTML by default.
99    // I guess it would be possible to get rid of the code below by
100   // adding patches/ methods to Nutch that return plain text summaries.
101   // 
102   // The temporary quick-and-dirty solution below has been provided by Doug, thanks. 
103 
104   /**
105    * The text buffer for plain text. 
106    */
107   private StringBuffer   textBuffer = new StringBuffer  ();
108     
109   /**
110    * A parser that will convert html to plain text.
111    */
112   private AbstractSAXParser parser;
113 
114   /*
115    * Anonymous initialization of the parser. Since we declared
116    * the current solution to be quick and dirty, it doesn't have
117    * to be in the constructor :)
118    */
119   {
120     try {
121       parser = new AbstractSAXParser(new HTMLConfiguration()){};
122       parser.setContentHandler(new DefaultHandler  () {
123           public void characters(char[] chars, int start, int length)
124             throws SAXException   {
125             textBuffer.append(chars, start, length);
126           }
127         });
128     } catch (Exception   e) {
129       throw new RuntimeException  (e.toString(), e);
130     }
131   }
132 
133   /**
134    * Converts a html chunk to plain text.
135    */
136   private final String   htmlToText(String   html) {
137     textBuffer.setLength(0);
138     try {
139       parser.parse(new InputSource  (new StringReader  (html)));
140     } catch (Exception   e) {                     // shouldn't happen
141       throw new RuntimeException  (e.toString(), e);
142     }
143     return textBuffer.toString();
144   }
145 }
146
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags