KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > clustering > carrot2 > Clusterer


/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.clustering.carrot2;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import net.nutch.clustering.HitsCluster;
import net.nutch.clustering.OnlineClusterer;
import net.nutch.searcher.HitDetails;
import com.dawidweiss.carrot.core.local.*;
import com.dawidweiss.carrot.core.local.clustering.RawCluster;
import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
import com.dawidweiss.carrot.util.tokenizer.SnippetTokenizerLocalFilterComponent;
import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;

import com.dawidweiss.carrot.util.tokenizer.languages.dutch.Dutch;
import com.dawidweiss.carrot.util.tokenizer.languages.english.English;
import com.dawidweiss.carrot.util.tokenizer.languages.french.French;
import com.dawidweiss.carrot.util.tokenizer.languages.german.German;
import com.dawidweiss.carrot.util.tokenizer.languages.italian.Italian;
import com.dawidweiss.carrot.util.tokenizer.languages.spanish.Spanish;
import com.dawidweiss.carrot.core.local.linguistic.Language;


30 /**
31  * An plugin providing an implementation of {@link OnlineClusterer} extension
32  * using clustering components of the Carrot2 project
33  * (<a HREF="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
34  *
35  * @author Dawid Weiss
36  * @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
37  */

38 public class Clusterer implements OnlineClusterer {
39   private final LocalController controller;
40
41   /**
42    * An empty public constructor for making new instances
43    * of the clusterer.
44    */

45   public Clusterer() {
46     controller = new LocalControllerBase();
47     addComponentFactories();
48     addProcesses();
49   }
50
51   /** Adds the required component factories to a local Carrot2 controller. */
52   private void addComponentFactories() {
53     // Local nutch input component
54
LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
55       public LocalComponent getInstance() {
56         return new LocalNutchInputComponent();
57       }
58     };
59     controller.addLocalComponentFactory("input.localnutch", nutchInputFactory);
60     
61     // Cluster consumer output component
62
LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
63       public LocalComponent getInstance() {
64         return new ClustersConsumerOutputComponent();
65       }
66     };
67     controller.addLocalComponentFactory("output.cluster-consumer",
68       clusterConsumerOutputFactory);
69     
70     // Clustering component here.
71
LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
72       public LocalComponent getInstance() {
73         HashMap JavaDoc defaults = new HashMap JavaDoc();
74         
75         // These are adjustments settings for the clustering algorithm...
76
// You can play with them, but the values below are our 'best guess'
77
// settings that we acquired experimentally.
78
defaults.put("lsi.threshold.clusterAssignment", "0.150");
79         defaults.put("lsi.threshold.candidateCluster", "0.775");
80
81         // TODO: this should be eventually replaced with documents from Nutch
82
// tagged with a language tag. There is no need to again determine
83
// the language of a document.
84
return new LingoLocalFilterComponent(
85           // If you want to include Polish in the list of supported languages,
86
// you have to download a separate Carrot2-component called
87
// carrot2-stemmer-lametyzator.jar, put it in classpath
88
// and add new Polish() below.
89
new Language[]
90           {
91             new English(),
92             new Dutch(),
93             new French(),
94             new German(),
95             new Italian(),
96             new Spanish()
97           }, defaults);
98       }
99     };
100     controller.addLocalComponentFactory("filter.lingo-old", lingoFactory);
101   }
102
103   /** Adds a clustering process to the local controller */
104   private void addProcesses() {
105     LocalProcessBase lingoNMFKM3
106       = new LocalProcessBase(
107         "input.localnutch",
108         "output.cluster-consumer",
109         new String JavaDoc [] {"filter.lingo-old"},
110         "Example the Lingo clustering algorithm.",
111         "");
112
113     try {
114       controller.addProcess("lingo-nmf-km-3", lingoNMFKM3);
115     } catch (Exception JavaDoc e) {
116       throw new RuntimeException JavaDoc("Could not assemble clustering process.", e);
117     }
118   }
119   
120   /**
121    * See {@link OnlineClusterer} for documentation.
122    */

123   public HitsCluster [] clusterHits(HitDetails [] hitDetails, String JavaDoc [] descriptions) {
124     Map JavaDoc requestParams = new HashMap JavaDoc();
125     requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
126       hitDetails);
127     requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
128       descriptions);
129     try {
130       ProcessingResult result =
131         controller.query("lingo-nmf-km-3", "pseudo-query", requestParams);
132
133       ClustersConsumerOutputComponent.Result output =
134         (ClustersConsumerOutputComponent.Result) result.getQueryResult();
135
136       List JavaDoc outputClusters = output.clusters;
137       HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
138
139       int j = 0;
140       for (Iterator JavaDoc i = outputClusters.iterator(); i.hasNext(); j++) {
141         RawCluster rcluster = (RawCluster) i.next();
142         clusters[j] = new HitsClusterAdapter(rcluster, hitDetails);
143       }
144
145       // invoke Carrot2 process here.
146
return clusters;
147     } catch (MissingProcessException e) {
148       throw new RuntimeException JavaDoc("Missing clustering process.", e);
149     } catch (Exception JavaDoc e) {
150       throw new RuntimeException JavaDoc("Unidentified problems with the clustering.", e);
151     }
152   }
153 }
154
Popular Tags