1 2 3 4 package net.nutch.clustering.carrot2; 5 6 import java.util.HashMap ; 7 import java.util.Map ; 8 import java.util.Set ; 9 import java.util.List ; 10 import java.util.Iterator ; 11 12 import net.nutch.clustering.HitsCluster; 13 import net.nutch.clustering.OnlineClusterer; 14 import net.nutch.searcher.HitDetails; 15 import com.dawidweiss.carrot.core.local.*; 16 import com.dawidweiss.carrot.core.local.clustering.RawCluster; 17 import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent; 18 import com.dawidweiss.carrot.util.tokenizer.SnippetTokenizerLocalFilterComponent; 19 import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent; 20 21 import com.dawidweiss.carrot.util.tokenizer.languages.dutch.Dutch; 22 import com.dawidweiss.carrot.util.tokenizer.languages.english.English; 23 import com.dawidweiss.carrot.util.tokenizer.languages.french.French; 24 import com.dawidweiss.carrot.util.tokenizer.languages.german.German; 25 import com.dawidweiss.carrot.util.tokenizer.languages.italian.Italian; 26 import com.dawidweiss.carrot.util.tokenizer.languages.spanish.Spanish; 27 import com.dawidweiss.carrot.core.local.linguistic.Language; 28 29 30 38 public class Clusterer implements OnlineClusterer { 39 private final LocalController controller; 40 41 45 public Clusterer() { 46 controller = new LocalControllerBase(); 47 addComponentFactories(); 48 addProcesses(); 49 } 50 51 52 private void addComponentFactories() { 53 LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() { 55 public LocalComponent getInstance() { 56 return new LocalNutchInputComponent(); 57 } 58 }; 59 controller.addLocalComponentFactory("input.localnutch", nutchInputFactory); 60 61 LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() { 63 public LocalComponent getInstance() { 64 return new ClustersConsumerOutputComponent(); 65 } 66 }; 67 controller.addLocalComponentFactory("output.cluster-consumer", 68 clusterConsumerOutputFactory); 69 70 LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() { 72 public LocalComponent getInstance() { 73 HashMap defaults = new HashMap (); 74 75 defaults.put("lsi.threshold.clusterAssignment", "0.150"); 79 defaults.put("lsi.threshold.candidateCluster", "0.775"); 80 81 return new LingoLocalFilterComponent( 85 new Language[] 90 { 91 new English(), 92 new Dutch(), 93 new French(), 94 new German(), 95 new Italian(), 96 new Spanish() 97 }, defaults); 98 } 99 }; 100 controller.addLocalComponentFactory("filter.lingo-old", lingoFactory); 101 } 102 103 104 private void addProcesses() { 105 LocalProcessBase lingoNMFKM3 106 = new LocalProcessBase( 107 "input.localnutch", 108 "output.cluster-consumer", 109 new String [] {"filter.lingo-old"}, 110 "Example the Lingo clustering algorithm.", 111 ""); 112 113 try { 114 controller.addProcess("lingo-nmf-km-3", lingoNMFKM3); 115 } catch (Exception e) { 116 throw new RuntimeException ("Could not assemble clustering process.", e); 117 } 118 } 119 120 123 public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) { 124 Map requestParams = new HashMap (); 125 requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY, 126 hitDetails); 127 requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY, 128 descriptions); 129 try { 130 ProcessingResult result = 131 controller.query("lingo-nmf-km-3", "pseudo-query", requestParams); 132 133 ClustersConsumerOutputComponent.Result output = 134 (ClustersConsumerOutputComponent.Result) result.getQueryResult(); 135 136 List outputClusters = output.clusters; 137 HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ]; 138 139 int j = 0; 140 for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) { 141 RawCluster rcluster = (RawCluster) i.next(); 142 clusters[j] = new HitsClusterAdapter(rcluster, hitDetails); 143 } 144 145 return clusters; 147 } catch (MissingProcessException e) { 148 throw new RuntimeException ("Missing clustering process.", e); 149 } catch (Exception e) { 150 throw new RuntimeException ("Unidentified problems with the clustering.", e); 151 } 152 } 153 } 154 | Popular Tags |