KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > openharmonise > rm > search > HarmoniseAnalyzer


1 /*
2  * The contents of this file are subject to the
3  * Mozilla Public License Version 1.1 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at http://www.mozilla.org/MPL/
6  *
7  * Software distributed under the License is distributed on an "AS IS"
8  * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
9  * See the License for the specific language governing rights and
10  * limitations under the License.
11  *
12  * The Initial Developer of the Original Code is Simulacra Media Ltd.
13  * Portions created by Simulacra Media Ltd are Copyright (C) Simulacra Media Ltd, 2004.
14  *
15  * All Rights Reserved.
16  *
17  * Contributor(s):
18  */

19 package org.openharmonise.rm.search;
20 import java.io.*;
21 import java.util.*;
22
23 import org.apache.lucene.analysis.*;
24
25 /**
26  * <p>Extension of Lucene <code>Analyzer</code> for use with Harmonise.
27  *
28  * <p>This is a simple extension of the Lucene <code>Analyzer</code> class to meet the
29  * requirements for more sophisticated text indexing, in particular for use with the Becta
30  * VocabManager application.</p>
31  *
32  * <p>It provides an implementation that uses Lucene's built in implementation of the Porter
33  * stemming algorithm based on lower case tokenization of the input text.</p>
34  *
35  * <p><em>Note:</em> this algorithm is designed specifically for English language text. The stemmer
36  * works uses basic knowledge of English morphology. It is <em>not</em> suitable for other
37  * human languages!</p>
38  *
39  * @author John King
40  * @version $Revision: 1.3 $
41  *
42  */

43 public class HarmoniseAnalyzer extends Analyzer {
44
45     private Set stopWordsSet;
46
47     public HarmoniseAnalyzer() {
48         stopWordsSet = StopFilter.makeStopSet(SMART_STOP_WORDS);
49     }
50
51     /**
52      * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
53      */

54     public TokenStream tokenStream(String JavaDoc fieldName, Reader reader) {
55         return new PorterStemFilter(new StopFilter(new LowerCaseTokenizer(reader), stopWordsSet));
56     }
57     
58     /**
59      * Returns <code>true</code> if the specified <code>String</code>
60      * contains a stop word
61      *
62      * @param sArg a <code>String</code> to be inspected for stop words
63      * @return<code>true</code> if the specified <code>String</code>
64      * contains a stop word
65      */

66     public static boolean containsStopWord(String JavaDoc sArg) {
67         boolean bContainsStop = false;
68         
69         List list = Arrays.asList(SMART_STOP_WORDS);
70         
71         StringTokenizer tokenizer = new StringTokenizer(sArg);
72         
73         while (tokenizer.hasMoreTokens()) {
74             String JavaDoc sToken = tokenizer.nextToken();
75             
76             if(list.contains(sToken) == true) {
77                 bContainsStop = true;
78                 break;
79             }
80         }
81         
82         return bContainsStop;
83     }
84     
85     /**
86      * Returns the array of stop words used by this <code>Analyzer</code>
87      *
88      * @return the array of stop words used by this <code>Analyzer</code>
89      */

90     public static String JavaDoc[] getStopWords() {
91         return SMART_STOP_WORDS;
92     }
93
94     /*
95      * The list of stop words below is taken from http://www.onjava.com/onjava/2003/01/15/examples/EnglishStopWords.txt
96      * as suggested by Otis Gospodnetic's article at http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html?page=1
97      *
98      * All licensed as per Apache License, and therefore ok for inclusion here.
99      */

100
101     // Contributed to Lucene on 2002-09-21 by "John Caron" <caron@unidata.ucar.edu>
102
//
103
// See also:
104
// org.apache.lucene.analysis.StopFilter.makeStopTable()
105
// ftp://ftp.cs.cornell.edu/pub/smart/
106

107     private static final String JavaDoc SMART_STOP_WORDS[] =
108         {
109             "a",
110             "able",
111             "about",
112             "above",
113             "according",
114             "accordingly",
115             "across",
116             "actually",
117             "after",
118             "afterwards",
119             "again",
120             "against",
121             "all",
122             "allow",
123             "allows",
124             "almost",
125             "alone",
126             "along",
127             "already",
128             "also",
129             "although",
130             "always",
131             "am",
132             "among",
133             "amongst",
134             "an",
135             "and",
136             "another",
137             "any",
138             "anybody",
139             "anyhow",
140             "anyone",
141             "anything",
142             "anyway",
143             "anyways",
144             "anywhere",
145             "apart",
146             "appear",
147             "appreciate",
148             "appropriate",
149             "are",
150             "around",
151             "as",
152             "aside",
153             "ask",
154             "asking",
155             "associated",
156             "at",
157             "available",
158             "away",
159             "awfully",
160             "b",
161             "be",
162             "became",
163             "because",
164             "become",
165             "becomes",
166             "becoming",
167             "been",
168             "before",
169             "beforehand",
170             "behind",
171             "being",
172             "believe",
173             "below",
174             "beside",
175             "besides",
176             "best",
177             "better",
178             "between",
179             "beyond",
180             "both",
181             "brief",
182             "but",
183             "by",
184             "c",
185             "came",
186             "can",
187             "cannot",
188             "cant",
189             "cause",
190             "causes",
191             "certain",
192             "certainly",
193             "changes",
194             "clearly",
195             "co",
196             "com",
197             "come",
198             "comes",
199             "concerning",
200             "consequently",
201             "consider",
202             "considering",
203             "contain",
204             "containing",
205             "contains",
206             "corresponding",
207             "could",
208             "course",
209             "currently",
210             "d",
211             "definitely",
212             "described",
213             "despite",
214             "did",
215             "different",
216             "do",
217             "does",
218             "doing",
219             "done",
220             "down",
221             "downwards",
222             "during",
223             "e",
224             "each",
225             "edu",
226             "eg",
227             "eight",
228             "either",
229             "else",
230             "elsewhere",
231             "enough",
232             "entirely",
233             "especially",
234             "et",
235             "etc",
236             "even",
237             "ever",
238             "every",
239             "everybody",
240             "everyone",
241             "everything",
242             "everywhere",
243             "ex",
244             "exactly",
245             "example",
246             "except",
247             "f",
248             "far",
249             "few",
250             "fifth",
251             "first",
252             "five",
253             "followed",
254             "following",
255             "follows",
256             "for",
257             "former",
258             "formerly",
259             "forth",
260             "four",
261             "from",
262             "further",
263             "furthermore",
264             "g",
265             "get",
266             "gets",
267             "getting",
268             "given",
269             "gives",
270             "go",
271             "goes",
272             "going",
273             "gone",
274             "got",
275             "gotten",
276             "greetings",
277             "h",
278             "had",
279             "happens",
280             "hardly",
281             "has",
282             "have",
283             "having",
284             "he",
285             "hello",
286             "help",
287             "hence",
288             "her",
289             "here",
290             "hereafter",
291             "hereby",
292             "herein",
293             "hereupon",
294             "hers",
295             "herself",
296             "hi",
297             "him",
298             "himself",
299             "his",
300             "hither",
301             "hopefully",
302             "how",
303             "howbeit",
304             "however",
305             "i",
306             "ie",
307             "if",
308             "ignored",
309             "immediate",
310             "in",
311             "inasmuch",
312             "inc",
313             "indeed",
314             "indicate",
315             "indicated",
316             "indicates",
317             "inner",
318             "insofar",
319             "instead",
320             "into",
321             "inward",
322             "is",
323             "it",
324             "its",
325             "itself",
326             "j",
327             "just",
328             "k",
329             "keep",
330             "keeps",
331             "kept",
332             "know",
333             "knows",
334             "known",
335             "l",
336             "last",
337             "lately",
338             "later",
339             "latter",
340             "latterly",
341             "least",
342             "less",
343             "lest",
344             "let",
345             "like",
346             "liked",
347             "likely",
348             "little",
349             "look",
350             "looking",
351             "looks",
352             "ltd",
353             "m",
354             "mainly",
355             "many",
356             "may",
357             "maybe",
358             "me",
359             "mean",
360             "meanwhile",
361             "merely",
362             "might",
363             "more",
364             "moreover",
365             "most",
366             "mostly",
367             "much",
368             "must",
369             "my",
370             "myself",
371             "n",
372             "name",
373             "namely",
374             "nd",
375             "near",
376             "nearly",
377             "necessary",
378             "need",
379             "needs",
380             "neither",
381             "never",
382             "nevertheless",
383             "new",
384             "next",
385             "nine",
386             "no",
387             "nobody",
388             "non",
389             "none",
390             "noone",
391             "nor",
392             "normally",
393             "not",
394             "nothing",
395             "novel",
396             "now",
397             "nowhere",
398             "o",
399             "obviously",
400             "of",
401             "off",
402             "often",
403             "oh",
404             "ok",
405             "okay",
406             "old",
407             "on",
408             "once",
409             "one",
410             "ones",
411             "only",
412             "onto",
413             "or",
414             "other",
415             "others",
416             "otherwise",
417             "ought",
418             "our",
419             "ours",
420             "ourselves",
421             "out",
422             "outside",
423             "over",
424             "overall",
425             "own",
426             "p",
427             "particular",
428             "particularly",
429             "per",
430             "perhaps",
431             "placed",
432             "please",
433             "plus",
434             "possible",
435             "presumably",
436             "probably",
437             "provides",
438             "q",
439             "que",
440             "quite",
441             "qv",
442             "r",
443             "rather",
444             "rd",
445             "re",
446             "really",
447             "reasonably",
448             "regarding",
449             "regardless",
450             "regards",
451             "relatively",
452             "respectively",
453             "right",
454             "s",
455             "said",
456             "same",
457             "saw",
458             "say",
459             "saying",
460             "says",
461             "second",
462             "secondly",
463             "see",
464             "seeing",
465             "seem",
466             "seemed",
467             "seeming",
468             "seems",
469             "seen",
470             "self",
471             "selves",
472             "sensible",
473             "sent",
474             "serious",
475             "seriously",
476             "seven",
477             "several",
478             "shall",
479             "she",
480             "should",
481             "since",
482             "six",
483             "so",
484             "some",
485             "somebody",
486             "somehow",
487             "someone",
488             "something",
489             "sometime",
490             "sometimes",
491             "somewhat",
492             "somewhere",
493             "soon",
494             "sorry",
495             "specified",
496             "specify",
497             "specifying",
498             "still",
499             "sub",
500             "such",
501             "sup",
502             "sure",
503             "t",
504             "take",
505             "taken",
506             "tell",
507             "tends",
508             "th",
509             "than",
510             "thank",
511             "thanks",
512             "thanx",
513             "that",
514             "thats",
515             "the",
516             "their",
517             "theirs",
518             "them",
519             "themselves",
520             "then",
521             "thence",
522             "there",
523             "thereafter",
524             "thereby",
525             "therefore",
526             "therein",
527             "theres",
528             "thereupon",
529             "these",
530             "they",
531             "think",
532             "third",
533             "this",
534             "thorough",
535             "thoroughly",
536             "those",
537             "though",
538             "three",
539             "through",
540             "throughout",
541             "thru",
542             "thus",
543             "to",
544             "together",
545             "too",
546             "took",
547             "toward",
548             "towards",
549             "tried",
550             "tries",
551             "truly",
552             "try",
553             "trying",
554             "twice",
555             "two",
556             "u",
557             "un",
558             "under",
559             "unfortunately",
560             "unless",
561             "unlikely",
562             "until",
563             "unto",
564             "up",
565             "upon",
566             "us",
567             "use",
568             "used",
569             "useful",
570             "uses",
571             "using",
572             "usually",
573             "uucp",
574             "v",
575             "value",
576             "various",
577             "very",
578             "via",
579             "viz",
580             "vs",
581             "w",
582             "want",
583             "wants",
584             "was",
585             "way",
586             "we",
587             "welcome",
588             "well",
589             "went",
590             "were",
591             "what",
592             "whatever",
593             "when",
594             "whence",
595             "whenever",
596             "where",
597             "whereafter",
598             "whereas",
599             "whereby",
600             "wherein",
601             "whereupon",
602             "wherever",
603             "whether",
604             "which",
605             "while",
606             "whither",
607             "who",
608             "whoever",
609             "whole",
610             "whom",
611             "whose",
612             "why",
613             "will",
614             "willing",
615             "wish",
616             "with",
617             "within",
618             "without",
619             "wonder",
620             "would",
621             "would",
622             "x",
623             "y",
624             "yes",
625             "yet",
626             "you",
627             "your",
628             "yours",
629             "yourself",
630             "yourselves",
631             "z",
632             "zero" };
633
634 }
635
Popular Tags