HarmoniseAnalyzer


1   /*
2    * The contents of this file are subject to the 
3    * Mozilla Public License Version 1.1 (the "License"); 
4    * you may not use this file except in compliance with the License. 
5    * You may obtain a copy of the License at http://www.mozilla.org/MPL/
6    *
7    * Software distributed under the License is distributed on an "AS IS"
8    * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. 
9    * See the License for the specific language governing rights and 
10   * limitations under the License.
11   *
12   * The Initial Developer of the Original Code is Simulacra Media Ltd.
13   * Portions created by Simulacra Media Ltd are Copyright (C) Simulacra Media Ltd, 2004.
14   *
15   * All Rights Reserved.
16   *
17   * Contributor(s):
18   */
19  package org.openharmonise.rm.search;
20  import java.io.*;
21  import java.util.*;
22  
23  import org.apache.lucene.analysis.*;
24  
25  /**
26   * <p>Extension of Lucene <code>Analyzer</code> for use with Harmonise.</p>
27   * 
28   * <p>This is a simple extension of the Lucene <code>Analyzer</code> class to meet the 
29   * requirements for more sophisticated text indexing, in particular for use with the Becta
30   * VocabManager application.</p>
31   * 
32   * <p>It provides an implementation that uses Lucene's built in implementation of the Porter 
33   * stemming algorithm based on lower case tokenization of the input text.</p>
34   * 
35   * <p><em>Note:</em> this algorithm is designed specifically for English language text. The stemmer
36   * uses basic knowledge of English morphology. It is <em>not</em> suitable for other 
37   * human languages!</p> 
38   * 
39   * @author John King
40   * @version $Revision: 1.3 $
41   *
42   */
43  public class HarmoniseAnalyzer extends Analyzer {
44  
45      private Set stopWordsSet;
46  
47      public HarmoniseAnalyzer() {
48          stopWordsSet = StopFilter.makeStopSet(SMART_STOP_WORDS);
49      }
50  
51      /**
52       * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
53       */
54      public TokenStream tokenStream(String   fieldName, Reader reader) {
55          return new PorterStemFilter(new StopFilter(new LowerCaseTokenizer(reader), stopWordsSet));
56      }
57      
58      /**
59       * Returns <code>true</code> if the specified <code>String</code>
60       * contains a stop word
61       * 
62       * @param sArg a <code>String</code> to be inspected for stop words
63       * @return<code>true</code> if the specified <code>String</code>
64       * contains a stop word
65       */
66      public static boolean containsStopWord(String   sArg) {
67          boolean bContainsStop = false;
68          
69          List list = Arrays.asList(SMART_STOP_WORDS);
70          
71          StringTokenizer tokenizer = new StringTokenizer(sArg);
72          
73          while (tokenizer.hasMoreTokens()) {
74              String   sToken = tokenizer.nextToken();
75              
76              if(list.contains(sToken) == true) {
77                  bContainsStop = true;
78                  break;
79              }
80          }
81          
82          return bContainsStop;
83      }
84      
85      /**
86       * Returns the array of stop words used by this <code>Analyzer</code>
87       * 
88       * @return the array of stop words used by this <code>Analyzer</code>
89       */
90      public static String  [] getStopWords() {
91          return SMART_STOP_WORDS;
92      }
93  
94      /*
95       * The list of stop words below is taken from http://www.onjava.com/onjava/2003/01/15/examples/EnglishStopWords.txt
96       * as suggested by Otis Gospodnetic's article at http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html?page=1
97       * 
98       * All licensed as per Apache License, and therefore ok for inclusion here.
99       */
100 
101     //  Contributed to Lucene on 2002-09-21 by "John Caron" <caron@unidata.ucar.edu>
102     //
103     //  See also:
104     //    org.apache.lucene.analysis.StopFilter.makeStopTable()
105     //    ftp://ftp.cs.cornell.edu/pub/smart/
106 
107     private static final String   SMART_STOP_WORDS[] =
108         {
109             "a",
110             "able",
111             "about",
112             "above",
113             "according",
114             "accordingly",
115             "across",
116             "actually",
117             "after",
118             "afterwards",
119             "again",
120             "against",
121             "all",
122             "allow",
123             "allows",
124             "almost",
125             "alone",
126             "along",
127             "already",
128             "also",
129             "although",
130             "always",
131             "am",
132             "among",
133             "amongst",
134             "an",
135             "and",
136             "another",
137             "any",
138             "anybody",
139             "anyhow",
140             "anyone",
141             "anything",
142             "anyway",
143             "anyways",
144             "anywhere",
145             "apart",
146             "appear",
147             "appreciate",
148             "appropriate",
149             "are",
150             "around",
151             "as",
152             "aside",
153             "ask",
154             "asking",
155             "associated",
156             "at",
157             "available",
158             "away",
159             "awfully",
160             "b",
161             "be",
162             "became",
163             "because",
164             "become",
165             "becomes",
166             "becoming",
167             "been",
168             "before",
169             "beforehand",
170             "behind",
171             "being",
172             "believe",
173             "below",
174             "beside",
175             "besides",
176             "best",
177             "better",
178             "between",
179             "beyond",
180             "both",
181             "brief",
182             "but",
183             "by",
184             "c",
185             "came",
186             "can",
187             "cannot",
188             "cant",
189             "cause",
190             "causes",
191             "certain",
192             "certainly",
193             "changes",
194             "clearly",
195             "co",
196             "com",
197             "come",
198             "comes",
199             "concerning",
200             "consequently",
201             "consider",
202             "considering",
203             "contain",
204             "containing",
205             "contains",
206             "corresponding",
207             "could",
208             "course",
209             "currently",
210             "d",
211             "definitely",
212             "described",
213             "despite",
214             "did",
215             "different",
216             "do",
217             "does",
218             "doing",
219             "done",
220             "down",
221             "downwards",
222             "during",
223             "e",
224             "each",
225             "edu",
226             "eg",
227             "eight",
228             "either",
229             "else",
230             "elsewhere",
231             "enough",
232             "entirely",
233             "especially",
234             "et",
235             "etc",
236             "even",
237             "ever",
238             "every",
239             "everybody",
240             "everyone",
241             "everything",
242             "everywhere",
243             "ex",
244             "exactly",
245             "example",
246             "except",
247             "f",
248             "far",
249             "few",
250             "fifth",
251             "first",
252             "five",
253             "followed",
254             "following",
255             "follows",
256             "for",
257             "former",
258             "formerly",
259             "forth",
260             "four",
261             "from",
262             "further",
263             "furthermore",
264             "g",
265             "get",
266             "gets",
267             "getting",
268             "given",
269             "gives",
270             "go",
271             "goes",
272             "going",
273             "gone",
274             "got",
275             "gotten",
276             "greetings",
277             "h",
278             "had",
279             "happens",
280             "hardly",
281             "has",
282             "have",
283             "having",
284             "he",
285             "hello",
286             "help",
287             "hence",
288             "her",
289             "here",
290             "hereafter",
291             "hereby",
292             "herein",
293             "hereupon",
294             "hers",
295             "herself",
296             "hi",
297             "him",
298             "himself",
299             "his",
300             "hither",
301             "hopefully",
302             "how",
303             "howbeit",
304             "however",
305             "i",
306             "ie",
307             "if",
308             "ignored",
309             "immediate",
310             "in",
311             "inasmuch",
312             "inc",
313             "indeed",
314             "indicate",
315             "indicated",
316             "indicates",
317             "inner",
318             "insofar",
319             "instead",
320             "into",
321             "inward",
322             "is",
323             "it",
324             "its",
325             "itself",
326             "j",
327             "just",
328             "k",
329             "keep",
330             "keeps",
331             "kept",
332             "know",
333             "knows",
334             "known",
335             "l",
336             "last",
337             "lately",
338             "later",
339             "latter",
340             "latterly",
341             "least",
342             "less",
343             "lest",
344             "let",
345             "like",
346             "liked",
347             "likely",
348             "little",
349             "look",
350             "looking",
351             "looks",
352             "ltd",
353             "m",
354             "mainly",
355             "many",
356             "may",
357             "maybe",
358             "me",
359             "mean",
360             "meanwhile",
361             "merely",
362             "might",
363             "more",
364             "moreover",
365             "most",
366             "mostly",
367             "much",
368             "must",
369             "my",
370             "myself",
371             "n",
372             "name",
373             "namely",
374             "nd",
375             "near",
376             "nearly",
377             "necessary",
378             "need",
379             "needs",
380             "neither",
381             "never",
382             "nevertheless",
383             "new",
384             "next",
385             "nine",
386             "no",
387             "nobody",
388             "non",
389             "none",
390             "noone",
391             "nor",
392             "normally",
393             "not",
394             "nothing",
395             "novel",
396             "now",
397             "nowhere",
398             "o",
399             "obviously",
400             "of",
401             "off",
402             "often",
403             "oh",
404             "ok",
405             "okay",
406             "old",
407             "on",
408             "once",
409             "one",
410             "ones",
411             "only",
412             "onto",
413             "or",
414             "other",
415             "others",
416             "otherwise",
417             "ought",
418             "our",
419             "ours",
420             "ourselves",
421             "out",
422             "outside",
423             "over",
424             "overall",
425             "own",
426             "p",
427             "particular",
428             "particularly",
429             "per",
430             "perhaps",
431             "placed",
432             "please",
433             "plus",
434             "possible",
435             "presumably",
436             "probably",
437             "provides",
438             "q",
439             "que",
440             "quite",
441             "qv",
442             "r",
443             "rather",
444             "rd",
445             "re",
446             "really",
447             "reasonably",
448             "regarding",
449             "regardless",
450             "regards",
451             "relatively",
452             "respectively",
453             "right",
454             "s",
455             "said",
456             "same",
457             "saw",
458             "say",
459             "saying",
460             "says",
461             "second",
462             "secondly",
463             "see",
464             "seeing",
465             "seem",
466             "seemed",
467             "seeming",
468             "seems",
469             "seen",
470             "self",
471             "selves",
472             "sensible",
473             "sent",
474             "serious",
475             "seriously",
476             "seven",
477             "several",
478             "shall",
479             "she",
480             "should",
481             "since",
482             "six",
483             "so",
484             "some",
485             "somebody",
486             "somehow",
487             "someone",
488             "something",
489             "sometime",
490             "sometimes",
491             "somewhat",
492             "somewhere",
493             "soon",
494             "sorry",
495             "specified",
496             "specify",
497             "specifying",
498             "still",
499             "sub",
500             "such",
501             "sup",
502             "sure",
503             "t",
504             "take",
505             "taken",
506             "tell",
507             "tends",
508             "th",
509             "than",
510             "thank",
511             "thanks",
512             "thanx",
513             "that",
514             "thats",
515             "the",
516             "their",
517             "theirs",
518             "them",
519             "themselves",
520             "then",
521             "thence",
522             "there",
523             "thereafter",
524             "thereby",
525             "therefore",
526             "therein",
527             "theres",
528             "thereupon",
529             "these",
530             "they",
531             "think",
532             "third",
533             "this",
534             "thorough",
535             "thoroughly",
536             "those",
537             "though",
538             "three",
539             "through",
540             "throughout",
541             "thru",
542             "thus",
543             "to",
544             "together",
545             "too",
546             "took",
547             "toward",
548             "towards",
549             "tried",
550             "tries",
551             "truly",
552             "try",
553             "trying",
554             "twice",
555             "two",
556             "u",
557             "un",
558             "under",
559             "unfortunately",
560             "unless",
561             "unlikely",
562             "until",
563             "unto",
564             "up",
565             "upon",
566             "us",
567             "use",
568             "used",
569             "useful",
570             "uses",
571             "using",
572             "usually",
573             "uucp",
574             "v",
575             "value",
576             "various",
577             "very",
578             "via",
579             "viz",
580             "vs",
581             "w",
582             "want",
583             "wants",
584             "was",
585             "way",
586             "we",
587             "welcome",
588             "well",
589             "went",
590             "were",
591             "what",
592             "whatever",
593             "when",
594             "whence",
595             "whenever",
596             "where",
597             "whereafter",
598             "whereas",
599             "whereby",
600             "wherein",
601             "whereupon",
602             "wherever",
603             "whether",
604             "which",
605             "while",
606             "whither",
607             "who",
608             "whoever",
609             "whole",
610             "whom",
611             "whose",
612             "why",
613             "will",
614             "willing",
615             "wish",
616             "with",
617             "within",
618             "without",
619             "wonder",
620             "would",
621             "would",
622             "x",
623             "y",
624             "yes",
625             "yet",
626             "you",
627             "your",
628             "yours",
629             "yourself",
630             "yourselves",
631             "z",
632             "zero" };
633 
634 }
635
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags