KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > impl > SequenceStore


1 /*
2  * SequenceStore.java: string, comment and special sequence handling in tokenizers
3  *
4  * Copyright (C) 2003, 2004 Heiko Blau
5  *
6  * This file belongs to the JTopas Library.
7  * JTopas is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with JTopas. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas.impl;
32
33 //-----------------------------------------------------------------------------
34
// Imports
35
//
36
import java.util.Iterator JavaDoc;
37 import java.util.TreeMap JavaDoc;
38 import java.util.NoSuchElementException JavaDoc;
39
40 import de.susebox.java.lang.ExtRuntimeException;
41
42 import de.susebox.jtopas.Token;
43 import de.susebox.jtopas.TokenizerProperty;
44 import de.susebox.jtopas.TokenizerProperties;
45 import de.susebox.jtopas.TokenizerException;
46
47 import de.susebox.jtopas.spi.SequenceHandler;
48 import de.susebox.jtopas.spi.KeywordHandler;
49 import de.susebox.jtopas.spi.DataProvider;
50
51
52 //-----------------------------------------------------------------------------
53
// Class SequenceStore
54
//
55

56 /**
57  * This class is used by {@link de.susebox.jtopas.StandardTokenizerProperties}
58  * to store and search special sequences, comments and strings as well as
59  * keywords. The class is not suitable for standalone use since it does not check
60  * parameters for <code>null</code> values, assumes a thread-safe context etc.
61  *
62  * @see de.susebox.jtopas.StandardTokenizerProperties
63  * @see de.susebox.jtopas.spi.SequenceHandler
64  * @author Heiko Blau
65  */

66 public class SequenceStore implements SequenceHandler, KeywordHandler {
67   
68   //---------------------------------------------------------------------------
69
// Constants
70
//
71

72   /**
73    * This number is the size of the array that is directly indexed by letters
74    */

75   public static char DIRECT_INDEX_COUNT = 256;
76   
77   
78   //---------------------------------------------------------------------------
79
// Constructors
80
//
81

82   /**
83    * The constructor initializes a <code>SequenceStore</code> with the given
84    * comparision policy (prefix comparison).
85    *
86    * @param useExactLength if <code>true</code> search only for a property that
87    * has the length of {@link de.susebox.jtopas.spi.DataProvider#getLength}
88    */

89   public SequenceStore(boolean useExactLength) {
90     _useExactLength = useExactLength;
91     _maxLength = 0;
92     _asciiArray = new PropertyList[DIRECT_INDEX_COUNT];
93     _nonASCIIMap = new TreeMap JavaDoc();
94   }
95
96   
97   //---------------------------------------------------------------------------
98
// Methods of the SequenceHandler interface
99
//
100

101   /**
102    * This method returns <code>true</code> if there are any special sequences,
103    * strings or comments registered in this instance. See the
104    * {@link de.susebox.jtopas.spi.SequenceHandler} interface for details.
105    *
106    * @return <code>true</code> if there are any special sequences, strings or
107    * comments available, <code>false</code> otherwise.
108    */

109   public boolean hasSequenceCommentOrString() {
110     return _maxLength > 0;
111   }
112   
113   /**
114    * This method checks if a given range of data starts with a special sequence,
115    * a comment or a string. See {@link de.susebox.jtopas.spi.SequenceHandler} for
116    * details.
117    *
118    * @param dataProvider the source to get the data range from
119    * @return a {@link de.susebox.jtopas.TokenizerProperty} if a special sequence,
120    * comment or string could be detected, <code>null</code> otherwise
121    * @throws TokenizerException generic exception
122    * @throws NullPointerException if no {@link DataProvider} is given
123    */

124   public TokenizerProperty startsWithSequenceCommentOrString(DataProvider dataProvider)
125     throws TokenizerException, NullPointerException JavaDoc
126   {
127     // only if characters are available
128
if (dataProvider.getLength() > 0) {
129       int len = dataProvider.getLength();
130       char startChar = getStartChar(dataProvider.getCharAt(0));
131       PropertyList list = getList(startChar);
132
133       while (list != null) {
134         TokenizerProperty prop = list._property;
135         String JavaDoc image = prop.getImages()[0];
136         int imageLen = image.length();
137
138         // compare only if the enough data is available
139
if (_useExactLength && imageLen < len) {
140           break; // dont check shorter properties
141
} else if (imageLen <= len && comparePrefix(image, dataProvider, 1) == 0) {
142           return prop; // single point of success
143
}
144         list = list._next;
145       }
146     }
147     
148     // not found
149
return null;
150   }
151
152   /**
153    * This method returns the length of the longest special sequence, comment or
154    * string prefix that is known to this <code>SequenceStore</code>. See
155    * {@link de.susebox.jtopas.spi.SequenceHandler} for details.
156    *
157    * @return the number of characters needed in the worst case to identify a
158    * special sequence
159    */

160   public int getSequenceMaxLength() {
161     return _maxLength;
162   }
163
164   //---------------------------------------------------------------------------
165
// Methods of the KeywordHandler interface
166
//
167

168   /**
169    * This method returns <code>true</code> if there are any keywords registered
170    * in this instance. See the {@link de.susebox.jtopas.spi.KeywordHandler}
171    * interface for details.
172    *
173    * @return <code>true</code> if there are any keywords available,
174    * <code>false</code> otherwise.
175    */

176   public boolean hasKeywords() {
177     // this classis is either used to store special sequences or keywords.
178
return hasSequenceCommentOrString();
179   }
180   
181   /**
182    * This method checks if the given data form a keyword.
183    * See {@link de.susebox.jtopas.spi.KeywordHandler} for details.
184    *
185    * @param dataProvider the source to get the data range from
186    * @return a {@link de.susebox.jtopas.TokenizerProperty} if keyword has been found,
187    * <code>null</code> otherwise
188    * @throws TokenizerException generic exception
189    * @throws NullPointerException if no {@link DataProvider} is given
190    */

191   public TokenizerProperty isKeyword(DataProvider dataProvider) throws TokenizerException, NullPointerException JavaDoc {
192     return startsWithSequenceCommentOrString(dataProvider);
193   }
194   
195   
196   //---------------------------------------------------------------------------
197
// Implementation
198
//
199

200   /**
201    * The method returns the normalized start character. This default implementation
202    * returns the given character itself, for case-insensitive handling see the
203    * derived class {@link NoCaseSequenceStore}.
204    *
205    * @param startChar a not normalized start character
206    * @return the normalized start character
207    */

208   protected char getStartChar(char startChar) {
209     return startChar;
210   }
211   
212   /**
213    * Addingt or replacing a special sequence, comment or string.
214    *
215    * @param property the description of the new sequence
216    * @return the previously version of the given property or <code>null</code>
217    */

218   public TokenizerProperty addSpecialSequence(TokenizerProperty property) {
219     String JavaDoc image = property.getImages()[0];
220     int length = image.length();
221     char startChar = getStartChar(image.charAt(0));
222     
223     if (_maxLength < length) {
224       _maxLength = length;
225     }
226     if (startChar >= 0 && startChar < DIRECT_INDEX_COUNT) {
227       return insertDirect(startChar, property);
228     } else {
229       return insertMapped(startChar, property);
230     }
231   }
232
233   /**
234    * Removing a special sequence from the store. If the special sequence denoted
235    * by the given string does not exist the method returns <code>null</code>.
236    *
237    * @param image sequence to remove
238    * @return the removed property or <code>null</code> if the sequence was not found
239    */

240   public TokenizerProperty removeSpecialSequence(String JavaDoc image) {
241     return searchString(image, true);
242   }
243   
244   /**
245    * Get the full description of a special sequence property.
246    *
247    * @param image sequence to find
248    * @return the full sequence description or <code>null</code>
249    */

250   public TokenizerProperty getSpecialSequence(String JavaDoc image) {
251     return searchString(image, false);
252   }
253
254   /**
255    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
256    * objects.
257    *
258    * @param type type of special sequence like {@link de.susebox.jtopas.Token#STRING}
259    * @return enumeration of {@link TokenizerProperty} objects
260    */

261   public Iterator JavaDoc getSpecialSequences(int type) {
262     return new SpecialSequencesIterator(this, type);
263   }
264   
265   /**
266    * Addingt or replacing a keyword.
267    *
268    * @param property the description of the new keyword
269    * @return the previously version of the given property or <code>null</code>
270    */

271   public TokenizerProperty addKeyword(TokenizerProperty property) {
272     return addSpecialSequence(property);
273   }
274   
275   /**
276    * Removing a special sequence from the store. If the special sequence denoted
277    * by the given string does not exist the method returns <code>null</code>.
278    *
279    * @param image sequence to remove
280    * @return the removed property or <code>null</code> if the sequence was not found
281    */

282   public TokenizerProperty removeKeyword(String JavaDoc image) {
283     return removeSpecialSequence(image);
284   }
285   
286   /**
287    * Get the full description of a keyword property.
288    *
289    * @param image keyword candidate to look for
290    * @return the full keyword description or <code>null</code>
291    */

292   public TokenizerProperty getKeyword(String JavaDoc image) {
293     return getSpecialSequence(image);
294   }
295
296   /**
297    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
298    * objects describing keywords.
299    *
300    * @return enumeration of {@link TokenizerProperty} objects representing
301    * keywords
302    */

303   public Iterator JavaDoc getKeywords() {
304     return getSpecialSequences(Token.KEYWORD);
305   }
306
307   /**
308    * Get the property list for a given character.
309    *
310    * @param startChar start character
311    * @return list of properties starting with the given character
312    */

313   private PropertyList getList(char startChar) {
314     // get the list: here we try a very fast access for the ASCII characters
315
// via unchecked access and caught exceptions
316
PropertyList list;
317
318     try {
319       // direct indexed sequence
320
list = _asciiArray[startChar];
321     } catch (IndexOutOfBoundsException JavaDoc ex) {
322       // mapped sequence
323
list = (PropertyList)_nonASCIIMap.get(new Character JavaDoc(startChar));
324     }
325     return list;
326   }
327
328
329   /**
330    * Search a string in the given list. Optionally, remove it. Removal may also
331    * reorganize the indexed array or non-ASCII map.
332    *
333    * @param image sequence to search
334    * @param removeIt if <code>true</code> remove a found sequence from the list
335    * @return the property or <code>null</code> if the sequence was not found
336    */

337   private TokenizerProperty searchString(String JavaDoc image, boolean removeIt) {
338     char startChar = getStartChar(image.charAt(0));
339     PropertyList list = getList(startChar);
340     PropertyList prev = null;
341     
342     while (list != null) {
343       TokenizerProperty prop = list._property;
344       String JavaDoc img = prop.getImages()[0];
345       int res = compare(img, image, 1);
346
347       if (res == 0) {
348         if (removeIt) {
349           if (prev != null) {
350             prev._next = list._next;
351           } else {
352             list = list._next;
353             if (startChar >= 0 && startChar < DIRECT_INDEX_COUNT) {
354               _asciiArray[startChar] = list;
355             } else if (list != null) {
356               _nonASCIIMap.put(new Character JavaDoc(startChar), list);
357             } else {
358               _nonASCIIMap.remove(new Character JavaDoc(startChar));
359             }
360           }
361         }
362         return prop;
363       } else if (res < 0) {
364         break;
365       }
366       prev = list;
367       list = list._next;
368     }
369     return null;
370   }
371   
372   
373   /**
374    * Insert a new property into the direct-index array.
375    *
376    * @param property the description of the new sequence
377    * @return the previously version of the given property or <code>null</code>
378    */

379   private TokenizerProperty insertDirect(char startChar, TokenizerProperty property) {
380     // the first element with the given start letter ...
381
if (_asciiArray[startChar] == null) {
382       _asciiArray[startChar] = new PropertyList(property);
383       return null;
384
385     // ... or inserting/replacing in an existing list
386
} else {
387       return putIntoList(_asciiArray[startChar], property);
388     }
389   }
390     
391
392   /**
393    * Insert a new property into the hash table for real unicode letters.
394    *
395    * @param property the description of the new sequence
396    * @return the previously version of the given property or <code>null</code>
397    */

398   private TokenizerProperty insertMapped(char startChar, TokenizerProperty property) {
399     Character JavaDoc key = new Character JavaDoc(getStartChar(startChar));
400     PropertyList list = (PropertyList)_nonASCIIMap.get(key);
401     
402     if (list == null) {
403       _nonASCIIMap.put(key, new PropertyList(property));
404       return null;
405     } else {
406       return putIntoList(list, property);
407     }
408   }
409
410   
411   /**
412    * Insert/replace a property in a property list. The list is ordered by string
413    * comparison. This is important for the search in {@link #startsWithSequenceCommentOrString}.
414    *
415    * @param list insert or replace in this list
416    * @param property the description of the new sequence
417    * @return the previously version of the given property or <code>null</code>
418    */

419   private TokenizerProperty putIntoList(PropertyList list, TokenizerProperty property) {
420     String JavaDoc newImage = property.getImages()[0];
421     PropertyList prev;
422
423     do {
424       TokenizerProperty prop = list._property;
425       String JavaDoc image = prop.getImages()[0];
426       int res = compare(image, newImage, 1);
427
428       if (res == 0) {
429         list._property = property;
430         return prop;
431       } else if (res < 0) {
432         list._next = new PropertyList(prop, list._next);
433         list._property = property;
434         return null;
435       }
436       prev = list;
437     } while ((list = prev._next) != null);
438     
439     // Append element
440
prev._next = new PropertyList(property);
441     return null;
442   }
443
444   
445   /**
446    * Compare method for sequences. Longer Strings are greater, shorter are lesser.
447    * Strings with equal lengths are compared in the usual way.
448    *
449    * @param thisImage first string to compare
450    * @param thatImage second string to compare
451    * @param fromIndex start comparison from this index
452    * @return 0 if equal, < 0 if thisImage < thatImage, > 0 otherwise
453    */

454   private int compare(String JavaDoc thisImage, String JavaDoc thatImage, int fromIndex) {
455     int thisLength = thisImage.length();
456     int thatLength = thatImage.length();
457     
458     if (thisLength != thatLength) {
459       return thisLength - thatLength;
460     }
461     
462     while (fromIndex < thisLength) {
463       int res = compare(thisImage.charAt(fromIndex), thatImage.charAt(fromIndex));
464
465       if (res != 0) {
466         return res;
467       }
468       fromIndex++;
469     }
470     return 0;
471   }
472     
473   /**
474    * Compare method for a string and a character array. The method assumes that
475    * the character array holds at least as many characters as the given string.
476    *<br>
477    * See {@link #compare} for details how the comparison is performed.
478    *
479    * @param prefix string to compare
480    * @param dataProvider source to get the other characters to compare
481    * @param offset start comparison from this index
482    * @return 0 if equal, < 0 if thisImage < thatImage, > 0 otherwise
483    */

484   private int comparePrefix(String JavaDoc prefix, DataProvider dataProvider, int offset) {
485     while (offset < prefix.length()) {
486       int res = compare(prefix.charAt(offset), dataProvider.getCharAt(offset));
487
488       if (res != 0) {
489         return res;
490       }
491       offset++;
492     }
493     return 0;
494   }
495   
496   /**
497    * Compare tho characters. Returns the difference of the to characters, 0 if
498    * they are equal. The default implementation compares case-sensitive, for the
499    * lexicographical solution see {@link NoCaseSequenceStore}..............
500    *
501    * @param char1 first character to compare
502    * @param char2 first character to compare
503    * @return 0 if equal, < 0 if char1 < char2, > 0 otherwise
504    */

505   protected int compare(char char1, char char2) {
506     return char1 - char2;
507   }
508   
509   
510   
511   //---------------------------------------------------------------------------
512
// Inner class
513
//
514

515   /**
516    * List element for equaly starting special sequences.
517    */

518   final class PropertyList {
519
520     /**
521      * Constructor taking the {@link TokenizerProperty} as its single argument.
522      *
523      * @param property a {@link TokenizerProperty} instance
524      */

525     PropertyList(TokenizerProperty property) {
526       this(property, null);
527     }
528
529     /**
530      * Constructor taking a {@link TokenizerProperty} and the next list element.
531      * For the next element, <code>null</code> is a valid value.
532      *
533      * @param property a {@link TokenizerProperty} instance
534      * @param next the following {@link PropertyList} element
535      */

536     PropertyList(TokenizerProperty property, PropertyList next) {
537       _property = property;
538       _next = next;
539     }
540
541     // members
542
public PropertyList _next;
543     public TokenizerProperty _property;
544   }
545   
546
547   /**
548    * Iterator for comments, strings and ordinary special sequences.
549    * Instances of this inner class are returned when a call to {@link #getSpecialSequences}
550    * is done. Each element of the enumeration contains a {@link TokenizerProperty}
551    * element, that in turn has the comment, special sequence etc. together with
552    * its companion
553    */

554   final class SpecialSequencesIterator implements Iterator JavaDoc {
555
556     /**
557      * constructor taking the calling <code>Tokenizer</code> and the type of the
558      * {@link TokenizerProperty}. If the type is 0 then special sequences, line and
559      * block comments are returned in one iterator
560      *
561      * @param parent the calling tokenizer
562      * @param type type of the <code>TokenizerProperty</code>
563      */

564     public SpecialSequencesIterator(SequenceStore parent, int type) {
565       _type = type;
566       _parent = parent;
567     }
568
569     /**
570      * Checking for the next element in a special sequence list, that has the
571      * required type. This method is the one that ultimately decides if there are
572      * more elements or not.
573      *
574      * @return <code>true</code> if there is a matching {@link TokenizerProperty}
575      * element, <code>false</code> otherwise
576      */

577     private boolean listHasNext() {
578       while (_currentList != null) {
579         if (_type == 0 || _currentList._property.getType() == _type) {
580           return true;
581         }
582         _currentList = _currentList._next;
583       }
584       return false;
585     }
586
587     /**
588      * The well known method from the {@link java.util.Iterator} interface.
589      *
590      * @return <code>true</code> if there are more {@link TokenizerProperty}
591      * elements, <code>false</code> otherwise
592      */

593     public boolean hasNext() {
594       // simple: check the current list for a successor
595
if (listHasNext()) {
596         return true;
597       }
598
599       // already reached the tree map iterator ?
600
if (_mapIterator != null) {
601         while (_mapIterator.hasNext()) {
602           _currentList = (PropertyList)_mapIterator.next();
603           if (listHasNext()) {
604             return true;
605           }
606         }
607         
608       // ... or still the direct index array ?
609
} else {
610         if (_parent._asciiArray != null) {
611           while (++_currentIndex < DIRECT_INDEX_COUNT) {
612             if ((_currentList = _parent._asciiArray[_currentIndex]) != null) {
613               if (listHasNext()) {
614                 return true;
615               }
616             }
617           }
618         }
619         if (_parent._nonASCIIMap != null) {
620           _mapIterator = _parent._nonASCIIMap.values().iterator();
621           _currentList = null;
622           return hasNext();
623         }
624       }
625
626       // no (more) sequences
627
return false;
628     }
629
630     /**
631      * Retrieve the next {@link TokenizerProperty} in this enumeration.
632      *
633      * @return a {@link TokenizerProperty} of the desired type or <code>null</code>
634      * @throws NoSuchElementException if there is no more element in this iterator
635      */

636     public Object JavaDoc next() throws NoSuchElementException JavaDoc {
637       if (! hasNext()) {
638         throw new NoSuchElementException JavaDoc();
639       }
640       
641       _currentElem = _currentList;
642       _currentList = _currentList._next;
643       return _currentElem._property;
644     }
645
646     /**
647      * Remove the current special sequence entry from the collection. This is an
648      * alternative to {@link Tokenizer#removeSpecialSequence}.
649      *
650      * @throws IllegalStateExcpetion if {@link #next} has not been called before or
651      * <code>remove</code> has been called already after the last <code>next</code>.
652      */

653     public void remove() throws IllegalStateException JavaDoc {
654       // if current element is not set
655
if (_currentElem == null) {
656         throw new IllegalStateException JavaDoc();
657       }
658
659       // remove current element
660
TokenizerProperty prop = _currentElem._property;
661
662       _currentElem = null;
663       _parent.searchString(prop.getImages()[0], true);
664     }
665
666
667     // members
668
private SequenceStore _parent = null;
669     private int _type = Token.UNKNOWN;
670     private Iterator JavaDoc _mapIterator = null;
671     private int _currentIndex = -1;
672     private PropertyList _currentList = null;
673     private PropertyList _currentElem = null;
674   }
675
676
677   //---------------------------------------------------------------------------
678
// Members
679
//
680
private PropertyList[] _asciiArray;
681   private TreeMap JavaDoc _nonASCIIMap = null;
682   private int _maxLength;
683   private boolean _useExactLength;
684 }
685
Popular Tags