KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > StandardTokenizerProperties


1 /*
2  * StandardTokenizerProperties.java: general-use TokenizerProperties implementation
3  *
4  * Copyright (C) 2002 Heiko Blau
5  *
6  * This file belongs to the JTopas Library.
7  * JTopas is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with JTopas. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas;
32
33 //-----------------------------------------------------------------------------
34
// Imports
35
//
36
import java.util.Arrays JavaDoc;
37 import java.util.ArrayList JavaDoc;
38 import java.util.Map JavaDoc;
39 import java.util.HashMap JavaDoc;
40 import java.util.Iterator JavaDoc;
41 import java.util.NoSuchElementException JavaDoc;
42
43 import de.susebox.java.lang.ExtRuntimeException;
44 import de.susebox.java.lang.ExtUnsupportedOperationException;
45 import de.susebox.java.lang.ExtIllegalArgumentException;
46
47 import de.susebox.jtopas.spi.DataMapper;
48 import de.susebox.jtopas.spi.DataProvider;
49 import de.susebox.jtopas.spi.PatternHandler;
50
51 import de.susebox.jtopas.impl.PatternMatcher;
52 import de.susebox.jtopas.impl.SequenceStore;
53 import de.susebox.jtopas.impl.NoCaseSequenceStore;
54
55
56 //-----------------------------------------------------------------------------
57
// Class StandardTokenizerProperties
58
//
59

60 /**<p>
61  * The class <code>StandardTokenizerProperties</code> provides a simple implementation
62  * of the {@link TokenizerProperties} interface for use in most situations.
63  *</p><p>
64  * Note that this class takes advantage of JTopas features that use Java 1.4 or
65  * higher. It can still be used in older environments but not compiled with JDK
66  * versions below 1.4!
67  *</p>
68  *
69  * @see TokenizerProperties
70  * @see Tokenizer
71  * @author Heiko Blau
72  */

73 public class StandardTokenizerProperties
74   extends AbstractTokenizerProperties
75   implements TokenizerProperties, DataMapper
76 {
77   
78   //---------------------------------------------------------------------------
79
// Properties
80
//
81

82   /**
83    * Maximum length of a non-free pattern match. These are patterns that don't
84    * have the {@link TokenizerProperties#F_FREE_PATTERN} flag set. A common
85    * example are number patterns.
86    */

87   public static final short MAX_NONFREE_MATCHLEN = 1024;
88   
89   
90   //---------------------------------------------------------------------------
91
// Constructors
92
//
93

94   /**
95    * Default constructor that initializes an instance with the default whitespaces
96    * and separator sets. {@link Tokenizer} instances using this <code>StandardTokenizerProperties</code>
97    * object, split text between spaces, tabs and line ending sequences as well
98    * as between punctuation characters.
99    */

public StandardTokenizerProperties() {
  // no parse flags, default whitespace and separator sets
  this(0, DEFAULT_WHITESPACES, DEFAULT_SEPARATORS);
}
103
104   /**
105    * This constructor takes the control flags to be used. It is a shortcut to:
106    * <pre>
107    * TokenizerProperties props = new StandardTokenizerProperties();
108    *
109    * props.setParseFlags(flags);
110    * </pre>
111    * See the {@link TokenizerProperties} interface for the supported flags.
112    *<br>
113    * The {@link TokenizerProperties#DEFAULT_WHITESPACES} and
114    * {@link TokenizerProperties#DEFAULT_SEPARATORS} are used for whitespace and
115    * separator handling if no explicit calls to {@link #setWhitespaces} and
116    * {@link #setSeparators} will follow subsequently.
117    *
118    * @param flags tokenizer control flags
119    * @see #setParseFlags
120    */

public StandardTokenizerProperties(int flags) {
  // only the flags are caller-supplied; both character sets fall back to the defaults
  this(flags, DEFAULT_WHITESPACES, DEFAULT_SEPARATORS);
}
124   
125   
126   /**
127    * This constructor takes the whitespace and separator sets to be used. It is
128    * a shortcut to:
129    * <pre>
130    * TokenizerProperties props = new StandardTokenizerProperties();
131    *
132    * props.setWhitespaces(ws);
133    * props.setSeparators(sep);
134    * </pre>
135    *
136    * @param flags tokenizer control flags
137    * @param whitespaces the whitespace set
138    * @param separators the set of separating characters
139    * @see #setParseFlags
140    * @see #setWhitespaces
141    * @see #setSeparators
142    */

public StandardTokenizerProperties(int flags, String whitespaces, String separators) {
  // start with a clean character flag table
  Arrays.fill(_charFlags, 0);

  // the flags go first, the two character sets are registered afterwards
  setParseFlags(flags);
  setWhitespaces(whitespaces);
  setSeparators(separators);
}
149   
150   
151   //---------------------------------------------------------------------------
152
// Abstract methods of the base class
153
//
154

155   /**
156    * Retrieving a property by a given type and image. See the method description
157    * in {@link AbstractTokenizerProperties} for details.
158    *
159    * @param type the type the returned property should have
160    * @param startImage the (starting) image
161    * @return the token description for the image or <code>null</code>
162    */

163   protected TokenizerProperty doGetProperty(int type, String JavaDoc startImage) {
164     TokenizerProperty prop = null;
165     
166     switch (type) {
167     case Token.KEYWORD:
168       if (_keywords[0] != null) {
169         prop = _keywords[0].getKeyword(startImage);
170       }
171       if (prop == null && _keywords[1] != null) {
172         prop = _keywords[1].getKeyword(startImage);
173       }
174       break;
175       
176     case Token.STRING:
177     case Token.LINE_COMMENT:
178     case Token.BLOCK_COMMENT:
179     case Token.SPECIAL_SEQUENCE:
180       if (_sequences[0] != null) {
181         prop = _sequences[0].getSpecialSequence(startImage);
182       }
183       if (prop == null && _sequences[1] != null) {
184         prop = _sequences[1].getSpecialSequence(startImage);
185       }
186       break;
187       
188     case Token.PATTERN:
189       for (int index = 0; index < _patterns.size(); ++index) {
190         PatternMatcher data = (PatternMatcher)_patterns.get(index);
191
192         prop = data.getProperty();
193         if (prop.getImages()[0].equals(startImage)) {
194           break;
195         }
196         prop = null;
197       }
198       break;
199
200     case Token.WHITESPACE:
201     case Token.SEPARATOR:
202     default:
203       throw new ExtIllegalArgumentException("Unsupported property type {0}. (Leading) image \"{1}\".",
204                                             new Object JavaDoc[] { new Integer JavaDoc(type), startImage } );
205     }
206
207     // either the required property or null
208
return prop;
209   }
210   
211   
212   /**
213    * Setting a new separator set. See the method description in
214    * {@link AbstractTokenizerProperties} for details.
215    *
216    * @param separators the set of separators including ranges
217    * @return the replaced separator set or <code>null</code>
218    */

219   protected String JavaDoc doSetSeparators(String JavaDoc separators) {
220     String JavaDoc oldValue;
221
222     // which separators should be set?
223
if ((_flags & Flags.F_NO_CASE) == 0) {
224       oldValue = (_separatorsCase.length() > 0) ? _separatorsCase : _separatorsNoCase;
225       _separatorsCase = separators;
226       _separatorsNoCase = "";
227     } else {
228       oldValue = (_separatorsNoCase.length() > 0) ? _separatorsNoCase : _separatorsCase;
229       _separatorsCase = "";
230       _separatorsNoCase = separators;
231     }
232
233     // mark seaparators in character table
234
putCharSet(oldValue, Token.SEPARATOR, false);
235     putCharSet(separators, Token.SEPARATOR, true);
236
237     // normalize the old value
238
if (oldValue == null || oldValue.length() == 0) {
239       return null;
240     } else {
241       return oldValue;
242     }
243   }
244   
245   /**
246    * Setting a new whitespace set. See the method description in
247    * {@link AbstractTokenizerProperties} for details.
248    *
249    * @param whitespaces the set of whitespaces including ranges
250    * @return the replaced whitespace set or <code>null</code>
251    */

252   protected String JavaDoc doSetWhitespaces(String JavaDoc whitespaces) {
253     // set the right whitespaces
254
String JavaDoc oldValue;
255
256     if ((_flags & Flags.F_NO_CASE) == 0) {
257       oldValue = (_whitespacesCase.length() > 0) ? _whitespacesCase : _whitespacesNoCase;
258       _whitespacesCase = whitespaces;
259       _whitespacesNoCase = "";
260     } else {
261       oldValue = (_whitespacesNoCase.length() > 0) ? _whitespacesNoCase : _whitespacesCase;
262       _whitespacesCase = "";
263       _whitespacesNoCase = whitespaces;
264     }
265
266     // mark whitespaces in character table
267
putCharSet(oldValue, Token.WHITESPACE, false);
268     putCharSet(whitespaces, Token.WHITESPACE, true);
269
270     // return changes
271
if (oldValue == null || oldValue.length() == 0) {
272       return null;
273     } else {
274       return oldValue;
275     }
276   }
277   
278   /**
279    * Registering a {@link TokenizerProperty}.
280    * See the method description in {@link AbstractTokenizerProperties}.
281    *
282    * @param property property to register
283    * @return the replaced property or <code>null</code>
284    */

285   protected TokenizerProperty doAddProperty(TokenizerProperty property) {
286     switch (property.getType()) {
287     case Token.STRING:
288     case Token.LINE_COMMENT:
289     case Token.BLOCK_COMMENT:
290     case Token.SPECIAL_SEQUENCE:
291       return addSpecialSequence(property);
292
293     case Token.KEYWORD:
294       return addKeyword(property);
295
296     case Token.PATTERN:
297       return addPattern(property);
298
299     case Token.WHITESPACE:
300     case Token.SEPARATOR:
301     default:
302       throw new ExtIllegalArgumentException("Unsupported property type {0}. (Leading) image \"{1}\".",
303                                             new Object JavaDoc[] { new Integer JavaDoc(property.getType()), property.getImages()[0] } );
304     }
305   }
306   
307   /**
308    * Deregistering a {@link TokenizerProperty} from the store.
309    * See the method description in {@link AbstractTokenizerProperties}.
310    *
311    * @param property property to remove
312    * @return the replaced property or <code>null</code>
313    */

314   protected TokenizerProperty doRemoveProperty(TokenizerProperty property) {
315     // removing property according to type
316
TokenizerProperty prop = null;
317     String JavaDoc image = property.getImages()[0];
318     
319     switch (property.getType()) {
320     case Token.LINE_COMMENT:
321     case Token.BLOCK_COMMENT:
322     case Token.STRING:
323     case Token.SPECIAL_SEQUENCE:
324       if (_sequences[0] != null) {
325         prop = _sequences[0].removeSpecialSequence(image);
326       }
327       if (prop == null && _sequences[1] != null) {
328         prop = _sequences[1].removeSpecialSequence(image);
329       }
330       break;
331
332     case Token.KEYWORD:
333       if (_keywords[0] != null) {
334         prop = _keywords[0].removeKeyword(image);
335       }
336       if (prop == null && _keywords[1] != null) {
337         prop = _keywords[1].removeKeyword(image);
338       }
339       break;
340
341     case Token.PATTERN:
342       for (int index = 0; index < _patterns.size(); ++index) {
343         PatternMatcher data = (PatternMatcher)_patterns.get(index);
344
345         prop = data.getProperty();
346         if (prop.getImages()[0].equals(image)) {
347           _patterns.remove(index);
348           break;
349         } else {
350           prop = null;
351         }
352       }
353       break;
354
355     case Token.WHITESPACE:
356     case Token.SEPARATOR:
357     default:
358       throw new ExtIllegalArgumentException("Unsupported property type {0}. (Leading) image \"{1}\".",
359                                             new Object JavaDoc[] { new Integer JavaDoc(property.getType()), image } );
360     }
361     
362     // return removed property
363
return prop;
364   }
365   
366
367   //---------------------------------------------------------------------------
368
// Methods of the TokenizerProperties interface
369
//
370

371   /**
372    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
373    * objects. See the method description in {@link TokenizerProperties}.
374    *
375    * @return enumeration of {@link TokenizerProperty} objects
376    */

377   public Iterator JavaDoc getStrings() {
378     return new SpecialSequencesIterator(this, _sequences, Token.STRING);
379   }
380   
381   /**
382    * Obtaining the whitespace character set.
383    * See the method description in {@link TokenizerProperties}.
384    *
385    * @see #setWhitespaces
386    * @return the currently active whitespace set
387    */

388   public String JavaDoc getWhitespaces() {
389     synchronized(this) {
390       return _whitespacesCase + _whitespacesNoCase;
391     }
392   }
393   
394   /**
395    * Obtaining the separator set of the <code>Tokenizer</code>.
396    * See the method description in {@link TokenizerProperties}.
397    *
398    * @see #setSeparators
399    * @return the currently used set of separating characters
400    */

401   public String JavaDoc getSeparators() {
402     synchronized(this) {
403       return _separatorsCase + _separatorsNoCase;
404     }
405   }
406   
407   /**
408    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
409    * objects.
410    * See the method description in {@link TokenizerProperties}.
411    *
412    * @return enumeration of {@link TokenizerProperty} objects
413    */

414   public Iterator JavaDoc getLineComments() {
415     return new SpecialSequencesIterator(this, _sequences, Token.LINE_COMMENT);
416   }
417   
418   /**
419    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
420    * objects.
421    * See the method description in {@link TokenizerProperties}.
422    *
423    * @return enumeration of {@link TokenizerProperty} objects
424    */

425   public Iterator JavaDoc getBlockComments() {
426     return new SpecialSequencesIterator(this, _sequences, Token.BLOCK_COMMENT);
427   }
428   
429   /**
430    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
431    * objects.
432    * See the method description in {@link TokenizerProperties}.
433    *
434    * @return enumeration of {@link TokenizerProperty} objects
435    */

436   public Iterator JavaDoc getSpecialSequences() {
437     return new SpecialSequencesIterator(this, _sequences, Token.SPECIAL_SEQUENCE);
438   }
439   
440   /**
441    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
442    * objects.
443    * See the method description in {@link TokenizerProperties}.
444    *
445    * @return iteration of {@link TokenizerProperty} objects
446    */

447   public Iterator JavaDoc getKeywords() {
448     return new SpecialSequencesIterator(this, _keywords, Token.KEYWORD);
449   }
450   
451   /**
452    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
453    * objects. Each <code>TokenizerProperty</code> object contains a pattern and
454    * its companion if such an associated object exists.
455    *
456    * @return enumeration of {@link TokenizerProperty} objects
457    */

458   public Iterator JavaDoc getPatterns() {
459     return new PatternIterator(this);
460   }
461   
462
463   /**
464    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
465    * objects.
466    * See the method description in {@link TokenizerProperties}.
467    *
468    * @return enumeration of {@link TokenizerProperty} objects
469    */

470   public Iterator JavaDoc getProperties() {
471     return new FullIterator(this);
472   }
473   
474   
475   //---------------------------------------------------------------------------
476
// Methods of the DataMapper interface
477
//
478

479   /**
480    * Setting the backing {@link TokenizerProperties} instance this <code>DataMapper</code>
481    * is working with. Usually, the <code>DataMapper</code>
482    * interface is implemented by <code>TokenizerProperties</code> implementations,
483    * too. Otherwise the {@link Tokenizer} using the <code>TokenizerProperties</code>,
484    * will construct a default <code>DataMapper</code> and propagate the
485    * <code>TokenizerProperties</code> instance by calling this method.
486    *<br>
487    * The method should throw an {@link java.lang.UnsupportedOperationException}
488    * if this <code>DataMapper</code> is an extension to an <code>TokenizerProperties</code>
489    * implementation.
490    *
491    * @param props the {@link de.susebox.jtopas.TokenizerProperties}
492    * @throws UnsupportedOperationException if this is a <code>DataMapper</code>
493    * implemented by a {@link de.susebox.jtopas.TokenizerProperties}
494    * implementation
495    * @throws NullPointerException if no {@link TokenizerProperties} are given
496    */

497   public void setTokenizerProperties(TokenizerProperties props)
498     throws UnsupportedOperationException JavaDoc, NullPointerException JavaDoc
499   {
500     throw new ExtUnsupportedOperationException(
501                   "Class {0} already defines the {1} interface.",
502                   new Object JavaDoc[] { StandardTokenizerProperties.class.getName(),
503                                  DataMapper.class.getName() } );
504   }
505
506   /**
507    * The method retrieves the backing {@link de.susebox.jtopas.TokenizerProperties}
508    * instance, this <code>DataMapper</code> is working on. For implementations
509    * of the <code>TokenizerProperties</code> interface that also implement the
510    * <code>DataMapper</code> interface, this method returns the instance itself
511    * it is called on.
512    *<br>
513    * Otherwise the method returns the <code>TokenizerProperties</code> instance
514    * passed through the last call to {@link #setTokenizerProperties} or <code>null</code>
515    * if no such call has taken place so far.
516    *
517    * @return the backing {@link de.susebox.jtopas.TokenizerProperties} or <code>null</code>
518    */

519   public TokenizerProperties getTokenizerProperties() {
520     return this;
521   }
522
523   /**
524    * This method checks if the character is a whitespace. Implement Your own
525    * code for situations where this default implementation is not fast enough
526    * or otherwise not really good.
527    *
528    * @param testChar check this character
529    * @return <code>true</code> if the given character is a whitespace,
530    * <code>false</code> otherwise
531    */

532   public boolean isWhitespace(char testChar) {
533     try {
534       return (_charFlags[testChar] & CHARFLAG_WHITESPACE) != 0;
535     } catch (ArrayIndexOutOfBoundsException JavaDoc ex) {
536       Integer JavaDoc extFlags = (Integer JavaDoc)_extCharFlags.get(new Integer JavaDoc(testChar));
537       return (extFlags != null && (extFlags.intValue() & CHARFLAG_WHITESPACE) != 0);
538     }
539   }
540       
541  
542   /**
543    * This method detects the number of whitespace characters the data range given
544    * through the {@link DataProvider} parameter starts with.
545    *
546    * @param dataProvider the source to get the data range from
547    * @return number of whitespace characters starting from the given offset
548    * @throws TokenizerException failure while reading data from the input stream
549    * @throws NullPointerException if no {@link DataProvider} is given
550    * @see de.susebox.jtopas.spi.DataProvider
551    */

552   public int countLeadingWhitespaces(DataProvider dataProvider) throws NullPointerException JavaDoc {
553     int maxChars = dataProvider.getLength();
554     int len = 0;
555     
556     while (len < maxChars && isWhitespace(dataProvider.getCharAt(len))) {
557       len++;
558     }
559     return len;
560   }
561   
562  
563   /**
564    * If a {@link Tokenizer} performs line counting, it is often necessary to
565    * know if newline characters are considered to be whitespace. See {@link WhitespaceHandler}
566    * for details.
567    *
568    * @return <code>true</code> if newline characters are in the current whitespace set,
569    * <code>false</code> otherwise
570    *
571    */

572   public boolean newlineIsWhitespace() {
573     return (_charFlags['\n'] & CHARFLAG_WHITESPACE) != 0
574           && (_charFlags['\r'] & CHARFLAG_WHITESPACE) != 0;
575   }
576   
577
578   /**
579    * This method checks the given character if it is a separator.
580    *
581    * @param testChar check this character
582    * @return <code>true</code> if the given character is a separator,
583    * <code>false</code> otherwise
584    */

585   public boolean isSeparator(char testChar) {
586     try {
587       return (_charFlags[testChar] & CHARFLAG_SEPARATOR) != 0;
588     } catch (ArrayIndexOutOfBoundsException JavaDoc ex) {
589       Integer JavaDoc extFlags = (Integer JavaDoc)_extCharFlags.get(new Integer JavaDoc(testChar));
590       return (extFlags != null && (extFlags.intValue() & CHARFLAG_SEPARATOR) != 0);
591     }
592   }
593
594   
595   /**
596    * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
597    * for a fast detection if special sequence checking must be performed at all.
598    * If the method returns <code>false</code> time-consuming preparations can be
599    * skipped.
600    *
601    * @return <code>true</code> if there actually are pattern that can be tested
602    * for a match, <code>false</code> otherwise.
603    */

604   public boolean hasSequenceCommentOrString() {
605     synchronized(_sequences) {
606       return (_sequences[0] != null || _sequences[1] != null);
607     }
608   }
609   
610   /**
611    * This method checks if a given range of data starts with a special sequence,
612    * a comment or a string. These three types of token are tested together since
613    * both comment and string prefixes are ordinary special sequences. Only the
614    * actions performed <strong>after</strong> a string or comment has been detected,
615    * are different.
616    *<br>
617    * The method returns <code>null</code> if no special sequence, comment or string
618    * matches the leading part of the data range given through the
619    * {@link DataProvider}.
620    *<br>
621    * In cases of strings or comments, the return value contains the description
622    * for the introducing character sequence, <strong>NOT</strong> the whole
623    * string or comment. The reading of the rest of the string or comment is done
624    * by the calling {@link de.susebox.jtopas.Tokenizer}.
625    *
626    * @param dataProvider the source to get the data range from
627    * @return a {@link de.susebox.jtopas.TokenizerProperty} if a special sequence,
628    * comment or string could be detected, <code>null</code> otherwise
629    * @throws TokenizerException failure while reading more data
630    * @throws NullPointerException if no {@link DataProvider} is given
631    */

632   public TokenizerProperty startsWithSequenceCommentOrString(DataProvider dataProvider)
633     throws TokenizerException, NullPointerException JavaDoc
634   {
635     // we need the longest possible match
636
synchronized(_sequences) {
637       TokenizerProperty caseProp = (_sequences[0] != null) ?
638                                         _sequences[0].startsWithSequenceCommentOrString(dataProvider) : null;
639
640       TokenizerProperty noCaseProp = (_sequences[1] != null) ?
641                                         _sequences[1].startsWithSequenceCommentOrString(dataProvider) : null;
642
643       if (noCaseProp == null) {
644         return caseProp;
645       } else if (caseProp == null) {
646         return noCaseProp;
647       } else if (caseProp.getImages()[0].length() >= noCaseProp.getImages()[0].length()) {
648         return caseProp;
649       } else {
650         return noCaseProp;
651       }
652     }
653   }
654
655   /**
656    * This method returns the length of the longest special sequence, comment or
657    * string prefix that is known to this <code>SequenceHandler</code>. When
658    * calling {@link #startsWithSequenceCommentOrString}, the passed {@link DataProvider}
659    * parameter will supply at least this number of characters (see {@link DataProvider#getLength}).
660    * If less characters are provided, EOF is reached.
661    *
662    * @return the number of characters needed in the worst case to identify a
663    * special sequence
664    */

665   public int getSequenceMaxLength() {
666     int maxLength = 0;
667
668     synchronized(_sequences) {
669       if (_sequences[0] != null) {
670         maxLength = _sequences[0].getSequenceMaxLength();
671       }
672       if (_sequences[1] != null && _sequences[1].getSequenceMaxLength() > maxLength) {
673         maxLength = _sequences[1].getSequenceMaxLength();
674       }
675     }
676     return maxLength;
677   }
678
679   
680   /**
681    * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
682    * for a fast detection if keyword matching must be performed at all. If the method
683    * returns <code>false</code> time-consuming preparations can be skipped.
684    *
685    * @return <code>true</code> if there actually are pattern that can be tested
686    * for a match, <code>false</code> otherwise.
687    */

688   public boolean hasKeywords() {
689     synchronized(_keywords) {
690       return (_keywords[0] != null || _keywords[1] != null);
691     }
692   }
693   
694   /**
695    * This method checks if the character range given through the
696    * {@link DataProvider} comprises a keyword.
697    *
698    * @param dataProvider the source to get the data from, that are checked
699    * @return a {@link de.susebox.jtopas.TokenizerProperty} if a keyword could be
700    * found, <code>null</code> otherwise
701    * @throws TokenizerException failure while reading more data
702    * @throws NullPointerException if no {@link DataProvider} is given
703    */

704   public TokenizerProperty isKeyword(DataProvider dataProvider)
705     throws TokenizerException, NullPointerException JavaDoc
706   {
707     synchronized(_keywords) {
708       TokenizerProperty prop;
709     
710       if (_keywords[0] != null) {
711         prop = _keywords[0].isKeyword(dataProvider);
712       } else {
713         prop = null;
714       }
715       if (prop == null && _keywords[1] != null) {
716         prop = _keywords[1].isKeyword(dataProvider);
717       }
718       return prop;
719     }
720   }
721   
722   
723   /**
724    * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
725    * for a fast detection if pattern matching must be performed at all. If the method
726    * returns <code>false</code> time-consuming preparations can be skipped.
727    *
728    * @return <code>true</code> if there actually are pattern that can be tested
729    * for a match, <code>false</code> otherwise.
730    */

731   public boolean hasPattern() {
732     synchronized(_patterns) {
733       return (_patterns.size() > 0);
734     }
735   }
736     
737   /**
738    * This method checks if the start of a character range given through the
739    * {@link DataProvider} matches a pattern.
740    *
741    * @param dataProvider the source to get the data from
742    * @return a {@link PatternHandler.Result} object or <code>null</code> if no
743    * match was found
744    * @throws TokenizerException generic exception
745    * @throws NullPointerException if no {@link DataProvider} is given
746    */

747   public PatternHandler.Result matches(DataProvider dataProvider)
748     throws TokenizerException, NullPointerException JavaDoc
749   {
750     synchronized(_patterns) {
751       int longestMatch = 0;
752       PatternHandler.Result bestResult = null;
753       
754       // only get the string if pattern are available
755
for (int index = 0; index < _patterns.size(); ++index) {
756         PatternMatcher data = (PatternMatcher)_patterns.get(index);
757         PatternHandler.Result result = data.matches(dataProvider);
758
759         if (result != null) {
760           if (bestResult == null || bestResult.getLengthOfMatch() < result.getLengthOfMatch()) {
761             bestResult = result;
762           }
763         }
764       }
765       
766       // return the best result
767
return bestResult;
768     }
769   }
770
771   
772   //---------------------------------------------------------------------------
773
// Implementation
774
//
775

776   /**
777    * Registering a pattern with an associated object. The method assumes that the
778    * given pattern property has been checked for not being null, having a non-empty
779    * pattern image and normalized flags ({@link AbstractTokenizerProperties#normalizeFlags}).
780    * See the method description in {@link AbstractTokenizerProperties}.
781    *
782    * @param patternProp the regular expression to be added
783    * @return the replaced pattern property or <code>null</code>
784    * @throws IllegalArgumentException if pattern matching is not available
785    */

786   protected TokenizerProperty addPattern(TokenizerProperty patternProp) throws IllegalArgumentException JavaDoc {
787     // construct the pattern
788
PatternMatcher data = null;
789     String JavaDoc pattern = patternProp.getImages()[0];
790     
791     try {
792       data = new PatternMatcher(patternProp, getParseFlags());
793     } catch (Throwable JavaDoc ex) {
794       throw new ExtIllegalArgumentException(ex, "Pattern matching is not available (use JDK 1.4 or above).");
795     }
796                                                       
797     // Register pattern. First search for existing one
798
for (int index = 0; index < _patterns.size(); ++index) {
799       PatternMatcher oldData = (PatternMatcher)_patterns.get(index);
800       TokenizerProperty oldProp = oldData.getProperty();
801
802       if (oldProp.getImages()[0].equals(pattern)) {
803         _patterns.set(index, data);
804         return oldProp;
805       }
806     }
807
808     // not found -> register new pattern
809
_patterns.add(data);
810     return null;
811   }
812   
813   /**
814    * Registering a keyword property. The method assumes that the given keyword
815    * property has been checked for not being null, having a non-empty keyword
816    * image and normalized flags ({@link AbstractTokenizerProperties#normalizeFlags}).
817    *
818    * @param keywordProp keyword property to register
819    * @return the replaced keyword property or <code>null</code>
820    */

821   protected TokenizerProperty addKeyword(TokenizerProperty keywordProp) {
822     // case-sensitive keyword?
823
boolean noCase = isFlagSet(keywordProp, Flags.F_NO_CASE);
824     int arrayIdx = noCase ? 1 : 0;
825
826     // first keyword?
827
if (_keywords[arrayIdx] == null) {
828       if (noCase) {
829         _keywords[arrayIdx] = new NoCaseSequenceStore(true);
830       } else {
831         _keywords[arrayIdx] = new SequenceStore(true);
832       }
833     }
834
835     // add / replace property
836
return _keywords[arrayIdx].addKeyword(keywordProp);
837   }
838   
839   
840   /**
841    * This method adds or replaces strings, comments and ordinary special sequences.
842    * The method assumes that the given special sequence property has been checked
843    * for not being null, having a non-empty imagesand normalized flags
844    * ({@link AbstractTokenizerProperties#normalizeFlags}).
845    *
846    * @param property the description of the new sequence
847    * @return the replaced special sequence property or <code>null</code>
848    */

849   protected TokenizerProperty addSpecialSequence(TokenizerProperty property) {
850     // case-sensitive sequence?
851
boolean noCase = isFlagSet(property, Flags.F_NO_CASE);
852     int arrayIdx = noCase ? 1 : 0;
853
854     // first special sequence?
855
if (_sequences[arrayIdx] == null) {
856       if (noCase) {
857         _sequences[arrayIdx] = new NoCaseSequenceStore(false);
858       } else {
859         _sequences[arrayIdx] = new SequenceStore(false);
860       }
861     }
862
863     // add / replace property
864
return _sequences[arrayIdx].addSpecialSequence(property);
865   }
866   
867   /**
868    * Set or removes the flags corresponding to type and case-sensitivity from the
869    * character flags tables.
870    *
871    * @param set the character set to handle (may contain ranges)
872    * @param type token type fro the characters ({@link Token#WHITESPACE} or {@link Token#SEPARATOR})
873    * @param setIt if <code>true</code> the approbriate flags will be set, otherwise removed
874    */

875   private void putCharSet(String JavaDoc set, int type, boolean setIt) {
876     // which flags ?
877
int charFlags = 0;
878     
879     switch (type) {
880     case Token.WHITESPACE:
881       charFlags = CHARFLAG_WHITESPACE;
882       break;
883     case Token.SEPARATOR:
884       charFlags = CHARFLAG_SEPARATOR;
885       break;
886     }
887     
888     // analyze the given set
889
int length = (set != null) ? set.length() : 0;
890     char start, end, setChar;
891     
892     for (int ii = 0; ii < length; ++ii) {
893       setChar = set.charAt(ii);
894
895       switch (setChar) {
896       case '-':
897         start = (ii > 0) ? set.charAt(ii - 1) : 0;
898         end = (ii < length - 1) ? set.charAt(ii + 1) : 0xFFFF;
899         ii += 2;
900         break;
901
902       case '\\':
903         setChar = (ii + 1 >= length) ? 0 : set.charAt(ii + 1);
904         ii++;
905         /* no break */
906
907       default:
908         start = end = setChar;
909       }
910       
911       // put flags
912
for (char index = start; index <= end; ++index) {
913         char currChar = index;
914         
915         do {
916           if (currChar < _charFlags.length) {
917             // one-byte characters
918
if (setIt) {
919               _charFlags[currChar] |= charFlags;
920             } else {
921               _charFlags[currChar] &= ~charFlags;
922             }
923             
924           } else {
925             // longer characters
926
Integer JavaDoc key = new Integer JavaDoc(currChar);
927             Integer JavaDoc extFlags = (Integer JavaDoc)_extCharFlags.get(key);
928
929             if (setIt) {
930               extFlags = new Integer JavaDoc(extFlags.intValue() | charFlags);
931             } else {
932               extFlags = new Integer JavaDoc(extFlags.intValue() & ~charFlags);
933             }
934             _extCharFlags.put(key, extFlags);
935           }
936           
937           // settings must be also done for the upper/lowercase variant
938
if (Character.isLowerCase(currChar)) {
939             currChar = Character.toUpperCase(currChar);
940           } else if (Character.isUpperCase(currChar)) {
941             currChar = Character.toLowerCase(currChar);
942           }
943         } while ((_flags & Flags.F_NO_CASE) != 0 && currChar != index);
944       }
945     }
946   }
947   
948   
949   //---------------------------------------------------------------------------
950
// Class members
951
//
952

953   /**
954    * character flag for whitespaces
955    */

956   public static final int CHARFLAG_WHITESPACE = 1;
957   
958   /**
959    * character flag for whitespaces
960    */

961   public static final int CHARFLAG_SEPARATOR = 2;
962
963   
964   //---------------------------------------------------------------------------
965
// Members
966
//
967

968   /**
969    * array containing the flags for whitespaces and separators
970    */

971   protected int _charFlags[] = new int[256];
972   
973   /**
974    * Map with flags for characters beyond 256;
975    */

976   protected HashMap JavaDoc _extCharFlags = new HashMap JavaDoc();
977    
978   /**
979    * current whitespace characters including character ranges.
980    */

981   protected String JavaDoc _whitespacesCase = DEFAULT_WHITESPACES;
982   
983   /**
984    * current whitespace characters including character ranges. Case is ignored.
985    */

986   protected String JavaDoc _whitespacesNoCase = "";
987   
988   /**
989    * current separator characters including character ranges.
990    */

991   protected String JavaDoc _separatorsCase = DEFAULT_SEPARATORS;
992   
993   /**
994    * current separator characters including character ranges. Case is ignored.
995    */

996   protected String JavaDoc _separatorsNoCase = "";
997   
998   /**
999    * The first element is the {@link de.susebox.jtopas.impl.SequenceStore} for
1000   * the case-sensitive sequences, the second is for the case-insensitive ones.
1001   */

1002  protected SequenceStore[] _sequences = new SequenceStore[2];
1003  
1004  /**
1005   * Like the array {@link #_sequences} this two-element Array contains two
1006   * {@link de.susebox.jtopas.impl.SequenceStore}, the first for the case-sensitive
1007   * keywords, the second for the case-insensitive ones.
1008   */

1009  protected SequenceStore[] _keywords = new SequenceStore[2];
1010  
1011  /**
1012   * This array contains the patterns
1013   */

1014  protected ArrayList JavaDoc _patterns = new ArrayList JavaDoc();
1015  
1016  /**
1017   * Which regular expression parser to use
1018   */

1019  private Class JavaDoc _patternClass = null;
1020
1021  /**
1022   * A buffer used for pattern matching
1023   */

1024  private StringBuffer JavaDoc _foundMatch = new StringBuffer JavaDoc();
1025}
1026
1027
1028
1029//---------------------------------------------------------------------------
1030
// inner classes
1031
//
1032

1033/**
1034 * Instances of this inner class are returned when a call to
1035 * {@link TokenizerProperties#getProperties}.
1036 * Each element of the enumeration contains a {@link TokenizerProperty} element.
1037 */

1038final class FullIterator implements Iterator JavaDoc {
1039  
1040  /**
1041   * constructor taking the calling {@link TokenizerProperties} object to retrieve
1042   * the members holding {@link TokenizerProperty} elements which are iterated by
1043   * this <code>FullIterator</code> instance.
1044   *
1045   * @param caseSensitiveMap map with properties where case matters
1046   * @param caseSensitiveMap map with properties where case doesn't matter
1047   */

1048  public FullIterator(StandardTokenizerProperties parent) {
1049    _parent = parent;
1050    
1051    // create list of iterators
1052
_iterators = new Object JavaDoc[3];
1053    _iterators[0] = new SpecialSequencesIterator(parent, parent._keywords, Token.KEYWORD);
1054    _iterators[1] = new SpecialSequencesIterator(parent, parent._sequences, 0);
1055    _iterators[2] = new PatternIterator(parent);
1056    _currIndex = 0;
1057  }
1058
1059  /**
1060   * Test wether there is another element in the iterated set or not. See
1061   * {@link java.util.Iterator} for details.
1062   *
1063   * @return <code>true</code>if another call to {@link #next} will return an object,
1064   * <code>false</code> otherwise
1065   */

1066  public boolean hasNext() {
1067    synchronized(this) {
1068      while (_currIndex < _iterators.length) {
1069        Iterator JavaDoc iter = (Iterator JavaDoc)_iterators[_currIndex];
1070
1071        if (iter.hasNext()) {
1072          return true;
1073        }
1074        _currIndex++;
1075      }
1076      return false;
1077    }
1078  }
1079  
1080  /**
1081   * Retrieve the next element in the iterated set. See {@link java.util.Iterator}
1082   * for details.
1083   *
1084   * @return the next element or <code>null</code> if there is none
1085   */

1086  public Object JavaDoc next() {
1087    if (hasNext()) {
1088      synchronized(this) {
1089        Iterator JavaDoc iter = (Iterator JavaDoc)_iterators[_currIndex];
1090        return iter.next();
1091      }
1092    } else {
1093      return null;
1094    }
1095  }
1096  
1097  /**
1098   * Retrieve the next element in the iterated set. See {@link java.util.Iterator}
1099   * for details.
1100   *
1101   * @return the next element or <code>null</code> if there is none
1102   */

1103  public void remove() {
1104    if (_currIndex < _iterators.length) {
1105      Iterator JavaDoc iter = (Iterator JavaDoc)_iterators[_currIndex];
1106      iter.remove();
1107    }
1108  }
1109  
1110  
1111  // members
1112
private StandardTokenizerProperties _parent = null;
1113  private Object JavaDoc[] _iterators = null;
1114  private int _currIndex = -1;
1115}
1116
1117/**
1118 * Instances of this inner class are returned when a call to {@link TokenizerProperties#getKeywords}
1119 * or {@link TokenizerProperties#getPatterns}.
1120 * Each element of the enumeration contains a {@link TokenizerProperty} element,
1121 * that in turn has the keyword or a pattern with its companion
1122 */

1123final class MapIterator implements Iterator JavaDoc {
1124
1125  /**
1126   * constructor taking the a case-sensitive and a case-insensitive {@link java.util.Map}
1127   * which are iterated by this <code>MapIterator</code> instance.
1128   *
1129   * @param caseSensitiveMap map with properties where case matters
1130   * @param caseSensitiveMap map with properties where case doesn't matter
1131   */

1132  public MapIterator(StandardTokenizerProperties parent, Map JavaDoc caseSensitiveMap, Map JavaDoc caseInsensitiveMap) {
1133    synchronized(this) {
1134      _parent = parent;
1135      if (caseSensitiveMap != null) {
1136        _iterators[0] = caseSensitiveMap.values().iterator();
1137      }
1138      if (caseInsensitiveMap != null) {
1139        _iterators[1] = caseInsensitiveMap.values().iterator();
1140      }
1141    }
1142  }
1143
1144  /**
1145   * the well known method from the {@link java.util.Iterator} interface.
1146   *
1147   * @return <code>true</code> if there are more {@link TokenizerProperty}
1148   * elements, <code>false</code> otherwise
1149   */

1150  public boolean hasNext() {
1151    // check the current array
1152
synchronized(_iterators) {
1153      if (_iterators[0] != null) {
1154        if (_iterators[0].hasNext()) {
1155          return true;
1156        } else {
1157          _iterators[0] = null;
1158        }
1159      }
1160      if (_iterators[1] != null) {
1161        if (_iterators[1].hasNext()) {
1162          return true;
1163        } else {
1164          _iterators[1] = null;
1165        }
1166      }
1167      return false;
1168    }
1169  }
1170
1171  /**
1172   * Retrieve the next {@link TokenizerProperty} in this enumeration.
1173   *
1174   * @return the next keyword as a <code>TokenizerProperty</code>
1175   * @throws NoSuchElementException if there is no more element in this iterator
1176   */

1177  public Object JavaDoc next() {
1178    if ( ! hasNext()) {
1179      throw new NoSuchElementException JavaDoc();
1180    }
1181    
1182    synchronized(this) {
1183      if (_iterators[0] != null) {
1184        _currentData = (TokenizerProperty)_iterators[0].next();
1185      } else {
1186        _currentData = (TokenizerProperty)_iterators[1].next();
1187      }
1188      return _currentData;
1189    }
1190  }
1191  
1192  /**
1193   * This method is similar to {@link Tokenizer#removeKeyword}.
1194   *
1195   * @throws IllegalStateExcpetion if {@link #next} has not been called before or
1196   * <code>remove</code> has been called already after the last <code>next</code>.
1197   */

1198  public void remove() {
1199    synchronized(this) {
1200      // if current element is not set
1201
if (_currentData == null) {
1202        throw new IllegalStateException JavaDoc();
1203      }
1204    
1205      if (_iterators[0] != null) {
1206        _iterators[0].remove();
1207      } else {
1208        _iterators[1].remove();
1209      }
1210      _parent.notifyListeners(new TokenizerPropertyEvent(TokenizerPropertyEvent.PROPERTY_REMOVED, _currentData));
1211      _currentData = null;
1212    }
1213  }
1214
1215  // members
1216
private StandardTokenizerProperties _parent = null;
1217  private Iterator JavaDoc[] _iterators = new Iterator JavaDoc[2];
1218  private TokenizerProperty _currentData = null;
1219}
1220
1221
1222
1223/**
1224 * Iterator for comments, strings and special sequences.
1225 * Instances of this inner class are returned when a call to one of the methods
1226 *<ul><li>
1227 * {@link #getBlockComments}
1228 *</li><li>
1229 * {@link #getLineComments}
1230 *</li><li>
1231 * {@link #getStrings}
1232 *</li><li>
1233 * {@link #getSpecialSequences}
1234 *</li></ul>
1235 * is done. Each element of the enumeration contains a {@link TokenizerProperty}
1236 * element, that in turn has the comment, special sequence etc. together with
1237 * its companion
1238 */

1239final class SpecialSequencesIterator implements Iterator JavaDoc {
1240
1241  /**
1242   * constructor taking the calling <code>Tokenizer</code> and the type of the
1243   * {@link TokenizerProperty}. If the type is 0 then special sequences, line and
1244   * block comments are returned in one iterator
1245   *
1246   * @param parent the calling tokenizer
1247   * @param stores which array of {@link de.susebox.jtopas.impl.SequenceStore} to use
1248   * @param type type of the <code>TokenizerProperty</code>
1249   */

1250  public SpecialSequencesIterator(StandardTokenizerProperties parent, SequenceStore[] stores, int type) {
1251    _type = type;
1252    _parent = parent;
1253    _stores = stores;
1254  }
1255
1256  /**
1257   * the well known method from the {@link java.util.Iterator} interface.
1258   *
1259   * @return <code>true</code> if there are more {@link TokenizerProperty}
1260   * elements, <code>false</code> otherwise
1261   */

1262  public boolean hasNext() {
1263    synchronized(this) {
1264      if (_currentIterator != null && _currentIterator.hasNext()) {
1265        return true;
1266      }
1267
1268      while (_stores != null && ++_currentIndex < _stores.length) {
1269        if (_stores[_currentIndex] != null) {
1270          _currentIterator = _stores[_currentIndex].getSpecialSequences(_type);
1271          if (_currentIterator.hasNext()) {
1272            return true;
1273          }
1274        }
1275      }
1276      return false;
1277    }
1278  }
1279
1280  /**
1281   * Retrieve the next {@link TokenizerProperty} in this enumeration.
1282   *
1283   * @return a {@link TokenizerProperty} of the desired type or <code>null</code>
1284   * @throws NoSuchElementException if there is no more element in this iterator
1285   */

1286  public Object JavaDoc next() throws NoSuchElementException JavaDoc {
1287    synchronized(this) {
1288      if (! hasNext()) {
1289        throw new NoSuchElementException JavaDoc();
1290      }
1291      _currentElement = (TokenizerProperty)_currentIterator.next();
1292      return _currentElement;
1293    }
1294  }
1295  
1296  /**
1297   * Remove the current special sequence entry from the collection. This is an
1298   * alternative to {@link Tokenizer#removeSpecialSequence}.
1299   *
1300   * @throws IllegalStateExcpetion if {@link #next} has not been called before or
1301   * <code>remove</code> has been called already after the last <code>next</code>.
1302   */

1303  public void remove() throws IllegalStateException JavaDoc {
1304    synchronized(this) {
1305      // if current element is not set
1306
if (_currentElement == null) {
1307        throw new IllegalStateException JavaDoc();
1308      }
1309    
1310      // remove current element
1311
try {
1312        _currentIterator.remove();
1313        _parent.notifyListeners(new TokenizerPropertyEvent(TokenizerPropertyEvent.PROPERTY_REMOVED, _currentElement));
1314        _currentElement = null;
1315      } catch (Exception JavaDoc ex) {
1316        throw new ExtRuntimeException(ex, "While trying to remove current element of a SpecialSequencesIterator.");
1317      }
1318    }
1319  }
1320
1321
1322  // members
1323
private StandardTokenizerProperties _parent = null;
1324  private SequenceStore[] _stores = null;
1325  private TokenizerProperty _currentElement = null;
1326  private Iterator JavaDoc _currentIterator = null;
1327  private int _currentIndex = -1;
1328  private int _type = Token.UNKNOWN;
1329}
1330
1331
1332/**
1333 * An {@link java.util.Iterator} for pattern.
1334 */

1335final class PatternIterator implements Iterator JavaDoc {
1336  /**
1337   * constructor taking the calling {@link TokenizerProperties} object.
1338   *
1339   * @param parent the caller
1340   */

1341  public PatternIterator(StandardTokenizerProperties parent) {
1342    _parent = parent;
1343    synchronized(parent._patterns) {
1344      _iterator = parent._patterns.iterator();
1345    }
1346  }
1347
1348  /**
1349   * the well known method from the {@link java.util.Iterator} interface.
1350   *
1351   * @return <code>true</code> if there are more {@link TokenizerProperty}
1352   * elements, <code>false</code> otherwise
1353   */

1354  public boolean hasNext() {
1355    return _iterator.hasNext();
1356  }
1357
1358  /**
1359   * Retrieve the next {@link TokenizerProperty} in this enumeration.
1360   *
1361   * @return the next keyword as a <code>TokenizerProperty</code>
1362   * @throws NoSuchElementException if there is no more element in this iterator
1363   */

1364  public Object JavaDoc next() throws NoSuchElementException JavaDoc {
1365    synchronized(this) {
1366      _currentData = (PatternMatcher)_iterator.next();
1367      return _currentData.getProperty();
1368    }
1369  }
1370  
1371  /**
1372   * This method is similar to {@link Tokenizer#removeKeyword}
1373   */

1374  public void remove() {
1375    synchronized(this) {
1376      _iterator.remove();
1377      _parent.notifyListeners(new TokenizerPropertyEvent(TokenizerPropertyEvent.PROPERTY_REMOVED, _currentData.getProperty()));
1378    }
1379  }
1380
1381  // members
1382
private StandardTokenizerProperties _parent = null;
1383  private Iterator JavaDoc _iterator = null;
1384  private PatternMatcher _currentData = null;
1385}
1386
Popular Tags