StandardTokenizerProperties


1   /*
2    * StandardTokenizerProperties.java: general-use TokenizerProperties implementation
3    *
4    * Copyright (C) 2002 Heiko Blau
5    *
6    * This file belongs to the JTopas Library.
7    * JTopas is free software; you can redistribute it and/or modify it 
8    * under the terms of the GNU Lesser General Public License as published by the 
9    * Free Software Foundation; either version 2.1 of the License, or (at your 
10   * option) any later version.
11   *
12   * This software is distributed in the hope that it will be useful, but WITHOUT
13   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
14   * FITNESS FOR A PARTICULAR PURPOSE. 
15   * See the GNU Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public License along
18   * with JTopas. If not, write to the
19   *
20   *   Free Software Foundation, Inc.
21   *   59 Temple Place, Suite 330, 
22   *   Boston, MA 02111-1307 
23   *   USA
24   *
25   * or check the Internet: http://www.fsf.org
26   *
27   * Contact:
28   *   email: heiko@susebox.de 
29   */
30  
31  package de.susebox.jtopas;
32  
33  //-----------------------------------------------------------------------------
34  // Imports
35  //
36  import java.util.Arrays  ;
37  import java.util.ArrayList  ;
38  import java.util.Map  ;
39  import java.util.HashMap  ;
40  import java.util.Iterator  ;
41  import java.util.NoSuchElementException  ;
42  
43  import de.susebox.java.lang.ExtRuntimeException;
44  import de.susebox.java.lang.ExtUnsupportedOperationException;
45  import de.susebox.java.lang.ExtIllegalArgumentException;
46  
47  import de.susebox.jtopas.spi.DataMapper;
48  import de.susebox.jtopas.spi.DataProvider;
49  import de.susebox.jtopas.spi.PatternHandler;
50  
51  import de.susebox.jtopas.impl.PatternMatcher;
52  import de.susebox.jtopas.impl.SequenceStore;
53  import de.susebox.jtopas.impl.NoCaseSequenceStore;
54  
55  
56  //-----------------------------------------------------------------------------
57  // Class StandardTokenizerProperties
58  //
59  
60  /**<p>
61   * The class <code>StandardTokenizerProperties</code> provides a simple implementation
62   * of the {@link TokenizerProperties} interface for use in most situations.
63   *</p><p>
64   * Note that this class takes advantage of JTopas features that use Java 1.4 or
65   * higher. It can still be used in older environments but not compiled with JDK
66   * versions below 1.4!
67   *</p>
68   *
69   * @see TokenizerProperties
70   * @see Tokenizer
71   * @author Heiko Blau
72   */
73  public class StandardTokenizerProperties
74    extends     AbstractTokenizerProperties
75    implements  TokenizerProperties, DataMapper
76  {
77    
78    //---------------------------------------------------------------------------
79    // Properties
80    //
81    
  /**
   * Maximum length of a non-free pattern match. Non-free patterns are patterns
   * that don't have the {@link TokenizerProperties#F_FREE_PATTERN} flag set.
   * Number patterns are a common example.
   */
  public static final short MAX_NONFREE_MATCHLEN = 1024;
88    
89    
90    //---------------------------------------------------------------------------
91    // Constructors
92    //
93    
94    /**
95     * Default constructor that intitializes an instance with the default whitespaces
96     * and separator sets. {@link Tokenizer} instances using this <code>StandardTokenizerProperties</code>
97     * object, split text between spaces, tabs and line ending sequences as well
98     * as between punctuation characters.
99     */  
100   public StandardTokenizerProperties() {
101     this(0);
102   }
103 
104   /**
105    * This constructor takes the control flags to be used. It is a shortcut to:
106    * <pre>
107    *   TokenizerProperties props = new StandardTokenizerProperties();
108    *
109    *   props.setParseFlags(flags);
110    * </pre>
111    * See the {@link TokenizerProperties} interface for the supported flags.
112    *<br>
113    * The {@link TokenizerProperties#DEFAULT_WHITESPACES} and 
114    * {@link TokenizerProperties#DEFAULT_SEPARATORS} are used for whitespace and 
115    * separator handling if no explicit calls to {@link #setWhitespaces} and 
116    * {@link #setSeparators} will follow subsequently.
117    *
118    * @param flags     tokenizer control flags
119    * @see   #setParseFlags
120    */  
121   public StandardTokenizerProperties(int flags) {
122     this(flags, DEFAULT_WHITESPACES, DEFAULT_SEPARATORS);
123   }
124   
125   
126   /**
127    * This constructor takes the whitespace and separator sets to be used. It is 
128    * a shortcut to:
129    * <pre>
130    *   TokenizerProperties props = new StandardTokenizerProperties();
131    *
132    *   props.setWhitespaces(ws);
133    *   props.setSeparators(sep);
134    * </pre>
135    *
136    * @param flags       tokenizer control flags
137    * @param whitespaces the whitespace set
138    * @param separators  the set of separating characters
139    * @see   #setParseFlags
140    * @see   #setWhitespaces
141    * @see   #setSeparators
142    */  
143   public StandardTokenizerProperties(int flags, String   whitespaces, String   separators) {
144     Arrays.fill(_charFlags, 0);
145     setParseFlags(flags);
146     setWhitespaces(whitespaces);
147     setSeparators(separators);
148   }
149   
150   
151   //---------------------------------------------------------------------------
152   // Abstract methods of the base class
153   //
154 
155   /**
156    * Retrieving a property by a given type and image. See the method description
157    * in {@link AbstractTokenizerProperties} for details.
158    *
159    * @param   type        the type the returned property should have
160    * @param   startImage  the (starting) image
161    * @return  the token description for the image or <code>null</code>
162    */
163   protected TokenizerProperty doGetProperty(int type, String   startImage) {
164     TokenizerProperty prop = null;
165     
166     switch (type) {
167     case Token.KEYWORD:
168       if (_keywords[0] != null) {
169         prop = _keywords[0].getKeyword(startImage);
170       }
171       if (prop == null && _keywords[1] != null) {
172         prop = _keywords[1].getKeyword(startImage);
173       }
174       break;
175       
176     case Token.STRING:
177     case Token.LINE_COMMENT:
178     case Token.BLOCK_COMMENT:
179     case Token.SPECIAL_SEQUENCE:
180       if (_sequences[0] != null) {
181         prop = _sequences[0].getSpecialSequence(startImage);
182       }
183       if (prop == null && _sequences[1] != null) {
184         prop = _sequences[1].getSpecialSequence(startImage);
185       } 
186       break;
187       
188     case Token.PATTERN:
189       for (int index = 0; index < _patterns.size(); ++index) {
190         PatternMatcher    data = (PatternMatcher)_patterns.get(index);
191 
192         prop = data.getProperty();
193         if (prop.getImages()[0].equals(startImage)) {
194           break;
195         }
196         prop = null;
197       }
198       break;
199 
200     case Token.WHITESPACE:
201     case Token.SEPARATOR:
202     default:
203       throw new ExtIllegalArgumentException("Unsupported property type {0}. (Leading) image \"{1}\".", 
204                                             new Object  [] { new Integer  (type), startImage } );
205     }
206 
207     // either the required property or null
208     return prop;
209   }  
210   
211   
212   /**
213    * Setting a new separator set. See the method description in 
214    * {@link AbstractTokenizerProperties} for details.
215    *
216    * @param   separators    the set of separators including ranges
217    * @return  the replaced separator set or <code>null</code>
218    */
219   protected String   doSetSeparators(String   separators) {
220     String   oldValue;
221 
222     // which separators should be set?
223     if ((_flags & Flags.F_NO_CASE) == 0) {
224       oldValue          = (_separatorsCase.length() > 0) ? _separatorsCase : _separatorsNoCase;
225       _separatorsCase   = separators;
226       _separatorsNoCase = "";
227     } else {
228       oldValue          = (_separatorsNoCase.length() > 0) ? _separatorsNoCase : _separatorsCase;
229       _separatorsCase   = "";
230       _separatorsNoCase = separators;
231     }
232 
233     // mark seaparators in character table
234     putCharSet(oldValue,   Token.SEPARATOR, false);
235     putCharSet(separators, Token.SEPARATOR, true);
236 
237     // normalize the old value
238     if (oldValue == null || oldValue.length() == 0) {
239       return null;
240     } else {
241       return oldValue;
242     }
243   }
244   
245   /**
246    * Setting a new whitespace set. See the method description in 
247    * {@link AbstractTokenizerProperties} for details.
248    *
249    * @param   whitespaces   the set of whitespaces including ranges
250    * @return  the replaced whitespace set or <code>null</code>
251    */
252   protected String   doSetWhitespaces(String   whitespaces) {
253     // set the right whitespaces
254     String   oldValue;
255 
256     if ((_flags & Flags.F_NO_CASE) == 0) {
257       oldValue            = (_whitespacesCase.length() > 0) ? _whitespacesCase : _whitespacesNoCase;
258       _whitespacesCase    = whitespaces;
259       _whitespacesNoCase  = "";
260     } else {
261       oldValue            = (_whitespacesNoCase.length() > 0) ? _whitespacesNoCase : _whitespacesCase;
262       _whitespacesCase    = "";
263       _whitespacesNoCase  = whitespaces;
264     }
265 
266     // mark whitespaces in character table
267     putCharSet(oldValue,    Token.WHITESPACE, false);
268     putCharSet(whitespaces, Token.WHITESPACE, true);
269 
270     // return changes
271     if (oldValue == null || oldValue.length() == 0) {
272       return null;
273     } else {
274       return oldValue;
275     }
276   }
277   
278   /**
279    * Registering a {@link TokenizerProperty}.
280    * See the method description in {@link AbstractTokenizerProperties}.
281    *
282    * @param   property   property to register
283    * @return  the replaced property or <code>null</code>
284    */
285   protected TokenizerProperty doAddProperty(TokenizerProperty property) {
286     switch (property.getType()) {
287     case Token.STRING:
288     case Token.LINE_COMMENT:
289     case Token.BLOCK_COMMENT:
290     case Token.SPECIAL_SEQUENCE:
291       return addSpecialSequence(property);
292 
293     case Token.KEYWORD:
294       return addKeyword(property);
295 
296     case Token.PATTERN:
297       return addPattern(property);
298 
299     case Token.WHITESPACE:
300     case Token.SEPARATOR:
301     default:
302       throw new ExtIllegalArgumentException("Unsupported property type {0}. (Leading) image \"{1}\".", 
303                                             new Object  [] { new Integer  (property.getType()), property.getImages()[0] } );
304     }
305   }
306   
307   /**
308    * Deregistering a {@link TokenizerProperty} from the store.
309    * See the method description in {@link AbstractTokenizerProperties}.
310    *
311    * @param   property    property to remove
312    * @return  the replaced property or <code>null</code>
313    */  
314   protected TokenizerProperty doRemoveProperty(TokenizerProperty property) {
315     // removing property according to type
316     TokenizerProperty prop  = null;
317     String              image = property.getImages()[0];
318     
319     switch (property.getType()) {
320     case Token.LINE_COMMENT:
321     case Token.BLOCK_COMMENT:
322     case Token.STRING:
323     case Token.SPECIAL_SEQUENCE:
324       if (_sequences[0] != null) {
325         prop = _sequences[0].removeSpecialSequence(image);
326       } 
327       if (prop == null && _sequences[1] != null) {
328         prop = _sequences[1].removeSpecialSequence(image);
329       }
330       break;
331 
332     case Token.KEYWORD:
333       if (_keywords[0] != null) {
334         prop = _keywords[0].removeKeyword(image);
335       } 
336       if (prop == null && _keywords[1] != null) {
337         prop = _keywords[1].removeKeyword(image);
338       }
339       break;
340 
341     case Token.PATTERN:
342       for (int index = 0; index < _patterns.size(); ++index) {
343         PatternMatcher    data = (PatternMatcher)_patterns.get(index);
344 
345         prop = data.getProperty();
346         if (prop.getImages()[0].equals(image)) {
347           _patterns.remove(index);
348           break;
349         } else {
350           prop = null;
351         }
352       }
353       break;
354 
355     case Token.WHITESPACE:
356     case Token.SEPARATOR:
357     default:
358       throw new ExtIllegalArgumentException("Unsupported property type {0}. (Leading) image \"{1}\".", 
359                                             new Object  [] { new Integer  (property.getType()), image } );
360     }
361     
362     // return removed property
363     return prop;
364   }
365   
366 
367   //---------------------------------------------------------------------------
368   // Methods of the TokenizerProperties interface
369   //
370   
371   /**
372    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
373    * objects. See the method description in {@link TokenizerProperties}.
374    *
375    * @return enumeration of {@link TokenizerProperty} objects
376    */  
377   public Iterator   getStrings() {
378     return new SpecialSequencesIterator(this, _sequences, Token.STRING);
379   }
380   
381   /**
382    * Obtaining the whitespace character set.
383    * See the method description in {@link TokenizerProperties}.
384    *
385    * @see #setWhitespaces
386    * @return the currently active whitespace set
387    */
388   public String   getWhitespaces() {
389     synchronized(this) {
390       return _whitespacesCase + _whitespacesNoCase;
391     }
392   }
393   
394   /**
395    * Obtaining the separator set of the <code>Tokenizer</code>.
396    * See the method description in {@link TokenizerProperties}.
397    *
398    * @see #setSeparators
399    * @return the currently used set of separating characters
400    */
401   public String   getSeparators() {
402     synchronized(this) {
403       return _separatorsCase + _separatorsNoCase;
404     }
405   }
406   
407   /**
408    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
409    * objects.
410    * See the method description in {@link TokenizerProperties}.
411    *
412    * @return enumeration of {@link TokenizerProperty} objects
413    */  
414   public Iterator   getLineComments() {
415     return new SpecialSequencesIterator(this, _sequences, Token.LINE_COMMENT);
416   }
417   
418   /**
419    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
420    * objects.
421    * See the method description in {@link TokenizerProperties}.
422    *
423    * @return enumeration of {@link TokenizerProperty} objects
424    */  
425   public Iterator   getBlockComments() {
426     return new SpecialSequencesIterator(this, _sequences, Token.BLOCK_COMMENT);
427   }
428   
429   /**
430    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
431    * objects.
432    * See the method description in {@link TokenizerProperties}.
433    *
434    * @return enumeration of {@link TokenizerProperty} objects
435    */  
436   public Iterator   getSpecialSequences() {
437     return new SpecialSequencesIterator(this, _sequences, Token.SPECIAL_SEQUENCE);
438   }
439   
440   /**
441    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
442    * objects.
443    * See the method description in {@link TokenizerProperties}.
444    *
445    * @return iteration of {@link TokenizerProperty} objects
446    */  
447   public Iterator   getKeywords() {
448     return new SpecialSequencesIterator(this, _keywords, Token.KEYWORD);
449   }
450   
451   /**
452    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
453    * objects. Each <code>TokenizerProperty</code> object contains a pattern and 
454    * its companion if such an associated object exists.
455    *
456    * @return enumeration of {@link TokenizerProperty} objects
457    */  
458   public Iterator   getPatterns() {
459     return new PatternIterator(this);
460   }
461   
462 
463   /**
464    * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
465    * objects.
466    * See the method description in {@link TokenizerProperties}.
467    *
468    * @return enumeration of {@link TokenizerProperty} objects
469    */  
470   public Iterator   getProperties() {
471     return new FullIterator(this);
472   }
473   
474   
475   //---------------------------------------------------------------------------
476   // Methods of the DataMapper interface
477   //
478   
479   /**
480    * Setting the backing {@link TokenizerProperties} instance this <code>DataMapper</code> 
481    * is working with. Usually, the <code>DataMapper</code>
482    * interface is implemented by <code>TokenizerProperties</code> implementations,
483    * too. Otherwise the {@link Tokenizer} using the <code>TokenizerProperties</code>, 
484    * will construct a default <code>DataMapper</code> an propagate the 
485    * <code>TokenizerProperties</code> instance by calling this method.
486    *<br>
487    * The method should throw an {@link java.lang.UnsupportedOperationException}
488    * if this <code>DataMapper</code> is an extension to an <code>TokenizerProperties</code>
489    * implementation.
490    *
491    * @param   props   the {@link de.susebox.jtopas.TokenizerProperties}
492    * @throws  UnsupportedOperationException is this is a <code>DataMapper</code>
493    *          implemented by a {@link de.susebox.jtopas.TokenizerProperties}
494    *          implementation
495    * @throws  NullPointerException  if no {@link TokenizerProperties} are given
496    */
497   public void setTokenizerProperties(TokenizerProperties props) 
498     throws UnsupportedOperationException  , NullPointerException  
499   {
500     throw new ExtUnsupportedOperationException(
501                   "Class {0} already defines the {1} interface.",
502                   new Object  [] { StandardTokenizerProperties.class.getName(), 
503                                  DataMapper.class.getName() } );
504   }
505 
506   /**
507    * The method retrieves the backing {@link de.susebox.jtopas.TokenizerProperties}
508    * instance, this <code>DataMapper</code> is working on. For implementations
509    * of the <code>TokenizerProperties</code> interface that also implement the
510    * <code>DataMapper</code> interface, this method returns the instance itself
511    * it is called on.
512    *<br>
513    * Otherwise the method returns the <code>TokenizerProperties</code> instance 
514    * passed through the last call to {@link #setTokenizerProperties} or <code>null</code>
515    * if no such call has taken place so far.
516    *
517    * @return the backing {@link de.susebox.jtopas.TokenizerProperties} or <code>null</code>
518    */
519   public TokenizerProperties getTokenizerProperties() {
520     return this;
521   }
522 
523   /**
524    * This method checks if the character is a whitespace. Implement Your own
525    * code for situations where this default implementation is not fast enough
526    * or otherwise not really good.
527    *
528    * @param testChar  check this character
529    * @return <code>true</code> if the given character is a whitespace,
530    *         <code>false</code> otherwise
531    */
532   public boolean isWhitespace(char testChar) {
533     try {
534       return (_charFlags[testChar] & CHARFLAG_WHITESPACE) != 0;
535     } catch (ArrayIndexOutOfBoundsException   ex) {
536       Integer   extFlags = (Integer  )_extCharFlags.get(new Integer  (testChar));
537       return (extFlags != null && (extFlags.intValue() & CHARFLAG_WHITESPACE) != 0);
538     }
539   }
540       
541  
542   /**
543    * This method detects the number of whitespace characters the data range given
544    * through the {@link DataProvider} parameter starts with.
545    *
546    * @param   dataProvider  the source to get the data range from
547    * @return  number of whitespace characters starting from the given offset
548    * @throws  TokenizerException failure while reading data from the input stream
549    * @throws  NullPointerException  if no {@link DataProvider} is given
550    * @see     de.susebox.jtopas.spi.DataProvider
551    */
552   public int countLeadingWhitespaces(DataProvider dataProvider) throws NullPointerException   {
553     int maxChars = dataProvider.getLength();
554     int len      = 0;
555     
556     while (len < maxChars && isWhitespace(dataProvider.getCharAt(len))) {
557       len++;
558     }
559     return len;
560   }
561   
562  
563   /** 
564    * If a {@link Tokenizer} performs line counting, it is often nessecary to
565    * know if newline characters is considered to be a whitespace. See {@link WhitespaceHandler}
566    * for details.
567    *
568    * @return  <code>true</code> if newline characters are in the current whitespace set,
569    *          <code>false</code> otherwise
570    *
571    */
572   public boolean newlineIsWhitespace() {
573     return   (_charFlags['\n'] & CHARFLAG_WHITESPACE) != 0
574           && (_charFlags['\r'] & CHARFLAG_WHITESPACE) != 0;
575   }  
576   
577 
578   /**
579    * This method checks the given character if it is a separator.
580    *
581    * @param testChar  check this character
582    * @return <code>true</code> if the given character is a separator,
583    *         <code>false</code> otherwise
584    */
585   public boolean isSeparator(char testChar) {
586     try {
587       return (_charFlags[testChar] & CHARFLAG_SEPARATOR) != 0;
588     } catch (ArrayIndexOutOfBoundsException   ex) {
589       Integer   extFlags = (Integer  )_extCharFlags.get(new Integer  (testChar));
590       return (extFlags != null && (extFlags.intValue() & CHARFLAG_SEPARATOR) != 0);
591     }
592   }
593 
594   
595   /**
596    * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation 
597    * for a fast detection if special sequence checking must be performed at all. 
598    * If the method returns <code>false</code> time-consuming preparations can be 
599    * skipped.
600    *
601    * @return  <code>true</code> if there actually are pattern that can be tested
602    *          for a match, <code>false</code> otherwise.
603    */
604   public boolean hasSequenceCommentOrString() {
605     synchronized(_sequences) {
606       return (_sequences[0] != null || _sequences[1] != null);
607     }
608   }
609   
610   /**
611    * This method checks if a given range of data starts with a special sequence,
612    * a comment or a string. These three types of token are testet together since
613    * both comment and string prefixes are ordinary special sequences. Only the 
614    * actions preformed <strong>after</strong> a string or comment has been detected,
615    * are different.
616    *<br>
617    * The method returns <code>null</code> if no special sequence, comment or string 
618    * could matches the the leading part of the data range given through the
619    * {@link DataProvider}.
620    *<br>
621    * In cases of strings or comments, the return value contains the description
622    * for the introducing character sequence, <strong>NOT</strong> the whole
623    * string or comment. The reading of the rest of the string or comment is done
624    * by the calling {@link de.susebox.jtopas.Tokenizer}.
625    *
626    * @param   dataProvider  the source to get the data range from
627    * @return  a {@link de.susebox.jtopas.TokenizerProperty} if a special sequence, 
628    *          comment or string could be detected, <code>null</code> otherwise
629    * @throws  TokenizerException failure while reading more data
630    * @throws  NullPointerException  if no {@link DataProvider} is given
631    */
632   public TokenizerProperty startsWithSequenceCommentOrString(DataProvider dataProvider) 
633     throws TokenizerException, NullPointerException  
634   {
635     // we need the longest possible match
636     synchronized(_sequences) {
637       TokenizerProperty caseProp   = (_sequences[0] != null) ? 
638                                         _sequences[0].startsWithSequenceCommentOrString(dataProvider) : null;
639 
640       TokenizerProperty noCaseProp = (_sequences[1] != null) ? 
641                                         _sequences[1].startsWithSequenceCommentOrString(dataProvider) : null;
642 
643       if (noCaseProp == null) {
644         return caseProp;
645       } else if (caseProp == null) {
646         return noCaseProp;
647       } else if (caseProp.getImages()[0].length() >= noCaseProp.getImages()[0].length()) {
648         return caseProp;
649       } else {
650         return noCaseProp;
651       }
652     }
653   }
654 
655   /**
656    * This method returns the length of the longest special sequence, comment or
657    * string prefix that is known to this <code>SequenceHandler</code>. When
658    * calling {@link #startsWithSequenceCommentOrString}, the passed {@link DataProvider}
659    * parameter will supply at least this number of characters (see {@link DataProvider#getLength}).
660    * If less characters are provided, EOF is reached.
661    *
662    * @return  the number of characters needed in the worst case to identify a 
663    *          special sequence
664    */
665   public int getSequenceMaxLength() {
666     int maxLength = 0;
667 
668     synchronized(_sequences) {
669       if (_sequences[0] != null) {
670         maxLength = _sequences[0].getSequenceMaxLength();
671       }
672       if (_sequences[1] != null && _sequences[1].getSequenceMaxLength() > maxLength) {
673         maxLength = _sequences[1].getSequenceMaxLength();
674       }
675     }
676     return maxLength;
677   }
678 
679   
680   /**
681    * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation 
682    * for a fast detection if keyword matching must be performed at all. If the method
683    * returns <code>false</code> time-consuming preparations can be skipped.
684    *
685    * @return  <code>true</code> if there actually are pattern that can be tested
686    *          for a match, <code>false</code> otherwise.
687    */
688   public boolean hasKeywords() {
689     synchronized(_keywords) {
690       return (_keywords[0] != null || _keywords[1] != null);
691     }
692   }
693   
694   /**
695    * This method checks if the character range given through the 
696    * {@link DataProvider} comprises a keyword.
697    *
698    * @param   dataProvider  the source to get the data from, that are checked
699    * @return  a {@link de.susebox.jtopas.TokenizerProperty} if a keyword could be 
700    *          found, <code>null</code> otherwise
701    * @throws  TokenizerException failure while reading more data
702    * @throws  NullPointerException  if no {@link DataProvider} is given
703    */
704   public TokenizerProperty isKeyword(DataProvider dataProvider)
705     throws TokenizerException, NullPointerException  
706   {
707     synchronized(_keywords) {
708       TokenizerProperty prop;
709     
710       if (_keywords[0] != null) {
711         prop = _keywords[0].isKeyword(dataProvider);
712       } else {
713         prop = null;
714       }
715       if (prop == null && _keywords[1] != null) {
716         prop = _keywords[1].isKeyword(dataProvider);
717       }
718       return prop;
719     }
720   }
721   
722   
723   /**
724    * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation 
725    * for a fast detection if pattern matching must be performed at all. If the method
726    * returns <code>false</code> time-consuming preparations can be skipped.
727    *
728    * @return  <code>true</code> if there actually are pattern that can be tested
729    *          for a match, <code>false</code> otherwise.
730    */
731   public boolean hasPattern() {
732     synchronized(_patterns) {
733       return (_patterns.size() > 0);
734     }
735   }
736     
737   /**
738    * This method checks if the start of a character range given through the 
739    * {@link DataProvider} matches a pattern.
740    *
741    * @param   dataProvider    the source to get the data from
742    * @return  a {@link PatternHandler.Result} object or <code>null</code> if no
743    *          match was found
744    * @throws  TokenizerException    generic exception
745    * @throws  NullPointerException  if no {@link DataProvider} is given
746    */
747   public PatternHandler.Result matches(DataProvider dataProvider)
748     throws TokenizerException, NullPointerException  
749   {
750     synchronized(_patterns) {
751       int                   longestMatch = 0;
752       PatternHandler.Result bestResult   = null;
753       
754       // only get the string if pattern are available
755       for (int index = 0; index < _patterns.size(); ++index) {
756         PatternMatcher        data = (PatternMatcher)_patterns.get(index);
757         PatternHandler.Result result = data.matches(dataProvider);
758 
759         if (result != null) {
760           if (bestResult == null || bestResult.getLengthOfMatch() < result.getLengthOfMatch()) {
761             bestResult = result;
762           }
763         }
764       }
765       
766       // return the best result
767       return bestResult;
768     } 
769   }
770 
771   
772   //---------------------------------------------------------------------------
773   // Implementation
774   //
775 
776   /**
777    * Registering a pattern with an associated object. The method assumes that the 
778    * given pattern property has been checked for not being null, having a non-empty 
779    * pattern image and normalized flags ({@link AbstractTokenizerProperties#normalizeFlags}).
780    * See the method description in {@link AbstractTokenizerProperties}.
781    *
782    * @param   patternProp     the regular expression to be added
783    * @return  the replaced pattern property or <code>null</code>
784    * @throws  IllegalArgumentException if pattern matching is not available
785    */
786   protected TokenizerProperty addPattern(TokenizerProperty patternProp) throws IllegalArgumentException   {
787     // construct the pattern
788     PatternMatcher  data = null;
789     String            pattern = patternProp.getImages()[0];
790     
791     try {
792       data = new PatternMatcher(patternProp, getParseFlags());
793     } catch (Throwable   ex) {
794       throw new ExtIllegalArgumentException(ex, "Pattern matching is not available (use JDK 1.4 or above).");
795     }
796                                                       
797     // Register pattern. First search for existing one
798     for (int index = 0; index < _patterns.size(); ++index) {
799       PatternMatcher    oldData = (PatternMatcher)_patterns.get(index);
800       TokenizerProperty oldProp = oldData.getProperty();
801 
802       if (oldProp.getImages()[0].equals(pattern)) {
803         _patterns.set(index, data);
804         return oldProp;
805       }
806     }
807 
808     // not found -> register new pattern
809     _patterns.add(data);
810     return null;
811   }
812   
813   /**
814    * Registering a keyword property. The method assumes that the given keyword 
815    * property has been checked for not being null, having a non-empty keyword 
816    * image and normalized flags ({@link AbstractTokenizerProperties#normalizeFlags}).
817    *
818    * @param   keywordProp   keyword property to register
819    * @return  the replaced keyword property or <code>null</code>
820    */  
821   protected TokenizerProperty addKeyword(TokenizerProperty keywordProp) {
822     // case-sensitive keyword?
823     boolean noCase   = isFlagSet(keywordProp, Flags.F_NO_CASE);
824     int     arrayIdx = noCase ? 1 : 0;
825 
826     // first keyword?
827     if (_keywords[arrayIdx] == null) {
828       if (noCase) {
829         _keywords[arrayIdx] = new NoCaseSequenceStore(true);
830       } else {
831         _keywords[arrayIdx] = new SequenceStore(true);
832       }
833     }
834 
835     // add / replace property
836     return _keywords[arrayIdx].addKeyword(keywordProp);
837   }
838   
839   
840   /**
841    * This method adds or replaces strings, comments and ordinary special sequences.
842    * The method assumes that the given special sequence property has been checked 
843    * for not being null, having a non-empty imagesand normalized flags 
844    * ({@link AbstractTokenizerProperties#normalizeFlags}).
845    *
846    * @param   property  the description of the new sequence
847    * @return  the replaced special sequence property or <code>null</code>
848    */
849   protected TokenizerProperty addSpecialSequence(TokenizerProperty property) {
850     // case-sensitive sequence?
851     boolean noCase   = isFlagSet(property, Flags.F_NO_CASE);
852     int     arrayIdx = noCase ? 1 : 0;
853 
854     // first special sequence?
855     if (_sequences[arrayIdx] == null) {
856       if (noCase) {
857         _sequences[arrayIdx] = new NoCaseSequenceStore(false);      
858       } else {
859         _sequences[arrayIdx] = new SequenceStore(false);      
860       }
861     }
862 
863     // add / replace property
864     return _sequences[arrayIdx].addSpecialSequence(property);
865   }
866   
867   /**
868    * Set or removes the flags corresponding to type and case-sensitivity from the
869    * character flags tables.
870    *
871    * @param set   the character set to handle (may contain ranges)
872    * @param type  token type fro the characters ({@link Token#WHITESPACE} or {@link Token#SEPARATOR})
873    * @param setIt if <code>true</code> the approbriate flags will be set, otherwise removed
874    */
875   private void putCharSet(String   set, int type, boolean setIt) {
876     // which flags ?
877     int charFlags = 0;
878     
879     switch (type) {
880     case Token.WHITESPACE:
881       charFlags = CHARFLAG_WHITESPACE;
882       break;
883     case Token.SEPARATOR:
884       charFlags = CHARFLAG_SEPARATOR;
885       break;
886     }
887     
888     // analyze the given set
889     int   length = (set != null) ? set.length() : 0;
890     char  start, end, setChar;
891     
892     for (int ii = 0; ii < length; ++ii)  {
893       setChar = set.charAt(ii);
894 
895       switch (setChar) {
896       case '-':
897         start = (ii > 0) ? set.charAt(ii - 1) : 0;
898         end   = (ii < length - 1) ? set.charAt(ii + 1) : 0xFFFF;
899         ii += 2; 
900         break;
901 
902       case '\\':
903         setChar = (ii + 1 >= length) ? 0 : set.charAt(ii + 1);
904         ii++;
905         /* no break */
906 
907       default:
908         start = end = setChar;
909       }
910       
911       // put flags
912       for (char index = start; index <= end; ++index) {
913         char currChar = index;
914         
915         do {
916           if (currChar < _charFlags.length) {
917             // one-byte characters 
918             if (setIt) {
919               _charFlags[currChar] |= charFlags;
920             } else {
921               _charFlags[currChar] &= ~charFlags;
922             }
923             
924           } else {
925             // longer characters
926             Integer   key      = new Integer  (currChar);
927             Integer   extFlags = (Integer  )_extCharFlags.get(key);
928 
929             if (setIt) {
930               extFlags = new Integer  (extFlags.intValue() | charFlags);
931             } else {
932               extFlags = new Integer  (extFlags.intValue() & ~charFlags);
933             }
934             _extCharFlags.put(key, extFlags);
935           }
936           
937           // settings must be also done for the upper/lowercase variant 
938           if (Character.isLowerCase(currChar)) {
939             currChar = Character.toUpperCase(currChar);
940           } else if (Character.isUpperCase(currChar)) {
941             currChar = Character.toLowerCase(currChar);
942           }
943         } while ((_flags & Flags.F_NO_CASE) != 0 && currChar != index);
944       }
945     }
946   }
947   
948   
949   //---------------------------------------------------------------------------
950   // Class members
951   //
952   
953   /**
954    * character flag for whitespaces
955    */
956   public static final int CHARFLAG_WHITESPACE = 1;
957   
958   /**
959    * character flag for whitespaces
960    */
961   public static final int CHARFLAG_SEPARATOR = 2;
962 
963   
964   //---------------------------------------------------------------------------
965   // Members
966   //
967   
968   /**
969    * array containing the flags for whitespaces and separators
970    */
971   protected int _charFlags[] = new int[256];
972   
973   /**
974    * Map with flags for characters beyond 256;
975    */
976   protected HashMap   _extCharFlags = new HashMap  ();
977    
978   /**
979    * current whitespace characters including character ranges.
980    */
981   protected String   _whitespacesCase = DEFAULT_WHITESPACES;
982   
983   /**
984    * current whitespace characters including character ranges. Case is ignored.
985    */
986   protected String   _whitespacesNoCase = "";
987   
988   /**
989    * current separator characters including character ranges.
990    */
991   protected String   _separatorsCase = DEFAULT_SEPARATORS;
992   
993   /**
994    * current separator characters including character ranges. Case is ignored.
995    */
996   protected String   _separatorsNoCase = "";
997   
998   /**
999    * The first element is the {@link de.susebox.jtopas.impl.SequenceStore} for 
1000   * the case-sensitive sequences, the second is for the case-insensitive ones.
1001   */
1002  protected SequenceStore[] _sequences = new SequenceStore[2];
1003  
1004  /**
1005   * Like the array {@link #_sequences} this two-element Array contains two
1006   * {@link de.susebox.jtopas.impl.SequenceStore}, the first for the case-sensitive 
1007   * keywords, the second for the case-insensitive ones.
1008   */
1009  protected SequenceStore[] _keywords = new SequenceStore[2];
1010  
1011  /**
1012   * This array contains the patterns
1013   */
1014  protected ArrayList   _patterns = new ArrayList  ();
1015  
1016  /**
1017   * Which regular expression parser to use
1018   */
1019  private Class   _patternClass = null;
1020
1021  /**
1022   * A buffer used for pattern matching
1023   */
1024  private StringBuffer   _foundMatch = new StringBuffer  ();
1025}
1026
1027
1028
1029//---------------------------------------------------------------------------
1030// inner classes
1031//
1032
1033/**
1034 * Instances of this inner class are returned when a call to 
1035 * {@link TokenizerProperties#getProperties}.
1036 * Each element of the enumeration contains a {@link TokenizerProperty} element.
1037 */
1038final class FullIterator implements Iterator   {
1039  
1040  /**
1041   * constructor taking the calling {@link TokenizerProperties} object to retrieve
1042   * the members holding {@link TokenizerProperty} elements which are iterated by 
1043   * this <code>FullIterator</code> instance.
1044   *
1045   * @param caseSensitiveMap  map with properties where case matters
1046   * @param caseSensitiveMap  map with properties where case doesn't matter
1047   */
1048  public FullIterator(StandardTokenizerProperties parent) {
1049    _parent = parent;
1050    
1051    // create list of iterators
1052    _iterators    = new Object  [3];
1053    _iterators[0] = new SpecialSequencesIterator(parent, parent._keywords, Token.KEYWORD);
1054    _iterators[1] = new SpecialSequencesIterator(parent, parent._sequences, 0);
1055    _iterators[2] = new PatternIterator(parent);
1056    _currIndex    = 0;
1057  }
1058
1059  /**
1060   * Test wether there is another element in the iterated set or not. See
1061   * {@link java.util.Iterator} for details.
1062   *
1063   * @return <code>true</code>if another call to {@link #next} will return an object,
1064   *        <code>false</code> otherwise
1065   */
1066  public boolean hasNext() {
1067    synchronized(this) {
1068      while (_currIndex < _iterators.length) {
1069        Iterator   iter = (Iterator  )_iterators[_currIndex];
1070
1071        if (iter.hasNext()) {
1072          return true;
1073        }
1074        _currIndex++;
1075      }
1076      return false;
1077    }
1078  }
1079  
1080  /**
1081   * Retrieve the next element in the iterated set. See {@link java.util.Iterator} 
1082   * for details.
1083   *
1084   * @return the next element or <code>null</code> if there is none
1085   */
1086  public Object   next() {
1087    if (hasNext()) {
1088      synchronized(this) {
1089        Iterator   iter = (Iterator  )_iterators[_currIndex];
1090        return iter.next();
1091      }
1092    } else {
1093      return null;
1094    }
1095  }
1096  
1097  /**
1098   * Retrieve the next element in the iterated set. See {@link java.util.Iterator} 
1099   * for details.
1100   *
1101   * @return the next element or <code>null</code> if there is none
1102   */
1103  public void remove() {
1104    if (_currIndex < _iterators.length) {
1105      Iterator   iter = (Iterator  )_iterators[_currIndex];
1106      iter.remove();
1107    }
1108  }
1109  
1110  
1111  // members
1112  private StandardTokenizerProperties _parent     = null;
1113  private Object  []                    _iterators  = null;
1114  private int                         _currIndex  = -1;
1115}
1116
1117/**
1118 * Instances of this inner class are returned when a call to {@link TokenizerProperties#getKeywords}
1119 * or {@link TokenizerProperties#getPatterns}.
1120 * Each element of the enumeration contains a {@link TokenizerProperty} element,
1121 * that in turn has the keyword or a pattern with its companion
1122 */
1123final class MapIterator implements Iterator   {
1124
1125  /**
1126   * constructor taking the a case-sensitive and a case-insensitive {@link java.util.Map}
1127   * which are iterated by this <code>MapIterator</code> instance.
1128   *
1129   * @param caseSensitiveMap  map with properties where case matters
1130   * @param caseSensitiveMap  map with properties where case doesn't matter
1131   */
1132  public MapIterator(StandardTokenizerProperties parent, Map   caseSensitiveMap, Map   caseInsensitiveMap) {
1133    synchronized(this) {
1134      _parent = parent;
1135      if (caseSensitiveMap != null) {
1136        _iterators[0] = caseSensitiveMap.values().iterator();
1137      }
1138      if (caseInsensitiveMap != null) {
1139        _iterators[1] = caseInsensitiveMap.values().iterator();
1140      }
1141    }
1142  }
1143
1144  /**
1145   * the well known method from the {@link java.util.Iterator} interface.
1146   *
1147   * @return <code>true</code> if there are more {@link TokenizerProperty}
1148   *         elements, <code>false</code> otherwise
1149   */
1150  public boolean hasNext() {
1151    // check the current array
1152    synchronized(_iterators) {
1153      if (_iterators[0] != null) {
1154        if (_iterators[0].hasNext()) {
1155          return true;
1156        } else {
1157          _iterators[0] = null;
1158        }
1159      }
1160      if (_iterators[1] != null) {
1161        if (_iterators[1].hasNext()) {
1162          return true;
1163        } else {
1164          _iterators[1] = null;
1165        }
1166      }
1167      return false;
1168    }
1169  }
1170
1171  /**
1172   * Retrieve the next {@link TokenizerProperty} in this enumeration. 
1173   *
1174   * @return the next keyword as a <code>TokenizerProperty</code>
1175   * @throws NoSuchElementException if there is no more element in this iterator
1176   */
1177  public Object   next() {
1178    if ( ! hasNext()) {
1179      throw new NoSuchElementException  ();
1180    }
1181    
1182    synchronized(this) {
1183      if (_iterators[0] != null) {
1184        _currentData = (TokenizerProperty)_iterators[0].next();
1185      } else {
1186        _currentData = (TokenizerProperty)_iterators[1].next();
1187      }
1188      return _currentData;
1189    }
1190  }
1191  
1192  /**
1193   * This method is similar to {@link Tokenizer#removeKeyword}.
1194   *
1195   * @throws  IllegalStateExcpetion if {@link #next} has not been called before or
1196   *          <code>remove</code> has been called already after the last <code>next</code>.
1197   */
1198  public void remove() {
1199    synchronized(this) {
1200      // if current element is not set
1201      if (_currentData == null) {
1202        throw new IllegalStateException  ();
1203      }
1204    
1205      if (_iterators[0] != null) {
1206        _iterators[0].remove();
1207      } else {
1208        _iterators[1].remove();
1209      }
1210      _parent.notifyListeners(new TokenizerPropertyEvent(TokenizerPropertyEvent.PROPERTY_REMOVED, _currentData));
1211      _currentData = null;
1212    }
1213  }
1214
1215  // members
1216  private StandardTokenizerProperties _parent     = null;
1217  private Iterator  []                  _iterators  = new Iterator  [2];
1218  private TokenizerProperty           _currentData   = null;
1219}
1220
1221
1222
1223/**
1224 * Iterator for comments, strings and special sequences.
1225 * Instances of this inner class are returned when a call to one of the methods
1226 *<ul><li>
1227 *    {@link #getBlockComments}
1228 *</li><li>
1229 *    {@link #getLineComments}
1230 *</li><li>
1231 *    {@link #getStrings}
1232 *</li><li>
1233 *    {@link #getSpecialSequences}
1234 *</li></ul>
1235 * is done. Each element of the enumeration contains a {@link TokenizerProperty}
1236 * element, that in turn has the comment, special sequence etc. together with
1237 * its companion
1238 */
1239final class SpecialSequencesIterator implements Iterator   {
1240
1241  /**
1242   * constructor taking the calling <code>Tokenizer</code> and the type of the
1243   * {@link TokenizerProperty}. If the type is 0 then special sequences, line and 
1244   * block comments are returned in one iterator
1245   *
1246   * @param parent  the calling tokenizer
1247   * @param stores  which array of {@link de.susebox.jtopas.impl.SequenceStore} to use
1248   * @param type    type of the <code>TokenizerProperty</code> 
1249   */
1250  public SpecialSequencesIterator(StandardTokenizerProperties parent, SequenceStore[] stores, int type) {
1251    _type   = type;
1252    _parent = parent;
1253    _stores = stores;
1254  }
1255
1256  /**
1257   * the well known method from the {@link java.util.Iterator} interface.
1258   *
1259   * @return <code>true</code> if there are more {@link TokenizerProperty}
1260   *         elements, <code>false</code> otherwise
1261   */
1262  public boolean hasNext() {
1263    synchronized(this) {
1264      if (_currentIterator != null && _currentIterator.hasNext()) {
1265        return true;
1266      }
1267
1268      while (_stores != null && ++_currentIndex < _stores.length) {
1269        if (_stores[_currentIndex] != null) {
1270          _currentIterator = _stores[_currentIndex].getSpecialSequences(_type);
1271          if (_currentIterator.hasNext()) {
1272            return true;
1273          }
1274        }
1275      }
1276      return false;
1277    }
1278  }
1279
1280  /**
1281   * Retrieve the next {@link TokenizerProperty} in this enumeration.
1282   *
1283   * @return a {@link TokenizerProperty} of the desired type or <code>null</code>
1284   * @throws NoSuchElementException if there is no more element in this iterator
1285   */
1286  public Object   next() throws NoSuchElementException   {
1287    synchronized(this) {
1288      if (! hasNext()) {
1289        throw new NoSuchElementException  ();
1290      }
1291      _currentElement = (TokenizerProperty)_currentIterator.next();
1292      return _currentElement;
1293    }
1294  }
1295  
1296  /**
1297   * Remove the current special sequence entry from the collection. This is an
1298   * alternative to {@link Tokenizer#removeSpecialSequence}.
1299   *
1300   * @throws  IllegalStateExcpetion if {@link #next} has not been called before or
1301   *          <code>remove</code> has been called already after the last <code>next</code>.
1302   */
1303  public void remove() throws IllegalStateException   {
1304    synchronized(this) {
1305      // if current element is not set
1306      if (_currentElement == null) {
1307        throw new IllegalStateException  ();
1308      }
1309    
1310      // remove current element
1311      try {
1312        _currentIterator.remove();
1313        _parent.notifyListeners(new TokenizerPropertyEvent(TokenizerPropertyEvent.PROPERTY_REMOVED, _currentElement));
1314        _currentElement = null;
1315      } catch (Exception   ex) {
1316        throw new ExtRuntimeException(ex, "While trying to remove current element of a SpecialSequencesIterator.");
1317      }
1318    }
1319  }
1320
1321
1322  // members
1323  private StandardTokenizerProperties _parent           = null;
1324  private SequenceStore[]             _stores           = null;
1325  private TokenizerProperty           _currentElement   = null;
1326  private Iterator                      _currentIterator  = null;
1327  private int                         _currentIndex     = -1;
1328  private int                         _type             = Token.UNKNOWN;
1329}
1330
1331
1332/**
1333 * An {@link java.util.Iterator} for pattern.
1334 */
1335final class PatternIterator implements Iterator   {
1336  /**
1337   * constructor taking the calling {@link TokenizerProperties} object.
1338   *
1339   * @param parent  the caller
1340   */
1341  public PatternIterator(StandardTokenizerProperties parent) {
1342    _parent   = parent;
1343    synchronized(parent._patterns) {
1344      _iterator = parent._patterns.iterator();
1345    }
1346  }
1347
1348  /**
1349   * the well known method from the {@link java.util.Iterator} interface.
1350   *
1351   * @return <code>true</code> if there are more {@link TokenizerProperty}
1352   *         elements, <code>false</code> otherwise
1353   */
1354  public boolean hasNext() {
1355    return _iterator.hasNext();
1356  }
1357
1358  /**
1359   * Retrieve the next {@link TokenizerProperty} in this enumeration. 
1360   *
1361   * @return  the next keyword as a <code>TokenizerProperty</code>
1362   * @throws NoSuchElementException if there is no more element in this iterator
1363   */
1364  public Object   next() throws NoSuchElementException   {
1365    synchronized(this) {
1366      _currentData = (PatternMatcher)_iterator.next();
1367      return _currentData.getProperty();
1368    }
1369  }
1370  
1371  /**
1372   * This method is similar to {@link Tokenizer#removeKeyword}
1373   */
1374  public void remove() {
1375    synchronized(this) {
1376      _iterator.remove();
1377      _parent.notifyListeners(new TokenizerPropertyEvent(TokenizerPropertyEvent.PROPERTY_REMOVED, _currentData.getProperty()));
1378    }
1379  }
1380
1381  // members
1382  private StandardTokenizerProperties _parent = null;
1383  private Iterator                      _iterator = null;
1384  private PatternMatcher              _currentData = null;
1385}
1386
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags