StandardTokenizer


1   /*
2    * StandardTokenizer.java: core class for lexical parser.
3    *
4    * Copyright (C) 2001 Heiko Blau
5    *
6    * This file belongs to the JTopas Library.
7    * JTopas is free software; you can redistribute it and/or modify it 
8    * under the terms of the GNU Lesser General Public License as published by the 
9    * Free Software Foundation; either version 2.1 of the License, or (at your 
10   * option) any later version.
11   *
12   * This software is distributed in the hope that it will be useful, but WITHOUT
13   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
14   * FITNESS FOR A PARTICULAR PURPOSE. 
15   * See the GNU Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public License along
18   * with JTopas. If not, write to the
19   *
20   *   Free Software Foundation, Inc.
21   *   59 Temple Place, Suite 330, 
22   *   Boston, MA 02111-1307 
23   *   USA
24   *
25   * or check the Internet: http://www.fsf.org
26   *
27   * Contact:
28   *   email: heiko@susebox.de 
29   */
30  
31  package de.susebox.jtopas;
32  
33  //-----------------------------------------------------------------------------
34  // Imports
35  //
36  import de.susebox.java.lang.ExtIndexOutOfBoundsException;
37  import de.susebox.jtopas.spi.DataProvider;
38  
39  
40  //-----------------------------------------------------------------------------
41  // Class StandardTokenizer
42  //
43  
44  /**<p>
45   * This is the mainstream {@link Tokenizer}. It implements the {@link Tokenizer}
46   * interface in a straightforward approach without too specialized parse
47   * optimizations.
48   * </p><p>
49   * Beside the {@link Tokenizer} interface, the class <code>StandardTokenizer</code>
50   * provides some basic features for cascading (nested) tokenizers. Consider the usual
51   * HTML pages found today in the WWW. Most of them are a mixture of regular HTML,
52   * cascading style sheets (CSS) and embedded JavaScript. These different languages
53   * use different syntaxes, so one needs varous tokenizers on the same input stream.
54   *</p><p>
55   * This {@link Tokenizer} implementation is not synchronized. Take care when using
56   * with multible threads.
57   *</p>
58   *
59   * @see Tokenizer
60   * @see TokenizerProperties
61   * @author Heiko Blau
62   */
63  public class StandardTokenizer 
64    extends AbstractTokenizer 
65    implements Tokenizer, TokenizerPropertyListener 
66  {
67    //---------------------------------------------------------------------------
68    // Constructors
69    //
70    
71    /**
72     * Default constructor that sets the tokenizer control flags as it would be
73     * approbriate for C/C++ and Java. Found token images are copied. No line nor
74     * column informations are provided. Nested comments are not allowed.
75     *<br>
76     * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES} 
77     * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and 
78     * separator handling.
79     */  
80    public StandardTokenizer() {}
81    
82    /**
83     * Contructing a <code>StandardTokenizer</code> with a backing {@link TokenizerProperties}
84     * instance.
85     *
86     * @param properties  an {@link TokenizerProperties} object containing the 
87     *                    settings for the tokenizing process
88     */
89    public StandardTokenizer(TokenizerProperties properties) {
90      super.setTokenizerProperties(properties);
91    }
92  
93    
94    //---------------------------------------------------------------------------
95    // Methods of the Tokenizer interface
96    //
97    
98    /**
99     * This method returns the absolute offset in characters to the start of the
100    * parsed stream. See the method description in {@link Tokenizer}.
101    *
102    * @return the absolute offset of the current text window in characters from 
103    *         the start of the data source of the Tokenizer
104    * @see #getReadPosition
105    */
106   public int getRangeStart() {
107     return _rangeStart;
108   }
109   
110   /**
111    * Additionally to the common behaviour implemented in 
112    * {@link #de.susebox.jtopas.AbstractTokenizer#setSource}, this method ajusts
113    * the state speicific to the <code>StandardTokenizer</code> class.
114    *
115    * @param source  a {@link TokenizerSource} to read data from
116    */
117   public void setSource(TokenizerSource source) {
118     super.setSource(source);
119     _hasBeenRead = false;
120     _rangeStart  = 0;
121     try {
122       _charSequenceTokenizerSource = (CharSequenceTokenizerSource)getSource();
123       _dataProvider                = new StringDataProvider(_charSequenceTokenizerSource, 0, 0);
124     } catch (ClassCastException   ex) {
125       _charSequenceTokenizerSource = null;
126       _dataProvider = new CharArrayDataProvider(_inputBuffer, 0, 0);
127     }
128   }
129 
130   /**
131    * Closing this tokenizer frees resources.
132    */
133   public void close() {
134     _inputBuffer                  = null;
135     _rangeStart                   = 0;
136     _hasBeenRead                  = false;
137     _charSequenceTokenizerSource  = null;
138     _dataProvider                 = null;
139     super.close();
140   }
141   
142   
143   //---------------------------------------------------------------------------
144   // Implementation
145   //
146 
147   /**
148    * Implements the abstract method of the base class. 
149    *
150    * @param startPos    position in the input data
151    * @param length      number of characters
152    */
153   protected DataProvider getDataProvider(int startPos, int length) {
154     _dataProvider.setDataRange(startPos - getRangeStart(), length);
155     return _dataProvider;
156   }
157 
158   /**
159    * This method organizes the input buffer. It moves the current text window if
160    * nessecary or allocates more space, if data should be kept completely (see the
161    * {@link TokenizerProperties#F_KEEP_DATA} flag).
162    * Its main purpose is to call the {@link TokenizerSource#read} method.
163    *
164    * @return  number of read bytes or -1 if an end-of-file condition occured
165    * @throws  TokenizerException wrapped exceptions from the {@link TokenizerSource#read} 
166    *          method
167    */
168   protected int readMoreData() throws TokenizerException  {
169     if (_charSequenceTokenizerSource != null) {
170       // new CharSequenceTokenizerSource
171       if (_hasBeenRead || _charSequenceTokenizerSource.length() <= 0) {
172         return -1;
173       } else {
174         _hasBeenRead = true;
175         return _charSequenceTokenizerSource.length();
176       }
177     
178     } else {
179       // no input buffer so far
180       if (_inputBuffer == null) {
181         if (isFlagSet(Flags.F_KEEP_DATA)) {
182           _inputBuffer = new char[LARGE_BUFFER_INITSIZE];   // 64k
183         } else {
184           _inputBuffer = new char[SMALL_BUFFER_INITSIZE];    // 8k
185         }
186         ((CharArrayDataProvider)_dataProvider).setData(_inputBuffer);
187       }
188 
189       // this is a good moment to move already read data if the write position is
190       // near the end of the buffer and there is a certain space before the current
191       // read position
192       int readPos  = getReadPosition() - getRangeStart();
193       int writePos = currentlyAvailable();
194 
195       if ( ! isFlagSet(Flags.F_KEEP_DATA)) {
196         if ((readPos > _inputBuffer.length / 4) && (writePos > (3 * _inputBuffer.length) / 4)) {
197           reorganizeInputBuffer(_inputBuffer);
198           writePos = currentlyAvailable();
199         }
200       }
201 
202       // if there is no space any more and data couldn't be moved (see above)
203       // we need a new input buffer
204       if (writePos >= _inputBuffer.length) {
205         _inputBuffer = reorganizeInputBuffer(new char[_inputBuffer.length * 2]);
206         writePos     = currentlyAvailable();
207         ((CharArrayDataProvider)_dataProvider).setData(_inputBuffer);
208       }
209 
210       // read data
211       int chars = 0;
212 
213       while (chars == 0) {
214         try {
215           chars = getSource().read(_inputBuffer, writePos, _inputBuffer.length - writePos);
216         } catch (Exception   ex) {
217           throw new TokenizerException(ex);
218         }
219       }
220       return chars;
221     }
222   }
223   
224   /**
225    * Move data in the input buffer and adjust various position values.
226    */
227   private char[] reorganizeInputBuffer(char[] newBuffer) {
228     int readPos  = getReadPosition() - getRangeStart();
229     int writePos = currentlyAvailable();
230 
231     if ( ! isFlagSet(Flags.F_KEEP_DATA)) {
232       System.arraycopy(_inputBuffer, readPos, newBuffer, 0, writePos - readPos);
233       _rangeStart += readPos;
234     } else {
235       System.arraycopy(_inputBuffer, 0, newBuffer, 0, writePos);
236     }
237     return newBuffer;
238   }
239 
240   
241   //---------------------------------------------------------------------------
242   // Inner classes
243   //
244 
245   /**
246    * Base class for the various implementations of the 
247    * {@link de.susebox.jtopas.spi.DataProvider} interface for the {@link StandardTokenizer}.
248    */
249   private abstract class AbstractDataProvider implements DataProvider {
250 
251     /**
252      * The constructor takes the nessecary parameters for the methods defined
253      * below
254      *
255      * @param startPosition   valid data start here
256      * @param length          count of characters starting at startPosition
257      */
258     public AbstractDataProvider(int startPosition, int length) {
259       setDataRange(startPosition, length);
260     }
261 
262     /**
263      * Retrieving the position where the data to analyze start in the buffer provided
264      * by {@link #getData}. The calling {@link de.susebox.jtopas.spi.DataMapper} 
265      * must not access data prior to this index in the character array.
266      *
267      * @return  index in the character array returned by {@link #getData}, where data starts
268      */
269     public int getStartPosition() {
270       return _startPosition;
271     }
272 
273     /**
274      * Retrieving the maximum number of characters in the array provided by {@link getData}
275      * that can be analyzed by the calling {@link de.susebox.jtopas.spi.DataMapper}.
276      *
277      * @param testChar  check this character
278      * @return <code>true</code> if the given character is a separator,
279      *         <code>false</code> otherwise
280      */
281     public int getLength() {
282       return _length;
283     }
284 
285     /**
286      * Setting the start position and the length in the data buffer of this 
287      * instance.
288      *
289      * @param startPosition   valid data start here
290      * @param length          count of characters starting at startPosition
291      */
292     protected void setDataRange(int startPosition, int length) {
293       _startPosition  = startPosition;
294       _length         = length;
295     }
296 
297     // Members
298     protected int _startPosition;
299     protected int _length;
300   }
301 
302   /**
303    * Implementation of the {@link de.susebox.jtopas.spi.DataProvider} interface 
304    * for the {@link StandardTokenizer}.
305    */
306   private final class CharArrayDataProvider extends AbstractDataProvider implements DataProvider {
307 
308     /**
309      * The constructor takes the nessecary parameters for the methods defined
310      * below
311      */
312     public CharArrayDataProvider(char[] data, int startPosition, int length) {
313       super(startPosition, length);
314       setData(data);
315       _dataAsString = null;
316     }
317 
318     /**
319      * See {@link de.susebox.jtopas.spi.DataProvider#getCharAt} for details.
320      *
321      * @param   index   an index between 0 and {@link #getLength} 
322      * @return  the character at the given position
323      */
324     public char getCharAt(int index) {
325       return _data[_startPosition + index];
326     }
327 
328     /**
329      * See {@link de.susebox.jtopas.spi.DataProvider#getData} for details.
330      *
331      * @return the character buffer to read data from
332      */
333     public char[] getData() {
334       return _data;
335     }
336 
337     /**
338      * See {@link de.susebox.jtopas.spi.DataProvider#getDataCopy} for details.
339      *
340      * @return  a copy of the valid data of this {@link DataProvider}
341      * @see #getData
342      * @see #toString
343      */
344     public char[] getDataCopy() {
345       char[] copy = new char[getLength()];
346 
347       System.arraycopy(_data, getStartPosition(), copy, 0, copy.length);
348       return copy;
349     }
350 
351     /**
352      * Returning the valid data range of this <code>DataProvider</code> as a string.
353      * This method is an alternative to {@link #getDataCopy}.
354      *
355      * @return the string representation of the valid data range
356      */
357     public String   toString() {
358       if (_dataAsString == null) {
359         if (_data != null) {
360           _dataAsString = new String  (_data, _startPosition, _length);
361         } else {
362           _dataAsString = "";
363         }
364       }
365       return _dataAsString;
366     }
367 
368     /**
369      * Setting the data buffer of this instance.
370      */
371     protected void setData(char[] data) {
372       _data = data;
373     }
374 
375     /**
376      * Setting the start position and the length in the data buffer of this 
377      * instance.
378      *
379      * @param startPosition   valid data start here
380      * @param length          count of characters starting at startPosition
381      */
382     protected void setDataRange(int startPosition, int length) {
383       super.setDataRange(startPosition, length);
384       _dataAsString = null;
385     }
386     
387     // Members
388     private char[] _data;
389     private String   _dataAsString;
390   }
391 
392 
393   /**
394    * Implementation of the {@link de.susebox.jtopas.spi.DataProvider} 
395    * interface for {@link CharSequenceTokenizerSource} sources.
396    */
397   private final class StringDataProvider extends AbstractDataProvider implements DataProvider {
398 
399     /**
400      * The constructor takes the nessecary parameters for the methods defined
401      * below
402      */
403     public StringDataProvider(CharSequenceTokenizerSource source, int startPosition, int length) {
404       super(startPosition, length);
405       setData(source);
406     }
407 
408 
409     //---------------------------------------------------------------------------
410     // methods of the DataProvider interface
411     //
412 
413     /**
414      * See {@link de.susebox.jtopas.spi.DataProvider#getCharAt} for details.
415      *
416      * @param   index   the index of the character starting from {@link #getStartPosition}
417      * @return the character at the given position
418      */
419     public char getCharAt(int index) {
420       return _source.charAt(_startPosition + index);
421     }
422 
423     /**
424      * See {@link de.susebox.jtopas.spi.DataProvider#getData} for details.
425      *
426      * @return the character buffer to read data from
427      */
428     public char[] getData() {
429       return _source.toString().toCharArray();
430     }
431 
432     /**
433      * See {@link de.susebox.jtopas.spi.DataProvider#getDataCopy} for details.
434      *
435      * @return  a copy of the valid data of this {@link DataProvider}
436      * @see #getData
437      * @see #toString
438      */
439     public char[] getDataCopy() {
440       return toString().toCharArray();
441     }
442 
443     /**
444      * Returning the valid data range of this <code>DataProvider</code> as a string.
445      * This method is an alternative to {@link #getDataCopy}.
446      *
447      * @return the string representation of the valid data range
448      */
449     public String   toString() {
450       return _source.subSequence(_startPosition, _startPosition + _length).toString();
451     }
452 
453     /**
454      * Setting the data source of this instance.
455      */
456     protected void setData(CharSequenceTokenizerSource source) {
457       _source = source;
458     }
459 
460     // Members
461     private CharSequenceTokenizerSource _source;
462   }
463 
464   
465   //---------------------------------------------------------------------------
466   // Class members
467   //
468   
469   /**
470    * Buffer sizes
471    */
472   private static final int SMALL_BUFFER_INITSIZE = 0x2000;    // 8K
473   private static final int LARGE_BUFFER_INITSIZE = 0x10000;   // 64K
474   
475   
476   //---------------------------------------------------------------------------
477   // Members
478   //
479   
480   /**
481    * This buffer holds the currently read data. Dont use a buffered reader, since
482    * we do buffering here.
483    */
484   protected char[] _inputBuffer = null;
485 
486   /**
487    * Mapping of index 0 of {@link #_inputBuffer} to the absolute start of the 
488    * input stream.
489    */
490   protected int _rangeStart = 0;
491 
492   /**
493    * Flag used in conjunction with the {@link #_charSequenceTokenizerSource}.
494    */
495   protected boolean _hasBeenRead = false;
496   
497   /**
498    * If a {@link CharSequenceTokenizerSource} is used, this member is set to
499    * it.
500    */
501   protected CharSequenceTokenizerSource _charSequenceTokenizerSource = null;
502   
503   /**
504    * The {@link de.susebox.jtopas.spi.DataProvider} instance for this object.
505    * This instance is kept due to a significant performance boost compared with
506    * construction of a <code>DataProvider</code> every time {@link #getDataProvider}
507    * is called.
508    */
509   protected AbstractDataProvider _dataProvider = null;
510 }
511
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags