KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > StandardTokenizer


1 /*
2  * StandardTokenizer.java: core class for lexical parser.
3  *
4  * Copyright (C) 2001 Heiko Blau
5  *
6  * This file belongs to the JTopas Library.
7  * JTopas is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with JTopas. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas;
32
33 //-----------------------------------------------------------------------------
34
// Imports
35
//
36
import de.susebox.java.lang.ExtIndexOutOfBoundsException;
37 import de.susebox.jtopas.spi.DataProvider;
38
39
40 //-----------------------------------------------------------------------------
41
// Class StandardTokenizer
42
//
43

44 /**<p>
45  * This is the mainstream {@link Tokenizer}. It implements the {@link Tokenizer}
46  * interface in a straightforward approach without too specialized parse
47  * optimizations.
48  * </p><p>
49  * Beside the {@link Tokenizer} interface, the class <code>StandardTokenizer</code>
50  * provides some basic features for cascading (nested) tokenizers. Consider the usual
51  * HTML pages found today in the WWW. Most of them are a mixture of regular HTML,
52  * cascading style sheets (CSS) and embedded JavaScript. These different languages
53  * use different syntaxes, so one needs various tokenizers on the same input stream.
54  *</p><p>
55  * This {@link Tokenizer} implementation is not synchronized. Take care when using
56  * with multiple threads.
57  *</p>
58  *
59  * @see Tokenizer
60  * @see TokenizerProperties
61  * @author Heiko Blau
62  */

63 public class StandardTokenizer
64   extends AbstractTokenizer
65   implements Tokenizer, TokenizerPropertyListener
66 {
67   //---------------------------------------------------------------------------
68
// Constructors
69
//
70

71   /**
72    * Default constructor that sets the tokenizer control flags as it would be
73    * approbriate for C/C++ and Java. Found token images are copied. No line nor
74    * column informations are provided. Nested comments are not allowed.
75    *<br>
76    * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES}
77    * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and
78    * separator handling.
79    */

80   public StandardTokenizer() {}
81   
82   /**
83    * Contructing a <code>StandardTokenizer</code> with a backing {@link TokenizerProperties}
84    * instance.
85    *
86    * @param properties an {@link TokenizerProperties} object containing the
87    * settings for the tokenizing process
88    */

89   public StandardTokenizer(TokenizerProperties properties) {
90     super.setTokenizerProperties(properties);
91   }
92
93   
94   //---------------------------------------------------------------------------
95
// Methods of the Tokenizer interface
96
//
97

98   /**
99    * This method returns the absolute offset in characters to the start of the
100    * parsed stream. See the method description in {@link Tokenizer}.
101    *
102    * @return the absolute offset of the current text window in characters from
103    * the start of the data source of the Tokenizer
104    * @see #getReadPosition
105    */

106   public int getRangeStart() {
107     return _rangeStart;
108   }
109   
110   /**
111    * Additionally to the common behaviour implemented in
112    * {@link #de.susebox.jtopas.AbstractTokenizer#setSource}, this method ajusts
113    * the state speicific to the <code>StandardTokenizer</code> class.
114    *
115    * @param source a {@link TokenizerSource} to read data from
116    */

117   public void setSource(TokenizerSource source) {
118     super.setSource(source);
119     _hasBeenRead = false;
120     _rangeStart = 0;
121     try {
122       _charSequenceTokenizerSource = (CharSequenceTokenizerSource)getSource();
123       _dataProvider = new StringDataProvider(_charSequenceTokenizerSource, 0, 0);
124     } catch (ClassCastException JavaDoc ex) {
125       _charSequenceTokenizerSource = null;
126       _dataProvider = new CharArrayDataProvider(_inputBuffer, 0, 0);
127     }
128   }
129
130   /**
131    * Closing this tokenizer frees resources.
132    */

133   public void close() {
134     _inputBuffer = null;
135     _rangeStart = 0;
136     _hasBeenRead = false;
137     _charSequenceTokenizerSource = null;
138     _dataProvider = null;
139     super.close();
140   }
141   
142   
143   //---------------------------------------------------------------------------
144
// Implementation
145
//
146

147   /**
148    * Implements the abstract method of the base class.
149    *
150    * @param startPos position in the input data
151    * @param length number of characters
152    */

153   protected DataProvider getDataProvider(int startPos, int length) {
154     _dataProvider.setDataRange(startPos - getRangeStart(), length);
155     return _dataProvider;
156   }
157
158   /**
159    * This method organizes the input buffer. It moves the current text window if
160    * nessecary or allocates more space, if data should be kept completely (see the
161    * {@link TokenizerProperties#F_KEEP_DATA} flag).
162    * Its main purpose is to call the {@link TokenizerSource#read} method.
163    *
164    * @return number of read bytes or -1 if an end-of-file condition occured
165    * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
166    * method
167    */

168   protected int readMoreData() throws TokenizerException {
169     if (_charSequenceTokenizerSource != null) {
170       // new CharSequenceTokenizerSource
171
if (_hasBeenRead || _charSequenceTokenizerSource.length() <= 0) {
172         return -1;
173       } else {
174         _hasBeenRead = true;
175         return _charSequenceTokenizerSource.length();
176       }
177     
178     } else {
179       // no input buffer so far
180
if (_inputBuffer == null) {
181         if (isFlagSet(Flags.F_KEEP_DATA)) {
182           _inputBuffer = new char[LARGE_BUFFER_INITSIZE]; // 64k
183
} else {
184           _inputBuffer = new char[SMALL_BUFFER_INITSIZE]; // 8k
185
}
186         ((CharArrayDataProvider)_dataProvider).setData(_inputBuffer);
187       }
188
189       // this is a good moment to move already read data if the write position is
190
// near the end of the buffer and there is a certain space before the current
191
// read position
192
int readPos = getReadPosition() - getRangeStart();
193       int writePos = currentlyAvailable();
194
195       if ( ! isFlagSet(Flags.F_KEEP_DATA)) {
196         if ((readPos > _inputBuffer.length / 4) && (writePos > (3 * _inputBuffer.length) / 4)) {
197           reorganizeInputBuffer(_inputBuffer);
198           writePos = currentlyAvailable();
199         }
200       }
201
202       // if there is no space any more and data couldn't be moved (see above)
203
// we need a new input buffer
204
if (writePos >= _inputBuffer.length) {
205         _inputBuffer = reorganizeInputBuffer(new char[_inputBuffer.length * 2]);
206         writePos = currentlyAvailable();
207         ((CharArrayDataProvider)_dataProvider).setData(_inputBuffer);
208       }
209
210       // read data
211
int chars = 0;
212
213       while (chars == 0) {
214         try {
215           chars = getSource().read(_inputBuffer, writePos, _inputBuffer.length - writePos);
216         } catch (Exception JavaDoc ex) {
217           throw new TokenizerException(ex);
218         }
219       }
220       return chars;
221     }
222   }
223   
224   /**
225    * Move data in the input buffer and adjust various position values.
226    */

227   private char[] reorganizeInputBuffer(char[] newBuffer) {
228     int readPos = getReadPosition() - getRangeStart();
229     int writePos = currentlyAvailable();
230
231     if ( ! isFlagSet(Flags.F_KEEP_DATA)) {
232       System.arraycopy(_inputBuffer, readPos, newBuffer, 0, writePos - readPos);
233       _rangeStart += readPos;
234     } else {
235       System.arraycopy(_inputBuffer, 0, newBuffer, 0, writePos);
236     }
237     return newBuffer;
238   }
239
240   
241   //---------------------------------------------------------------------------
242
// Inner classes
243
//
244

245   /**
246    * Base class for the various implementations of the
247    * {@link de.susebox.jtopas.spi.DataProvider} interface for the {@link StandardTokenizer}.
248    */

249   private abstract class AbstractDataProvider implements DataProvider {
250
251     /**
252      * The constructor takes the nessecary parameters for the methods defined
253      * below
254      *
255      * @param startPosition valid data start here
256      * @param length count of characters starting at startPosition
257      */

258     public AbstractDataProvider(int startPosition, int length) {
259       setDataRange(startPosition, length);
260     }
261
262     /**
263      * Retrieving the position where the data to analyze start in the buffer provided
264      * by {@link #getData}. The calling {@link de.susebox.jtopas.spi.DataMapper}
265      * must not access data prior to this index in the character array.
266      *
267      * @return index in the character array returned by {@link #getData}, where data starts
268      */

269     public int getStartPosition() {
270       return _startPosition;
271     }
272
273     /**
274      * Retrieving the maximum number of characters in the array provided by {@link getData}
275      * that can be analyzed by the calling {@link de.susebox.jtopas.spi.DataMapper}.
276      *
277      * @param testChar check this character
278      * @return <code>true</code> if the given character is a separator,
279      * <code>false</code> otherwise
280      */

281     public int getLength() {
282       return _length;
283     }
284
285     /**
286      * Setting the start position and the length in the data buffer of this
287      * instance.
288      *
289      * @param startPosition valid data start here
290      * @param length count of characters starting at startPosition
291      */

292     protected void setDataRange(int startPosition, int length) {
293       _startPosition = startPosition;
294       _length = length;
295     }
296
297     // Members
298
protected int _startPosition;
299     protected int _length;
300   }
301
302   /**
303    * Implementation of the {@link de.susebox.jtopas.spi.DataProvider} interface
304    * for the {@link StandardTokenizer}.
305    */

306   private final class CharArrayDataProvider extends AbstractDataProvider implements DataProvider {
307
308     /**
309      * The constructor takes the nessecary parameters for the methods defined
310      * below
311      */

312     public CharArrayDataProvider(char[] data, int startPosition, int length) {
313       super(startPosition, length);
314       setData(data);
315       _dataAsString = null;
316     }
317
318     /**
319      * See {@link de.susebox.jtopas.spi.DataProvider#getCharAt} for details.
320      *
321      * @param index an index between 0 and {@link #getLength}
322      * @return the character at the given position
323      */

324     public char getCharAt(int index) {
325       return _data[_startPosition + index];
326     }
327
328     /**
329      * See {@link de.susebox.jtopas.spi.DataProvider#getData} for details.
330      *
331      * @return the character buffer to read data from
332      */

333     public char[] getData() {
334       return _data;
335     }
336
337     /**
338      * See {@link de.susebox.jtopas.spi.DataProvider#getDataCopy} for details.
339      *
340      * @return a copy of the valid data of this {@link DataProvider}
341      * @see #getData
342      * @see #toString
343      */

344     public char[] getDataCopy() {
345       char[] copy = new char[getLength()];
346
347       System.arraycopy(_data, getStartPosition(), copy, 0, copy.length);
348       return copy;
349     }
350
351     /**
352      * Returning the valid data range of this <code>DataProvider</code> as a string.
353      * This method is an alternative to {@link #getDataCopy}.
354      *
355      * @return the string representation of the valid data range
356      */

357     public String JavaDoc toString() {
358       if (_dataAsString == null) {
359         if (_data != null) {
360           _dataAsString = new String JavaDoc(_data, _startPosition, _length);
361         } else {
362           _dataAsString = "";
363         }
364       }
365       return _dataAsString;
366     }
367
368     /**
369      * Setting the data buffer of this instance.
370      */

371     protected void setData(char[] data) {
372       _data = data;
373     }
374
375     /**
376      * Setting the start position and the length in the data buffer of this
377      * instance.
378      *
379      * @param startPosition valid data start here
380      * @param length count of characters starting at startPosition
381      */

382     protected void setDataRange(int startPosition, int length) {
383       super.setDataRange(startPosition, length);
384       _dataAsString = null;
385     }
386     
387     // Members
388
private char[] _data;
389     private String JavaDoc _dataAsString;
390   }
391
392
393   /**
394    * Implementation of the {@link de.susebox.jtopas.spi.DataProvider}
395    * interface for {@link CharSequenceTokenizerSource} sources.
396    */

397   private final class StringDataProvider extends AbstractDataProvider implements DataProvider {
398
399     /**
400      * The constructor takes the nessecary parameters for the methods defined
401      * below
402      */

403     public StringDataProvider(CharSequenceTokenizerSource source, int startPosition, int length) {
404       super(startPosition, length);
405       setData(source);
406     }
407
408
409     //---------------------------------------------------------------------------
410
// methods of the DataProvider interface
411
//
412

413     /**
414      * See {@link de.susebox.jtopas.spi.DataProvider#getCharAt} for details.
415      *
416      * @param index the index of the character starting from {@link #getStartPosition}
417      * @return the character at the given position
418      */

419     public char getCharAt(int index) {
420       return _source.charAt(_startPosition + index);
421     }
422
423     /**
424      * See {@link de.susebox.jtopas.spi.DataProvider#getData} for details.
425      *
426      * @return the character buffer to read data from
427      */

428     public char[] getData() {
429       return _source.toString().toCharArray();
430     }
431
432     /**
433      * See {@link de.susebox.jtopas.spi.DataProvider#getDataCopy} for details.
434      *
435      * @return a copy of the valid data of this {@link DataProvider}
436      * @see #getData
437      * @see #toString
438      */

439     public char[] getDataCopy() {
440       return toString().toCharArray();
441     }
442
443     /**
444      * Returning the valid data range of this <code>DataProvider</code> as a string.
445      * This method is an alternative to {@link #getDataCopy}.
446      *
447      * @return the string representation of the valid data range
448      */

449     public String JavaDoc toString() {
450       return _source.subSequence(_startPosition, _startPosition + _length).toString();
451     }
452
453     /**
454      * Setting the data source of this instance.
455      */

456     protected void setData(CharSequenceTokenizerSource source) {
457       _source = source;
458     }
459
460     // Members
461
private CharSequenceTokenizerSource _source;
462   }
463
464   
465   //---------------------------------------------------------------------------
466
// Class members
467
//
468

469   /**
470    * Buffer sizes
471    */

472   private static final int SMALL_BUFFER_INITSIZE = 0x2000; // 8K
473
private static final int LARGE_BUFFER_INITSIZE = 0x10000; // 64K
474

475   
476   //---------------------------------------------------------------------------
477
// Members
478
//
479

480   /**
481    * This buffer holds the currently read data. Don't use a buffered reader, since
482    * we do buffering here.
483    */

484   protected char[] _inputBuffer = null;
485
486   /**
487    * Mapping of index 0 of {@link #_inputBuffer} to the absolute start of the
488    * input stream.
489    */

490   protected int _rangeStart = 0;
491
492   /**
493    * Flag used in conjunction with the {@link #_charSequenceTokenizerSource}.
494    */

495   protected boolean _hasBeenRead = false;
496   
497   /**
498    * If a {@link CharSequenceTokenizerSource} is used, this member is set to
499    * it.
500    */

501   protected CharSequenceTokenizerSource _charSequenceTokenizerSource = null;
502   
503   /**
504    * The {@link de.susebox.jtopas.spi.DataProvider} instance for this object.
505    * This instance is kept due to a significant performance boost compared with
506    * construction of a <code>DataProvider</code> every time {@link #getDataProvider}
507    * is called.
508    */

509   protected AbstractDataProvider _dataProvider = null;
510 }
511
Popular Tags