Token


1   /*
2    * Token.java: Token for parsers etc.
3    *
4    * Copyright (C) 2002 Heiko Blau
5    *
6    * This file belongs to the Susebox Java Core Library (Susebox JCL).
7    * The Susebox JCL is free software; you can redistribute it and/or modify it 
8    * under the terms of the GNU Lesser General Public License as published by the 
9    * Free Software Foundation; either version 2.1 of the License, or (at your 
10   * option) any later version.
11   *
12   * This software is distributed in the hope that it will be useful, but WITHOUT
13   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
14   * FITNESS FOR A PARTICULAR PURPOSE. 
15   * See the GNU Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public License along
18   * with the Susebox JCL. If not, write to the
19   *
20   *   Free Software Foundation, Inc.
21   *   59 Temple Place, Suite 330, 
22   *   Boston, MA 02111-1307 
23   *   USA
24   *
25   * or check the Internet: http://www.fsf.org
26   *
27   * Contact:
28   *   email: heiko@susebox.de 
29   */
30  
31  package de.susebox.jtopas;
32  
33  //-----------------------------------------------------------------------------
34  // Class Token
35  //
36  
/**<p>
 * Instances of this class are created by the classes implementing the
 * {@link Tokenizer} interface. <code>Token</code> describes a portion of text
 * according to the settings given to the producing {@link Tokenizer} in form of
 * a {@link TokenizerProperties} object. Beside the token type, the token image
 * itself, its position in the input stream, line and column position and
 * associated information can be obtained from the <code>Token</code> (provided
 * the necessary parse flags are set in the tokenizer).
 *</p><p>
 * This class replaces the older {@link de.susebox.java.util.Token} which is
 * deprecated.
 *</p><p>
 * A <code>Token</code> is a plain mutable value holder: every property has a
 * getter and a setter and no synchronization is performed.
 *</p>
 *
 * @author  Heiko Blau
 * @see     Tokenizer
 * @see     TokenizerProperties
 */
public class Token {

  //---------------------------------------------------------------------------
  // constants (token types)
  //

  /**
   * The token is nothing special (no keyword, no whitespace, etc.).
   */
  public static final byte NORMAL = 0;

  /**
   * The token is a keyword registered with the used {@link Tokenizer}.
   */
  public static final byte KEYWORD = 1;

  /**
   * The token is one of the quoted strings known to the {@link Tokenizer}. In
   * Java this would be for instance a "String" or a 'c' (character).
   */
  public static final byte STRING = 2;

  /**
   * The token matches a pattern. This can be a number or identifier pattern
   * for instance.
   */
  public static final byte PATTERN = 3;

  /**
   * Special sequences are characters or character combinations that have a
   * certain meaning to the parsed language or dialect. In computer languages
   * we have for instance operators, end-of-statement characters etc.
   * A companion might have been associated with a special sequence. It
   * probably contains information important to the user of the
   * <code>Token</code>.
   */
  public static final byte SPECIAL_SEQUENCE = 4;

  /**
   * Separators are otherwise not remarkable characters. An opening parenthesis
   * might be necessary for a syntactically correct text, but without any
   * special meaning to the compiler, interpreter etc. after it has been
   * detected.
   */
  public static final byte SEPARATOR = 5;

  /**
   * Whitespaces are portions of the text that contain one or more characters
   * that separate the significant parts of the text. Generally, a sequence of
   * whitespaces is equally represented by one single whitespace character.
   * That is the difference to separators.
   */
  public static final byte WHITESPACE = 6;

  /**
   * Although a line comment is - in most cases - actually a whitespace
   * sequence, it is often necessary to handle it separately. Syntax
   * highlighting is a thing that needs to know a line comment.
   */
  public static final byte LINE_COMMENT = 7;

  /**
   * Block comments are also a special form of a whitespace sequence. See
   * {@link #LINE_COMMENT} for details.
   */
  public static final byte BLOCK_COMMENT = 8;

  /**
   * A token of the type <code>EOF</code> is used to indicate an end-of-file
   * condition on the input stream of the tokenizer.
   */
  public static final byte EOF = -1;

  /**
   * This is for the leftovers of the lexical analysis of a text.
   */
  public static final byte UNKNOWN = -2;


  //---------------------------------------------------------------------------
  // getter and setter methods
  //

  /**
   * Setting the type property of the <code>Token</code>. This is one of the
   * constants defined in this class. The value is stored as given, no range
   * check is performed.
   *
   * @param type the token type
   * @see   #getType
   */
  public void setType(int type) {
    _type = type;
  }

  /**
   * Obtaining the type of the <code>Token</code>. This is one of the constants
   * defined in the <code>Token</code> class.
   *
   * @return the token type
   * @see   #setType
   */
  public int getType() {
    return _type;
  }

  /**
   * Setting the token image. Note that some {@link Tokenizer} only fill
   * position and length information rather than setting the token image. This
   * strategy might have a tremendous influence on the parse performance and
   * the memory allocation.
   *<br>
   * As a side effect the length of the token is set to the length of the given
   * image, or to 0 if the image is <code>null</code>.
   *
   * @param image   the token image (<code>null</code> is accepted)
   * @see   #getImage
   * @see   #getLength
   */
  public void setImage(String image) {
    _image  = image;
    _length = (image == null) ? 0 : image.length();
  }

  /**
   * Obtaining the token image as a {@link java.lang.String}. The method
   * returns <code>null</code> when called on an end-of-file token or if the
   * {@link Tokenizer} producing this <code>Token</code> object is configured
   * to return only position information (see
   * {@link TokenizerProperties#F_TOKEN_POS_ONLY}).
   *
   * @return the token image as a {@link java.lang.String} (<code>null</code>
   *         is possible).
   * @see   #setImage
   */
  public String getImage() {
    return _image;
  }

  /**
   * Image parts are substrings of a token image. The operation returns a
   * meaningful result only, if the flag
   * {@link TokenizerProperties#F_RETURN_IMAGE_PARTS} is set for the
   * <code>TokenizerProperties</code>, the {@link Tokenizer} or the
   * {@link TokenizerProperty} that "produced" the token. If that flag is not
   * set the return value is a one-element array containing {@link #getImage}.
   *<br>
   * Number and contents of the image parts depend on the token type:
   *<ul><li>
   *    {@link #NORMAL}, {@link #KEYWORD}, {@link #SPECIAL_SEQUENCE},
   *    {@link #SEPARATOR}: These token have one image part that is identical to
   *    the image itself ({@link #getImage}).
   *</li><li>
   *    {@link #WHITESPACE}: Whitespaces have one image part for each substring
   *    on a single line without any line separators. For whitespace sequences
   *    without line separators there will be one part that is identical to the
   *    image itself ({@link #getImage}). More generally, whitespaces have
   *    <code>separatorCount + 1</code> image parts. For multi-line whitespaces
   *    some or all of these image parts can be empty.
   *</li><li>
   *    {@link #STRING}: One image part per line containing the characters
   *    between and excluding the string start and end sequences and/or the
   *    line separators, equivalent to the handling of whitespaces. The string
   *    escape sequences are resolved. For instance, the image part of the SQL
   *    string <code>'select ''hello'' from dual'</code> is
   *    <code>select 'hello' from dual</code>.
   *    Multiline strings may have empty image parts (if empty lines are
   *    included in the string). The string "line1\n" has two image parts:
   *    "line1" and the empty string (since the string ends on a new line). The
   *    string "\nline2" has also two image parts: the empty string and "line2"
   *    (since the string starts on one line and ends on the next).
   *</li><li>
   *    {@link #PATTERN}: a pattern has image parts according to the groups
   *    defined in the regular expression of the pattern. The
   *    {@link java.util.regex.Pattern} class speaks of "Capturing groups" that
   *    are expressions in parentheses.
   *    Image parts are especially important for pattern token, where the
   *    access to parts of the pattern is usually necessary. For instance, in
   *    Java Unicode characters can be written in form of
   *    <code>"\\u[0-9A-Fa-f]{4}"</code> pattern. For further processing the
   *    hexadecimal part must be accessed.
   *    By using the pattern <code>"\\u([0-9A-Fa-f]{4})"</code>, a token
   *    containing the unicode notation <code>"\\u00AC"</code> has the two
   *    image parts <code>"\\u00AC"</code> (capturing group 0) and
   *    <code>"00AC"</code> (capturing group 1).
   *</li><li>
   *    {@link #LINE_COMMENT}: Line comments have one image part that contains
   *    the substring after the line comment start sequence up to and excluding
   *    the line separator sequence.
   *</li><li>
   *    {@link #BLOCK_COMMENT}: Like whitespaces and string, block comments have
   *    one image part per line they are spanning. The first part is without
   *    the block comment start sequence, the last without the block comment
   *    end sequence. The line separator sequences are also not included in the
   *    parts.
   *</li><li>
   *    {@link #EOF}: The method returns an empty array.
   *</li></ul>
   * The return value is an array of strings rather than an
   * {@link java.util.Enumeration} or {@link java.util.Iterator}, since it can
   * be used more easily and contains only one element in a lot if not most
   * cases.
   *<br>
   * Implementation notes: if image parts have been set via
   * {@link #setImageParts}, that very array is returned (no copy is made).
   * Otherwise a new one-element array holding {@link #getImage} is created;
   * its single element is <code>null</code> if the token has no image.
   *
   * @return  an array of image parts according to the token type if the flag
   *          {@link TokenizerProperties#F_RETURN_IMAGE_PARTS} is set or
   *          containing the image itself otherwise ({@link #getImage}).
   * @see     #setImageParts
   */
  public String[] getImageParts() {
    if (_imageParts == null) {
      // no explicit parts available: fall back to the whole image
      return new String[] { getImage() };
    }
    return _imageParts;
  }

  /**
   * The counterpart to {@link #getImageParts}. It sets all image parts in one
   * operation. The method accepts <code>null</code> and empty arrays. The
   * given array is stored as is, it is not copied.
   *
   * @param imageParts  an array of image parts according to the token type or
   *                    <code>null</code>
   * @see   #getImageParts
   */
  public void setImageParts(String[] imageParts) {
    _imageParts = imageParts;
  }

  /**
   * Setting the length of the token. Some {@link Tokenizer} may prefer or may
   * be configured not to return a token image, but only the position and
   * length information. This may save a lot of time wherever only a subset of
   * the found tokens are actually needed by the user.
   *<br>
   * This method is an alternative to {@link #setEndPosition} depending on which
   * information is at hand or easier to obtain for the {@link Tokenizer}
   * producing this <code>Token</code>.
   *<br>
   * Note that the length is also set implicitly by {@link #setImage} and
   * {@link #setEndPosition}.
   *
   * @param length the length of the token
   * @see   #getLength
   * @see   #setEndPosition
   */
  public void setLength(int length) {
    _length = length;
  }

  /**
   * Obtaining the length of the token. Note that some token types have a zero
   * length (like EOF or UNKNOWN).
   *
   * @return the length of the token.
   * @see   #setLength
   * @see   #getEndPosition
   */
  public int getLength() {
    return _length;
  }

  /**
   * Some token may have associated information for the user of the
   * <code>Token</code>. A popular thing would be the association of an integer
   * constant to a special sequence or keyword to be used in fast
   * <code>switch</code> statements.
   *
   * @param companion the associated information for this token
   *                  (<code>null</code> is accepted)
   * @see   #getCompanion
   */
  public void setCompanion(Object companion) {
    _companion = companion;
  }

  /**
   * Obtaining the associated information of the token. Can be
   * <code>null</code>. See {@link #setCompanion} for details.
   *
   * @return the associated information of this token
   */
  public Object getCompanion() {
    return _companion;
  }

  /**
   * Setting the start position of the token relative to the start of the input
   * stream. For instance, the first character in a file has the start position
   * 0.
   *
   * @param startPosition the position where the token starts in the input
   *                      stream.
   * @see   #getStartPosition
   * @see   #setEndPosition
   */
  public void setStartPosition(int startPosition) {
    _startPosition = startPosition;
  }

  /**
   * Obtaining the starting position of the token. If not set or not of
   * interest, -1 is returned.
   *
   * @return  start position of the token.
   * @see     #setStartPosition
   * @see     #getEndPosition
   */
  public int getStartPosition() {
    return _startPosition;
  }

  /**
   * Setting the end position of the token relative to the start of the input
   * stream. For instance, the first character in a file has the start position
   * 0. The character at the given end position is <strong>NOT</strong> part of
   * this <code>Token</code>. This is the same principle as in the
   * {@link java.lang.String#substring(int, int)} method.
   *<br>
   * This method is an alternative to {@link #setLength} depending on which
   * information is at hand or easier to obtain for the {@link Tokenizer}
   * producing this <code>Token</code>.
   *<br>
   * Note that this method <strong>MUST</strong> be called after
   * {@link #setStartPosition} since the end position is not stored itself but
   * converted into the token length
   * (<code>endPosition - startPosition</code>). Its effect is in turn
   * eliminated by calls to {@link #setLength} and {@link #setImage}.
   *
   * @param endPosition   the position where the token ends in the input stream.
   * @see   #getEndPosition
   */
  public void setEndPosition(int endPosition) {
    setLength(endPosition - _startPosition);
  }

  /**
   * Obtaining the end position of this token, i.e. the position of the first
   * character that does <strong>NOT</strong> belong to the token. It is
   * computed as start position plus length and is therefore the exact inverse
   * of {@link #setEndPosition}.
   *<br>
   * Note that the return value of this method is only valid, if
   * {@link #setStartPosition} has been called and one of the methods
   * {@link #setImage}, {@link #setLength} or {@link #setEndPosition}.
   *
   * @return  end position of the token.
   * @see     #setEndPosition
   * @see     #setStartPosition
   * @see     #getStartPosition
   */
  public int getEndPosition() {
    // setEndPosition stores (end - start) as length, so end = start + length
    return getStartPosition() + getLength();
  }

  /**
   * In {@link Tokenizer}'s counting lines and columns, this method is used to
   * set the line number where the beginning of the <code>Token</code> was
   * found. Line numbers start with 0.
   *
   * @param lineno line number where the token begins
   * @see   #getStartLine
   */
  public void setStartLine(int lineno) {
    _startLine = lineno;
  }

  /**
   * Obtaining the line number where the <code>Token</code> starts. See also
   * {@link #setStartLine} for details.<br>
   * If a tokenizer doesn't count lines and columns, the returned value is -1.
   *
   * @return  the line number where the token starts or -1, if no line counting
   *          is performed
   * @see     #setStartLine
   */
  public int getStartLine() {
    return _startLine;
  }

  /**
   * In {@link Tokenizer}'s counting lines and columns, this method is used to
   * set the column number where the beginning of the <code>Token</code> was
   * found. Column numbers start with 0.
   *
   * @param colno column number where the token begins
   * @see   #getStartColumn
   */
  public void setStartColumn(int colno) {
    _startColumn = colno;
  }

  /**
   * Obtaining the column number of the <code>Token</code> start. See
   * {@link #setStartColumn} for details.<br>
   * If a tokenizer doesn't count lines and columns, the returned value is -1.
   *
   * @return  the column number where the token starts or -1, if no line
   *          counting is performed
   * @see     #setStartColumn
   */
  public int getStartColumn() {
    return _startColumn;
  }

  /**
   * In {@link Tokenizer}'s counting lines and columns, this method is used to
   * set the line number where the end of the <code>Token</code> was found.
   * See {@link #setStartLine} for more.<br>
   * The end line number is the one where the first character was found that
   * does <b><i>NOT</i></b> belong to the token. This approach is chosen in
   * accordance to the toIndex parameters in
   * {@link java.lang.String#substring(int, int)}.
   *
   * @param lineno line number where the token ends
   * @see   #getEndLine
   */
  public void setEndLine(int lineno) {
    _endLine = lineno;
  }

  /**
   * Obtaining the line number where the token ends. See {@link #setEndLine}
   * for more. If a tokenizer doesn't count lines and columns, the returned
   * value is -1.
   *
   * @return  line number where the token ends or -1, if no line counting is
   *          performed
   * @see     #setEndLine
   */
  public int getEndLine() {
    return _endLine;
  }

  /**
   * In {@link Tokenizer}'s counting lines and columns, this method is used to
   * set the column number where the end of the <code>Token</code> was
   * found.<br>
   * The end column number is the one of the first character that does
   * <b><i>NOT</i></b> belong to the token. This approach is chosen in
   * accordance to the toIndex parameters in
   * {@link java.lang.String#substring(int, int)}.
   *
   * @param colno column number where the token ends
   * @see   #getEndColumn
   */
  public void setEndColumn(int colno) {
    _endColumn = colno;
  }

  /**
   * Obtaining the column number where the <code>Token</code> ends. See
   * {@link #setEndColumn} for more.<br>
   * If a tokenizer doesn't count lines and columns, the returned value is -1.
   *
   * @return  column number where the token ends or -1, if no line counting is
   *          performed
   * @see     #setEndColumn
   */
  public int getEndColumn() {
    return _endColumn;
  }


  //---------------------------------------------------------------------------
  // construction
  //

  /**
   * Default constructor. Creates a token of the type {@link #UNKNOWN} without
   * image and companion.
   */
  public Token() {
    this(UNKNOWN, null, null);
  }

  /**
   * Constructs a token of a given type. Only the type of the token is known
   * but not its image or positions.
   *
   * @param type token type, one of the class constants.
   */
  public Token(int type) {
    this(type, null, null);
  }

  /**
   * Construct a token of a given type with the given image. No position
   * information is given.
   *
   * @param type  token type, one of the class constants.
   * @param image the token image itself
   */
  public Token(int type, String image) {
    this(type, image, null);
  }

  /**
   * Construct a token of a given type with the given image and a companion.
   * This constructor is most useful for keywords or special sequences.
   *<br>
   * The start position and all line and column members are initialized with
   * -1 ("not set"), the image parts with <code>null</code>. The length is
   * derived from the image (see {@link #setImage}).
   *
   * @param type      token type, one of the class constants.
   * @param image     the token image itself
   * @param companion an associated information of the token type
   */
  public Token(int type, String image, Object companion) {
    // Initialization goes through the public setters, so subclasses that
    // override one of them see these calls during construction.
    setType(type);
    setImage(image);
    setCompanion(companion);
    setStartPosition(-1);
    setStartLine(-1);
    setStartColumn(-1);
    setEndLine(-1);
    setEndColumn(-1);
    setImageParts(null);
  }


  //---------------------------------------------------------------------------
  // overridden methods
  //

  /**
   * Implementation of the well known method {@link java.lang.Object#equals}.
   * Two token are equal if they are of exactly the same class and if type,
   * start position, length, start and end line, start and end column,
   * companion and image are equal. That means that token retrieved by two
   * different {@link Tokenizer} instances can be equal.
   *<br>
   * The image parts (see {@link #getImageParts}) are not taken into account.
   *
   * @param   object  the {@link java.lang.Object} to compare
   * @return <code>true</code> if two token are equal, <code>false</code>
   *          otherwise
   * @see     #hashCode
   */
  public boolean equals(Object object) {
    // identical object
    if (object == this) {
      return true;
    }
    // null or incompatible class
    if (object == null || object.getClass() != getClass()) {
      return false;
    }

    // member-wise comparison, cheap primitive checks first
    Token other = (Token)object;

    return getType()          == other.getType()
        && getStartPosition() == other.getStartPosition()
        && getLength()        == other.getLength()
        && getStartLine()     == other.getStartLine()
        && getStartColumn()   == other.getStartColumn()
        && getEndLine()       == other.getEndLine()
        && getEndColumn()     == other.getEndColumn()
        && nullSafeEquals(getCompanion(), other.getCompanion())
        && nullSafeEquals(getImage(), other.getImage());
  }

  /**
   * Implementation of the well known method {@link java.lang.Object#hashCode}.
   * The hash code is computed from exactly the properties that are compared in
   * {@link #equals}, so equal token always have equal hash codes.
   *<br>
   * Since a <code>Token</code> is mutable, its hash code changes when one of
   * these properties is changed. Do not modify a token while it is used as a
   * key in a hash based collection.
   *
   * @return  the hash code of this token
   * @see     #equals
   */
  public int hashCode() {
    Object companion = getCompanion();
    String image     = getImage();
    int    hash      = getType();

    hash = 31 * hash + getStartPosition();
    hash = 31 * hash + getLength();
    hash = 31 * hash + getStartLine();
    hash = 31 * hash + getStartColumn();
    hash = 31 * hash + getEndLine();
    hash = 31 * hash + getEndColumn();
    hash = 31 * hash + ((companion == null) ? 0 : companion.hashCode());
    hash = 31 * hash + ((image == null) ? 0 : image.hashCode());
    return hash;
  }

  /**
   * Implementation of the well known method {@link java.lang.Object#toString}.
   * The result contains the type name and - except for {@link #EOF} token -
   * either the quoted image or, if there is no image, the token length.
   *
   * @return string representation of this object
   */
  public String toString() {
    StringBuffer buffer = new StringBuffer();

    // Type
    buffer.append("Type ");
    buffer.append(Token.getTypeName(getType()));

    // Image (an end-of-file token has none)
    if (getType() != EOF) {
      String image = getImage();

      buffer.append(":  ");
      if (image != null) {
        buffer.append('"');
        buffer.append(image);
        buffer.append('"');
      } else {
        buffer.append("no image, length ");
        buffer.append(getLength());
      }
    }
    return buffer.toString();
  }

  /**
   * Getting a type name for displaying. The method never fails even if the
   * given type is unknown: every value that is not one of the other type
   * constants is mapped to <code>"UNKNOWN"</code>.
   *
   * @param type  one of the Token type constants
   * @return a string representation of the given type constant
   */
  public static String getTypeName(int type) {
    switch (type) {
    case NORMAL:
      return "NORMAL";
    case KEYWORD:
      return "KEYWORD";
    case STRING:
      return "STRING";
    case PATTERN:
      return "PATTERN";
    case SPECIAL_SEQUENCE:
      return "SPECIAL_SEQUENCE";
    case SEPARATOR:
      return "SEPARATOR";
    case WHITESPACE:
      return "WHITESPACE";
    case LINE_COMMENT:
      return "LINE_COMMENT";
    case BLOCK_COMMENT:
      return "BLOCK_COMMENT";
    case EOF:
      return "EOF";
    default:
      return "UNKNOWN";
    }
  }


  //---------------------------------------------------------------------------
  // implementation
  //

  /**
   * Compares two references that may both be <code>null</code>.
   *
   * @param   first   the first object or <code>null</code>
   * @param   second  the second object or <code>null</code>
   * @return  <code>true</code> if both are <code>null</code> or if
   *          <code>first.equals(second)</code> holds
   */
  private static boolean nullSafeEquals(Object first, Object second) {
    return (first == null) ? (second == null) : first.equals(second);
  }


  //---------------------------------------------------------------------------
  // members
  //

  /**
   * The token type. Usually one of the constants {@link #NORMAL}, {@link #EOF}
   * etc.
   *
   * @see #getType
   * @see #setType
   */
  protected int _type;

  /**
   * The string representing the token. This member might not be present if a
   * {@link Tokenizer} is configured not to return token images.
   *
   * @see #getImage
   * @see #setImage
   */
  protected String _image;

  /**
   * The length of the string representing the token.
   *
   * @see #getLength
   * @see #setLength
   */
  protected int _length;

  /**
   * An information associated with the token. For instance, keywords can be
   * distinguished using different companions for each keyword.
   *
   * @see #getCompanion
   * @see #setCompanion
   * @see TokenizerProperties#addKeyword
   */
  protected Object _companion;

  /**
   * The absolute position where the token starts in the source of data.
   *
   * @see #getStartPosition
   * @see #setStartPosition
   */
  protected int _startPosition;

  /**
   * The line where the token starts in the source of data. This member may not
   * be set if a {@link Tokenizer} is configured not to return token line and
   * column (see {@link TokenizerProperties#F_COUNT_LINES}).
   *
   * @see #getStartLine
   * @see #setStartLine
   */
  protected int _startLine;

  /**
   * The column where the token starts in the source of data. This member may
   * not be set if a {@link Tokenizer} is configured not to return token line
   * and column (see {@link TokenizerProperties#F_COUNT_LINES}).
   *
   * @see #getStartColumn
   * @see #setStartColumn
   */
  protected int _startColumn;

  /**
   * The line where the token ends in the source of data. This member may not
   * be set if a {@link Tokenizer} is configured not to return token line and
   * column (see {@link TokenizerProperties#F_COUNT_LINES}).
   *
   * @see #getEndLine
   * @see #setEndLine
   */
  protected int _endLine;

  /**
   * The column where the token ends in the source of data. This member may not
   * be set if a {@link Tokenizer} is configured not to return token line and
   * column (see {@link TokenizerProperties#F_COUNT_LINES}).
   *
   * @see #getEndColumn
   * @see #setEndColumn
   */
  protected int _endColumn;

  /**
   * Array with the image parts. See {@link #getImageParts} for details.
   *
   * @see #getImageParts
   * @see #setImageParts
   */
  protected String[] _imageParts;
}
744
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags