KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > Token


1 /*
2  * Token.java: Token for parsers etc.
3  *
4  * Copyright (C) 2002 Heiko Blau
5  *
6  * This file belongs to the Susebox Java Core Library (Susebox JCL).
7  * The Susebox JCL is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with the Susebox JCL. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas;
32
33 //-----------------------------------------------------------------------------
34
// Class Token
35
//
36

37 /**<p>
38  * Instances of this class are created by the classes implementing the
39  * {@link Tokenizer} interface. <code>Token</code> describes a portion of text
40  * according to the settings given to the producing {@link Tokenizer} in form of
41  * a {@link TokenizerProperties} object. Beside the token type the token image
42  * itself, its position in the input stream, line and column position and associated
43  * informations can be obtained from the <code>Token</code> (provided, the nessecary
44  * parse flags are set in the tokenizer).
45  *</p><p>
46  * This class replaces the older {@link de.susebox.java.util.Token} which is
47  * deprecated.
48  *</p>
49  *
50  * @author Heiko Blau
51  * @see Tokenizer
52  * @see TokenizerProperties
53  */

54 public class Token {
55   
56   //---------------------------------------------------------------------------
57
// constants (token types)
58
//
59

60   /**
61    * The token is nothing special (no keyword, no whitespace, etc.).
62    */

63   public static final byte NORMAL = 0;
64
65   /**
66    * The token is a keyword registered with the used {@link Tokenizer}.
67    */

68   public static final byte KEYWORD = 1;
69
70   /**
71    * The token is one of the quoted strings known to the {@link Tokenizer}. In Java
72    * this would be for instance a "String" or a 'c' (haracter).
73    */

74   public static final byte STRING = 2;
75   
76   /**
77    * The token matches a pattern. This can be a number od identifier pattern for
78    * instance.
79    */

80   public static final byte PATTERN = 3;
81
82   /**
83    * Special sequences are characters or character combinations that have a certain
84    * meaning to the parsed language or dialect. In computer languages we have for
85    * instance operators, end-of-statement characters etc.
86    * A companion might have been associated with a special sequence. It probably
87    * contains information important to the user of the <code>Token</code>.
88    */

89   public static final byte SPECIAL_SEQUENCE = 4;
90   
91   /**
92    * Separators are otherwise not remarkable characters. An opening parenthesis
93    * might be nessecary for a syntactically correct text, but without any special
94    * meaning to the compiler, interpreter etc. after it has been detected.
95    */

96   public static final byte SEPARATOR = 5;
97   
98   /**
99    * Whitespaces are portions of the text, that contain one or more characters
100    * that separate the significant parts of the text. Generally, a sequence of
101    * whitespaces is equally represented by one single whitespace character. That
102    * is the difference to separators.
103    */

104   public static final byte WHITESPACE = 6;
105
106   /**
107    * Although a line comment is - in most cases - actually a whitespace sequence, it
108    * is often nessecary to handle it separately. Syntax hilighting is a thing that
109    * needs to know a line comment.
110    */

111   public static final byte LINE_COMMENT = 7;
112
113   /**
114    * Block comments are also a special form of a whitespace sequence. See
115    * {@link #LINE_COMMENT} for details.
116    */

117   public static final byte BLOCK_COMMENT = 8;
118
119   /**
120    * A token of the type <code>EOF</code> is used to indicate an end-of-line condition
121    * on the input stream of the tokenizer.
122    */

123   public static final byte EOF = -1;
124   
125   /**
126    * This is for the leftovers of the lexical analysis of a text.
127    */

128   public static final byte UNKNOWN = -2;
129     
130
131   //---------------------------------------------------------------------------
132
// Getter- und Setter-Methoden
133
//
134

135   /**
136    * Setting the type property of the <code>Token</code>. This is one of the constants
137    * defined in this class.
138    *
139    * @param type the token type
140    * @see #getType
141    */

142   public void setType(int type) {
143     _type = type;
144   }
145     
146   /**
147    * Obtaining the type of the <code>Token</code>. This is one of the constants
148    * defined in the <code>Token</code> class.
149    *
150    * @return the token type
151    * @see #setType
152    */

153   public int getType() {
154     return _type;
155   }
156     
157   /**
158    * Setting the token image. Note that some {@link Tokenizer} only fill position
159    * and length information rather than setting the token image. This strategy
160    * might have a tremendous influence on the parse performance and the memory
161    * allocation.
162    *
163    * @param image the token image
164    * @see #getImage
165    */

166   public void setImage(String JavaDoc image) {
167     if ((_image = image) == null) {
168       _length = 0;
169     } else {
170       _length = _image.length();
171     }
172   }
173     
174   /**
175    * Obtaining the token image as a {@link java.lang.String}. Th method returns
176    * <code>null</code> when called on an end-of-file token or if the {@link Tokenizer}
177    * producing this <code>Token</code> object, is configured to return only
178    * position informations (see {@link TokenizerProperties#F_TOKEN_POS_ONLY}).
179    *
180    * @return the token image as a {@link java.lang.String} (<code>null</code> is possible).
181    * @see #setImage
182    */

183   public String JavaDoc getImage() {
184     return _image;
185   }
186   
187   /**
188    * Image parts are substrings of a token image. The operation returns a meaningful
189    * result only, if the flag {@link TokenizerProperties#F_RETURN_IMAGE_PARTS} is
190    * set for the <code>TokenizerProperties</code>, the {@link Tokenizer} or the
191    * {@link TokenizerProperty} that "produced" the token. If that flag is not set
192    * the return value is identical to {@link #getImage}.
193    *<br>
194    * Number and contents of the image parts depend on the token type:
195    *<ul><li>
196    * {@link #NORMAL}, {@link #KEYWORD}, {@link #SPECIAL_SEQUENCE},
197    * {@link #SEPARATOR}: These token have one image part that is identical to
198    * the image itself ({@link #getImage}).
199    *</li><li>
200    * {@link #WHITESPACE}: Whitespaces have one image part for each substring
201    * on a single line without any line separators. For whitespace sequences
202    * without line separators there will be one part that is identical to the
203    * image itself ({@link #getImage}). More generally, whitespaces have
204    * <code>separatorCount + 1</code> image parts. For multi-line whitespaces
205    * some or all of these image parts can be empty.
206    *</li><li>
207    * {@link #STRING}: One image part per line containing the characters between
208    * and excluding the string start and end sequences and/or the line
209    * separators, equivalent to the handling of whitespaces. The string escape
210    * sequences are resolved. For instance, the image part of the SQL string
211    * <code>'select ''hello'' from dual'</code> is <code>select 'hello' from dual</code>.
212    * Multiline strings may have empty image parts (if emtpy lines are included
213    * in the string). The string "line1\n" has two image parts: "line1" and the
214    * empty string (since the string ends on a new line). The string "\nline2"
215    * has also two image parts: the empty string and "line2" (since the string
216    * starts on one line and ends on the next).
217    *</li><li>
218    * {@link #PATTERN}: a pattern has image parts according to the groups defined
219    * in the regular expression of the pattern. The {@link java.util.regex.Pattern}
220    * class speaks of "Capturing groups" that are expressions in parentheses.
221    * Image parts are especially important for pattern token, where the access
222    * to parts of the pattern is usually nessecary. For instance, in Java Unicode
223    * characters can be written in form of <code>"\\u[0-9A-Fa-f]{4}"</code>
224    * pattern. For further processing the hexadecimal part must be accessed.
225    * By using the pattern <code>"\\u([0-9A-Fa-f]{4})"</code>, a token containing
226    * the unicode notation <code>"\\u00AC"</code> has the two image parts
227    * <code>"\\u00AC"</code> (capturing group 0) and <code>"00AC"</code>
228    * (capturing group 1).
229    *</li><li>
230    * {@link #LINE_COMMENT}: Line comments have one image part that contains
231    * the substring after the line comment start sequence up to and excluding
232    * the line separator sequence.
233    *</li><li>
234    * {@link #BLOCK_COMMENT}: Like whitespaces and string, block comments have
235    * one image part per line they are spanning. The first part is without the
236    * block comment start sequence, the last without the block comment end
237    * sequence. The line separator sequences are also not included in the parts.
238    *</li><li>
239    * {@link #EOF}: The method returns an empty array.
240    *</li></ul>
241    * The return value is an array of strings rather than an {@link java.util.Enumeration}
242    * or {@link java.util.Iterator}, since it can be used more easily and contains
243    * only one element in a lot if not most cases.
244    *
245    * @return an array of image parts according to the token type if the flag
246    * {@link TokenizerProperties#F_RETURN_IMAGE_PARTS} is set or containing
247    * the image itself otherwise ({@link #getImage}).
248    */

249   public String JavaDoc[] getImageParts() {
250     if (_imageParts != null) {
251       return _imageParts;
252     } else {
253       return new String JavaDoc[] { getImage() };
254     }
255   }
256   
257   /**
258    * The counterpart to {@link #getImageParts}. It sets all image parts in one
259    * operation. The method accepts <code>null</code> and empty arrays.
260    *
261    * @param imageParts an array of image parts according to the token type or
262    * <code>null</code>
263    */

264   public void setImageParts(String JavaDoc[] imageParts) {
265     _imageParts = imageParts;
266   }
267     
268   /**
269    * Setting the length of the token. Some {@link Tokenizer} may prefer or may be
270    * configured not to return a token image, but only the position and length
271    * informations. This may save a lot of time whereever only a subset of the found
272    * tokens are actually needed by the user.
273    *<br>
274    * This method is an alternative to {@link #setEndPosition} depending on which
275    * information is at hand or easier to obtain for the {@link Tokenizer} producing
276    * this <code>Token</code>.
277    *<br>
278    * Note that this method is implicitely called by {@link #setImage} and
279    * {@link #setEndPosition}.
280    *
281    * @param length the length of the token
282    * @see #getLength
283    * @see #setEndPosition
284    */

285   public void setLength(int length) {
286     _length = length;
287   }
288     
289   /**
290    * Obtaining the length of the token. Note that some token types have a zero length
291    * (like EOF or UNKNOWN).
292    *
293    * @return the length of the token.
294    * @see #setLength
295    * @see #getEndPosition
296    */

297   public int getLength() {
298     return _length;
299   }
300     
301   /**
302    * Some token may have associated informations for the user of the <code>Token</code>.
303    * A popular thing would be the association of an integer constant to a special
304    * sequence or keyword to be used in fast <code>switch</code> statetents.
305    *
306    * @param companion the associated information for this token
307    */

308   public void setCompanion(Object JavaDoc companion) {
309     _companion = companion;
310   }
311     
312   /**
313    * Obtaining the associated information of the token. Can be <code>null</code>. See
314    * {@link #setCompanion} for details.
315    *
316    * @return the associated information of this token
317    */

318   public Object JavaDoc getCompanion() {
319     return _companion;
320   }
321   
322   /**
323    * Setting the start position of the token relative to the start of the input
324    * stream. For instance, the first character in a file has the start position
325    * 0.
326    *
327    * @param startPosition the position where the token starts in the input stream.
328    * @see #getStartPosition
329    * @see #setEndPosition
330    */

331   public void setStartPosition(int startPosition) {
332     _startPosition = startPosition;
333   }
334     
335   /**
336    * Obtaining the starting position of the token. If not set or not of interest,
337    * -1 is returned.
338    *
339    * @return start position of the token.
340    * @see #setStartPosition
341    * @see #getEndPosition
342    */

343   public int getStartPosition() {
344     return _startPosition;
345   }
346     
347   /**
348    * Setting the end position of the token relative to the start of the input
349    * stream. For instance, the first character in a file has the start position
350    * 0. The character at the given end position is <strong>NOT</code> part of
351    * this <code>Token</code>. This is the same principle as in the
352    * {@link java.lang.String#substring(int, int)} method.
353    *<br>
354    * This method is an alternative to {@link #setLength} depending on which
355    * information is at hand or easier to obtain for the {@link Tokenizer} producing
356    * this <code>Token</code>.
357    *<br>
358    * Note that this method <strong>MUST</strong> be called after {@link #setStartPosition}
359    * since it affects the length of the token. Its effect is in turn eliminated
360    * by calls to {@link #setLength} and {@link #setImage}
361    *
362    * @param endPosition the position where the token ends in the input stream.
363    */

364   public void setEndPosition(int endPosition) {
365     setLength(endPosition - _startPosition);
366   }
367     
368   /**
369    * Obtaining the end position of this token. Note that the return value of this
370    * method is only valid, if {@link #setStartPosition} has been called and one
371    * of the methods {@link #setImage}, {@link #setLength} or {@link #setEndPosition}.
372    *
373    * @return end position of the token.
374    * @see #setEndPosition
375    * @see #setStartPosition
376    * @see #getStartPosition
377    */

378   public int getEndPosition() {
379     return getLength() - getStartPosition();
380   }
381     
382   /**
383    * In {@link Tokenizer}'s counting lines and columns, this method is used to
384    * set the line number where the beginning of the <code>Token</code> was found.
385    * Line numbers start with 0.
386    *
387    * @param lineno line number where the token begins
388    * @see #getStartLine
389    */

390   public void setStartLine(int lineno) {
391     _startLine = lineno;
392   }
393     
394   /**
395    * Obtaining the line number where the <code>Token</code> starts. See also
396    * {@link #setStartLine} for details.<br>
397    * If a tokenizer doesn't count lines and columns, the returned value is -1.
398    *
399    * @return the line number where the token starts or -1, if no line counting is
400    * performed
401    * @see #setStartLine
402    */

403   public int getStartLine() {
404     return _startLine;
405   }
406     
407   /**
408    * In {@link Tokenizer}'s counting lines and columns, this method is used to
409    * set the column number where the beginning of the <code>Token</code> was
410    * found. Column numbers start with 0.
411    *
412    * @param colno number where the token begins
413    * @see #getStartColumn
414    */

415   public void setStartColumn(int colno) {
416     _startColumn = colno;
417   }
418     
419   /**
420    * Obtaining the column number of the <code>Token</code> start. See {@link #setStartColumn}
421    * for details.<br>
422    * If a tokenizer doesn't count lines and columns, the returned value is -1.
423    *
424    * @return the column number where the token starts or -1, if no line counting
425    * is performed
426    * @see #setStartColumn
427    */

428   public int getStartColumn() {
429     return _startColumn;
430   }
431     
432   /**
433    * In {@link Tokenizer}'s counting lines and columns, this method is used to
434    * set the line number where the end of the <code>Token</code> was found.
435    * See {@link #setStartLine} for more.<br>
436    * The end line number is the one there the first character was found that does
437    * <b><i>NOT</i></b> belongs to the token. This approach is choosen in accordance
438    * to the toIndex parameters in {@link java.lang.String#substring(int, int)}.
439    *
440    * @param lineno line number where the token ends
441    */

442   public void setEndLine(int lineno) {
443     _endLine = lineno;
444   }
445     
446   /**
447    * Obtaining the line number where the token ends. See {@link #setEndLine} for
448    * more. If a tokenizer doesn't count lines and columns, the returned value is
449    * -1.
450    *
451    * @return line number where the token ends or -1, if no line counting is
452    * performed
453    * @see #setEndLine
454    */

455   public int getEndLine() {
456     return _endLine;
457   }
458     
459   /**
460    * In {@link Tokenizer}'s counting lines and columns, this method is used to set the
461    * column number where the end of the <code>Token</code> was found.<br>
462    * The end column number is the one of the first character that does
463    * <b><i>NOT</i></b> belongs to the token. This approach is choosen in accordance
464    * to the toIndex parameters in {@link java.lang.String#substring(int, int)}.
465    *
466    * @param colno column number where the token ends
467    */

468   public void setEndColumn(int colno) {
469     _endColumn = colno;
470   }
471     
472   /**
473    * Obtaining the column number where the <code>Token</code> ends. See {@link #setEndColumn}
474    * for more.<br>
475    * If a tokenizer doesn't count lines and columns, the returned value is -1.
476    *
477    * @return column number where the token ends or -1, if no line counting is
478    * performed
479    * @see #setEndColumn
480    */

481   public int getEndColumn() {
482     return _endColumn;
483   }
484     
485  
486   //---------------------------------------------------------------------------
487
// construction
488
//
489

490   /**
491    * Default constructor.
492    */

493   public Token() {
494     this(UNKNOWN, null, null);
495   }
496   
497   /**
498    * Constructs a token of a given type. Only the type of the token is known but not
499    * its image or positions.
500    *
501    * @param type token type, one of the class constants.
502    */

503   public Token(int type) {
504     this(type, null, null);
505   }
506   
507   /**
508    * Construct a token of a given type with the given image. No position information
509    * is given.
510    *
511    * @param type token type, one of the class constants.
512    * @param image the token image itself
513    */

514   public Token(int type, String JavaDoc image) {
515     this(type, image, null);
516   }
517   
518   /**
519    * Construct a token of a given type with the given image and a companion. This
520    * constructor is most useful for keywords or special sequences.
521    *
522    * @param type token type, one of the class constants.
523    * @param image the token image itself
524    * @param companion an associated information of the token type
525    */

526   public Token(int type, String JavaDoc image, Object JavaDoc companion) {
527     setType(type);
528     setImage(image);
529     setCompanion(companion);
530     setStartPosition(-1);
531     setStartLine(-1);
532     setStartColumn(-1);
533     setEndLine(-1);
534     setEndColumn(-1);
535     setImageParts(null);
536   }
537   
538
539   //---------------------------------------------------------------------------
540
// overloaded methods
541
//
542

543   /**
544    * Implementation of the well known method {@link java.lang.Object#equals}.
545    * Note that two token are equal if every member of it is equal. That means
546    * that token retrieved by two different {@link Tokenizer} instances can be
547    * equal.
548    *
549    * @param object the {@link java.lang.Object} to compare
550    * @return <code>true</code> if two token are equal, <code>false</code>
551    * otherwise
552    */

553   public boolean equals(Object JavaDoc object) {
554     // Test on intentical objects and incompatible classes
555
if (object == null) {
556       return false;
557     } else if (object == this) {
558       return true;
559     } else if (object.getClass() != getClass()) {
560       return false;
561     }
562     
563     // real check
564
Token other = (Token)object;
565       
566     if (getType() != other.getType()) {
567       return false;
568     } else if (getStartPosition() != other.getStartPosition()) {
569       return false;
570     } else if (getLength() != other.getLength()) {
571       return false;
572     } else if (getStartLine() != other.getStartLine()) {
573       return false;
574     } else if (getStartColumn() != other.getStartColumn()) {
575       return false;
576     } else if (getEndLine() != other.getEndLine()) {
577       return false;
578     } else if (getEndColumn() != other.getEndColumn()) {
579       return false;
580     } else if ( (getCompanion() == null && other.getCompanion() != null)
581                || (getCompanion() != null && ! getCompanion().equals(other.getCompanion()))) {
582       return false;
583     } else if ( (getImage() == null && other.getImage() != null)
584                || (getImage() != null && ! getImage().equals(other.getImage()))) {
585       return false;
586     }
587     return true;
588   }
589   
590   /**
591    * Implementation of the well known method {@link java.lang.Object#toString}.
592    *
593    * @return string representation of this object
594    */

595   public String JavaDoc toString() {
596     StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
597     
598     // Type
599
buffer.append("Type ");
600     buffer.append(Token.getTypeName(getType()));
601     
602     // Image
603
if (getType() != EOF) {
604       buffer.append(": ");
605       if (getImage() != null) {
606         buffer.append('"');
607         buffer.append(getImage());
608         buffer.append('"');
609       } else {
610         buffer.append("no image, length ");
611         buffer.append(getLength());
612       }
613     }
614     return buffer.toString();
615   }
616
617   /**
618    * Getting a type name for displaying. The methode never fails even if the
619    * given type is unknown.
620    *
621    * @param type one of the Token type constants
622    * @return a string representation of the given type constant
623    */

624   public static String JavaDoc getTypeName(int type) {
625     switch (type) {
626     case NORMAL:
627       return "NORMAL";
628     case KEYWORD:
629       return "KEYWORD";
630     case STRING:
631       return "STRING";
632     case PATTERN:
633       return "PATTERN";
634     case SPECIAL_SEQUENCE:
635       return "SPECIAL_SEQUENCE";
636     case SEPARATOR:
637       return "SEPARATOR";
638     case WHITESPACE:
639       return "WHITESPACE";
640     case LINE_COMMENT:
641       return "LINE_COMMENT";
642     case BLOCK_COMMENT:
643       return "BLOCK_COMMENT";
644     case EOF:
645       return "EOF";
646     default:
647       return "UNKNOWN";
648     }
649   }
650   
651   
652   //---------------------------------------------------------------------------
653
// members
654
//
655

656   /**
657    * The token type. Usually one of the constants {@link #NORMAL}, {@link #EOF} etc.
658    *
659    * @see #getType
660    * @see #setType
661    */

662   protected int _type;
663
664   /**
665    * The string representing the token. This member might not be present if a
666    * {@link Tokenizer} is configured not to return token images.
667    *
668    * @see #getImage
669    * @see #setImage
670    */

671   protected String JavaDoc _image;
672
673   /**
674    * The length of the string representing the token..
675    *
676    * @see #getLength
677    * @see #setLength
678    */

679   protected int _length;
680
681   /**
682    * An information associated with the token. For instance, keywords can be
683    * distinguished using different companions for each keyword
684    *
685    * @see #getCompanion
686    * @see #setCompanion
687    * @see TokenizerProperties#addKeyword
688    */

689   protected Object JavaDoc _companion;
690
691   /**
692    * The absolute position where the token starts in the source of data.
693    *
694    * @see #getStartPosition
695    * @see #setStartPosition
696    */

697   protected int _startPosition;
698
699   /**
700    * The line where the token starts in the source of data. This member may not
701    * be set if a {@link Tokenizer} is configured not to return token line and
702    * column (see {@link TokenizerProperties#F_COUNT_LINES}).
703    *
704    * @see #getStartLine
705    * @see #setStartLine
706    */

707   protected int _startLine;
708
709   /**
710    * The column where the token starts in the source of data. This member may not
711    * be set if a {@link Tokenizer} is configured not to return token line and
712    * column (see {@link TokenizerProperties#F_COUNT_LINES}).
713    *
714    * @see #getStartColumn
715    * @see #setStartColumn
716    */

717   protected int _startColumn;
718
719   /**
720    * The line where the token ends in the source of data. This member may not
721    * be set if a {@link Tokenizer} is configured not to return token line and
722    * column (see {@link TokenizerProperties#F_COUNT_LINES}).
723    *
724    * @see #getEndLine
725    * @see #setEndLine
726    */

727   protected int _endLine;
728
729   /**
730    * The column where the token ends in the source of data. This member may not
731    * be set if a {@link Tokenizer} is configured not to return token line and
732    * column (see {@link TokenizerProperties#F_COUNT_LINES}).
733    *
734    * @see #getEndColumn
735    * @see #setEndColumn
736    */

737   protected int _endColumn;
738   
739   /**
740    * Array with the image parts. See {@link #getImageParts} for details.
741    */

742   protected String JavaDoc[] _imageParts;
743 }
744
Popular Tags