/*
 * The contents of this file are subject to the terms of the Common Development
 * and Distribution License (the License). You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
 * or http://www.netbeans.org/cddl.txt.
 *
 * When distributing Covered Code, include this CDDL Header Notice in each file
 * and include the License file at http://www.netbeans.org/cddl.txt.
 * If applicable, add the following below the CDDL Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
 * Microsystems, Inc. All Rights Reserved.
 */

package org.netbeans.editor;

/**
* Lexical analyzer that works on a given text buffer. It allows
* a given character buffer to be parsed sequentially by calling
* <tt>nextToken()</tt>, which returns the token-ids.
*
* After the token is found by calling the <tt>nextToken</tt> method,
* the <tt>getTokenOffset()</tt> method can be used
* to get the starting offset of the current
* token in the buffer. The <tt>getTokenLength()</tt> gives the length
* of the current token.
*
* The heart of the analyzer is the <tt>parseToken()</tt> method which
* parses the text and returns the token-id of the last token found.
* The <tt>parseToken()</tt> method is called from <tt>nextToken()</tt>.
* It operates with two important variables. The <tt>offset</tt>
* variable identifies the currently scanned character in the buffer.
* The <tt>tokenOffset</tt> is the beginning of the current token.
* The <tt>state</tt> variable identifies the current internal
* state of the analyzer and is updated accordingly as the characters are parsed.
* If <tt>parseToken()</tt> recognizes a token, it returns its ID;
* the <tt>tokenOffset</tt> is then the token's beginning in the buffer and
* <tt>offset - tokenOffset</tt> is its length. When the token has been processed,
* the value of <tt>tokenOffset</tt> is set to the current
* value of <tt>offset</tt> and the parsing continues.
*
* Internal states are integer constants used internally by the analyzer.
* They are assigned to the <tt>state</tt> variable to express
* that the analyzer has moved from one state to another.
* They are usually numbered starting from zero but they don't
* have to be. The only reserved value is -1, used
* for the INIT state - the initial internal state of the analyzer.
*
* There is also support for persisting information about
* the current state of the analyzer. This information can later be used
* to restore the parsing from a particular state instead of
* parsing from the beginning of the buffer. This feature is very
* useful if modifications are performed in the document.
* The information is stored through the <tt>StateInfo</tt> interface,
* with <tt>BaseStateInfo</tt> as the basic implementation.
* It enables getting and setting the two values important
* from the persistence point of view.
* The first one is the value of the <tt>state</tt> variable.
* The other one is the difference <tt>offset - tokenOffset</tt>,
* which is called pre-scan. A particular analyzer can define
* additional values important for persistent storage.
* The <tt>createStateInfo()</tt> method can be overridden to create
* a custom state-info, and <tt>loadState()</tt> and <tt>storeState()</tt>
* can be overridden to get/set the additional values.
*
* The <tt>load()</tt> method sets the buffer to be parsed.
* There is a special position parameter of the load() method
* that relates the character buffer passed to the load()
* method to the position of the buffer's data in the document.
* For this extended functionality the document must be passed
* to the constructor of the lexical analyzer at some level.
*
*
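* <p>
* A minimal usage sketch (illustrative only, not part of this class): it assumes
* a concrete subclass, here a hypothetical <tt>MySyntax</tt>, and a character
* array <tt>chars</tt> obtained elsewhere. It scans the whole buffer in one
* pass and prints the offset and length of every token.
* <pre>
*   Syntax syntax = new MySyntax();   // hypothetical concrete subclass
*   syntax.load(null, chars, 0, chars.length, true, -1);
*   TokenID tokenID = syntax.nextToken();
*   while (tokenID != null) {
*       System.err.println("token at " + syntax.getTokenOffset()
*           + ", length " + syntax.getTokenLength());
*       tokenID = syntax.nextToken();
*   }
* </pre>
*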
* @author Miloslav Metelka
* @version 1.00
*/

public class Syntax {

    /** Is the state of analyzer equal to a given state info? */
    public static final int EQUAL_STATE = 0;

    /** Is the state of analyzer different from given state info? */
    public static final int DIFFERENT_STATE = 1;


    /** Initial internal state of the analyzer */
    public static final int INIT = -1;


    /** Internal state of the lexical analyzer. At the beginning
    * it's set to the INIT value but it is changed by <tt>parseToken()</tt>
    * as the characters are processed one by one.
    */
    protected int state = INIT;

    /** Text buffer to scan */
    protected char buffer[];

    /** Current offset in the buffer */
    protected int offset;

    /** Offset holding the beginning of the current token */
    protected int tokenOffset;

    /** Length of the token that was found */
    protected int tokenLength;

    /** Path from which the found token-id comes.
    * The <tt>TokenContext.getContextPath()</tt> can be used
    * to get the path. If the lexical analyzer doesn't use
    * any child token-contexts it can assign
    * the path in the constructor.
    */
    protected TokenContextPath tokenContextPath;

    /** Setting this flag to true means that there are currently no more
    * buffers available, so the analyzer should return all the tokens,
    * including those whose successful scanning would otherwise be
    * left for later, when the next buffer becomes available. Setting
    * this flag to true ensures that all the characters in the current
    * buffer will be processed.
    * The lexical analyzer should on the one hand process all the characters
    * but on the other hand it should "save" its context. For example,
    * if the scanner finds an unclosed comment at the end of the buffer,
    * it should return the comment token but
    * stay in the "being in comment" internal state.
    */
    protected boolean lastBuffer;

    /** Offset in the buffer at which scanning should stop. */
    protected int stopOffset;

    /** The position in the document that logically corresponds
    * to the stopOffset value. If there's no relation
    * to the document, it's -1. The reason why the relation
    * to the document's data is expressed through
    * the stopOffset-to-stopPosition relation is that
    * the stopOffset is the only offset that doesn't change
    * rapidly in the operation of the lexical analyzer.
    */
    protected int stopPosition;

    /** This variable can be populated by the parseToken() method
    * in case the user types an erroneous construction but
    * it's clear which correct token was meant.
    * For example, if the user writes a single '0x' it's an erroneous
    * construct, but it's clear that the user wants to enter
    * a hex number. In this situation parseToken()
    * should report an error, but it should also set the supposedTokenID
    * to the hex-number token.
    * This information is used while drawing the text. If the caret
    * stands inside or around such a token, getSupposedTokenID() is called
    * after calling nextToken(), and if it's non-null it is used
    * instead of the original token.
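    * <p>
    * An illustrative sketch only (the token-id constants and the error handling
    * are assumptions about a subclass, not part of this class): inside a
    * subclass's parseToken(), after "0x" was read with no hex digit following,
    * the error could be reported while still hinting at the intended token:
    * <pre>
    *   supposedTokenID = HEX_LITERAL;   // hypothetical token-id constant
    *   return ERROR_TOKEN;              // hypothetical token-id constant
    * </pre>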
    */
    protected TokenID supposedTokenID;

    /** Function that should be called externally to scan the text.
    * It manages the call to parseToken() and takes care of the proper
    * setting of the offsets.
    * It can be extended to support any custom debugging required.
    */
    public TokenID nextToken() {
        // Return immediately when at the end of buffer
        if (tokenOffset >= stopOffset) {
            tokenLength = 0;
            return null; // signal no token found
        }

        // Divide non-debug and debug sections
        supposedTokenID = null;
        TokenID tokenID = parseToken();
        if (tokenID != null) { // regular token found
            tokenLength = offset - tokenOffset;
            tokenOffset = offset; // move to the next token
            if (tokenLength == 0) { // test for empty token
                return nextToken(); // repeat until non-empty token is found
            }
        } else { // EOT reached
            tokenLength = 0;
        }

        return tokenID;
    }

    /** This is the core function of the analyzer; it returns either the token-id
    * or null to indicate that the end of the buffer was reached.
    * The function scans the active character and does one or more
    * of the following actions:
    * 1. change the internal analyzer state
    * 2. set the token-context-path and return the token-id
    * 3. adjust the current position to signal a different end of token;
    * the character that offset points to is not included in the token
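    * <p>
    * A simplified sketch of a possible override (illustrative only, not part of
    * this class), assuming a hypothetical subclass that defines the token-id
    * constants <tt>WHITESPACE</tt> and <tt>WORD</tt> and the internal states
    * <tt>ISI_WHITESPACE</tt> and <tt>ISI_WORD</tt>:
    * <pre>
    *   protected TokenID parseToken() {
    *       while (offset &lt; stopOffset) {
    *           char ch = buffer[offset];
    *           switch (state) {
    *           case INIT:
    *               state = Character.isWhitespace(ch) ? ISI_WHITESPACE : ISI_WORD;
    *               break;
    *           case ISI_WHITESPACE:
    *               if (!Character.isWhitespace(ch)) {
    *                   state = INIT;
    *                   return WHITESPACE; // character at offset is not part of the token
    *               }
    *               break;
    *           case ISI_WORD:
    *               if (Character.isWhitespace(ch)) {
    *                   state = INIT;
    *                   return WORD;
    *               }
    *               break;
    *           }
    *           offset++;
    *       }
    *       if (lastBuffer) { // no more data will come
    *           if (state != INIT) { // an incomplete token is pending
    *               TokenID pending = (state == ISI_WHITESPACE) ? WHITESPACE : WORD;
    *               state = INIT;
    *               return pending;
    *           }
    *       }
    *       return null; // EOT - the token may continue in the next buffer
    *   }
    * </pre>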
    */
    protected TokenID parseToken() {
        return null;
    }

    /** Load the state from the syntax mark into the analyzer.
    * @param stateInfo info about the state of the lexical analyzer to load.
    * It can be null to indicate there's no previous state, so the analyzer
    * starts from its initial state.
    * @param buffer buffer that will be scanned
    * @param offset offset of the first character that will be scanned
    * @param len length of the area to be scanned
    * @param lastBuffer whether this is the last buffer in the document. All the tokens
    * will be returned including the last possibly incomplete one. If the data
    * come from the document, the simple rule for this parameter
    * is (doc.getLength() == stop-position) where stop-position
    * is the position corresponding to (offset + len) in the buffer
    * that comes from the document data.
    * @param stopPosition position in the document that corresponds to the (offset + len) offset
    * in the provided buffer. It only makes sense if the data in the buffer come from the document.
    * It helps in writing advanced analyzers that need to interact with other data
    * in the document than only those provided in the character buffer.
    * If there is no relation to the document data, the stopPosition parameter
    * must be set to -1, which means an invalid value.
    * The stop-position is passed (instead of a start-position) because it doesn't
    * change during the analyzer's operation. It corresponds to the <tt>stopOffset</tt>
    * that also doesn't change during the analyzer's operation, so any
    * buffer-offset can be converted to a position by computing
    * <tt>stopPosition + buffer-offset - stopOffset</tt>,
    * where stopOffset is the instance variable that is assigned
    * to <tt>offset + len</tt> in the body of this method.
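    * <p>
    * An illustrative call (the variable names are assumptions, not part of this
    * API), assuming the whole document text has already been read into
    * <tt>chars</tt>, so the stop-position equals the document length:
    * <pre>
    *   syntax.load(null, chars, 0, chars.length, true, chars.length);
    * </pre>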
    */
    public void load(StateInfo stateInfo, char buffer[], int offset, int len,
                     boolean lastBuffer, int stopPosition) {
        this.buffer = buffer;
        this.offset = offset;
        this.tokenOffset = offset;
        this.stopOffset = offset + len;
        this.lastBuffer = lastBuffer;
        this.stopPosition = stopPosition;

        if (stateInfo != null) {
            loadState(stateInfo);
        } else {
            loadInitState();
        }
    }

    /** Relocate scanning to another buffer.
    * This is used to continue scanning after a previously
    * reported EOT. The relocation delta between the current offset and the requested offset
    * is computed and all the offsets are relocated. If there's a non-zero preScan
    * in the analyzer, it is the caller's responsibility to provide all the preScan
    * characters in the relocation buffer.
    * @param buffer next buffer where the scan will continue.
    * @param offset offset where the scan will continue.
    * It's not decremented by the current preScan.
    * @param len length of the area to be scanned.
    * It's not extended by the current preScan.
    * @param lastBuffer whether this is the last buffer in the document. All the tokens
    * will be returned including the last possibly incomplete one. If the data
    * come from the document, the simple rule for this parameter
    * is (doc.getLength() == stop-position) where stop-position
    * is the position corresponding to (offset + len) in the buffer
    * that comes from the document data.
    * @param stopPosition position in the document that corresponds to the (offset + len) offset
    * in the provided buffer. It only makes sense if the data in the buffer come from the document.
    * It helps in writing advanced analyzers that need to interact with other data
    * in the document than only those provided in the character buffer.
    * If there is no relation to the document data, the stopPosition parameter
    * must be set to -1, which means an invalid value.
    * The stop-position is passed (instead of a start-position) because it doesn't
    * change during the analyzer's operation. It corresponds to the <tt>stopOffset</tt>
    * that also doesn't change during the analyzer's operation, so any
    * buffer-offset can be converted to a position by computing
    * <tt>stopPosition + buffer-offset - stopOffset</tt>,
    * where stopOffset is the instance variable that is assigned
    * to <tt>offset + len</tt> in the body of relocate().
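    * <p>
    * An illustrative continuation sketch (the <tt>nextChunk</tt> buffer and the
    * <tt>chunkLen</tt>/<tt>lastChunk</tt> values are assumptions, not part of
    * this API): after EOT was reported, the caller puts the pre-scanned
    * characters in front of the next chunk of data and relocates the scan so
    * that it continues at the first newly read character.
    * <pre>
    *   int preScan = syntax.getPreScan();
    *   // nextChunk starts with the preScan characters that ended the previous
    *   // buffer, followed by chunkLen newly read characters
    *   syntax.relocate(nextChunk, preScan, chunkLen, lastChunk, -1);
    * </pre>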
    */
    public void relocate(char buffer[], int offset, int len,
    boolean lastBuffer, int stopPosition) {
        this.buffer = buffer;
        this.lastBuffer = lastBuffer;

        int delta = offset - this.offset; // delta according to current offset
        this.offset += delta;
        this.tokenOffset += delta;
        this.stopOffset = offset + len;
        this.stopPosition = stopPosition;
    }

    /** Get the current buffer */
    public char[] getBuffer() {
        return buffer;
    }

    /** Get the current scanning offset */
    public int getOffset() {
        return offset;
    }

    /** Get start of token in scanned buffer. */
    public int getTokenOffset() {
        return offset - tokenLength;
    }

    /** Get length of token in scanned buffer. */
    public int getTokenLength() {
        return tokenLength;
    }

    /** Get the token-context-path of the returned token. */
    public TokenContextPath getTokenContextPath() {
        return tokenContextPath;
    }

    public TokenID getSupposedTokenID() {
        return supposedTokenID;
    }

    /** Get the pre-scan, which is the number
    * of characters between offset and tokenOffset.
    * If there are no more characters in the current buffer,
    * the analyzer returns EOT, but it can be in a state where
    * some characters at the end of the current buffer have
    * already been parsed while the token
    * is still incomplete and cannot be returned yet.
    * The pre-scan value helps to determine how many characters
    * from the end of the current buffer should be present
    * at the beginning of the next buffer so that the current
    * incomplete token can be returned as the first token
    * when parsing the next buffer.
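    * <p>
    * A worked example (the numbers are illustrative only): if the analyzer
    * reached EOT with <tt>tokenOffset</tt> at 97 and <tt>offset</tt> at 100,
    * <tt>getPreScan()</tt> returns 3, so those last 3 characters of this buffer
    * must also appear immediately before the scanned area of the next buffer
    * (see <tt>relocate()</tt>).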
    */
    public int getPreScan() {
        return offset - tokenOffset;
    }

    /** Initialize the analyzer when scanning from the beginning
    * of the document, when the state stored in the syntax mark
    * is null for some reason, or to explicitly reset the analyzer
    * to the initial state. The offsets must not be touched by this method.
    */
    public void loadInitState() {
        state = INIT;
    }

    public void reset() {
        tokenLength = stopOffset = tokenOffset = offset = 0;
        loadInitState();
    }

    /** Load a valid mark state into the analyzer. The offsets
    * are already initialized when this method is called. This method
    * must get the state from the mark and set it in the analyzer. Then
    * it must decrease tokenOffset by the preScan stored in the mark state.
    * @param stateInfo mark state to be loaded into the syntax. It must be a non-null value.
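    * <p>
    * A round-trip sketch (illustrative only; <tt>chars</tt>, <tt>offset</tt>,
    * <tt>len</tt> and <tt>lastBuffer</tt> are assumptions): the state captured
    * by <tt>storeState()</tt> can later be passed back to <tt>load()</tt>,
    * which calls this method, to resume scanning instead of starting over.
    * <pre>
    *   StateInfo info = syntax.createStateInfo();
    *   syntax.storeState(info);   // remember the internal state and preScan
    *   // ... later ...
    *   // chars must contain the preScan characters just before 'offset'
    *   syntax.load(info, chars, offset, len, lastBuffer, -1);
    * </pre>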
    */
    public void loadState(StateInfo stateInfo) {
        state = stateInfo.getState();
        tokenOffset -= stateInfo.getPreScan();
    }

    /** Store state of this analyzer into given mark state. */
    public void storeState(StateInfo stateInfo) {
        stateInfo.setState(state);
        stateInfo.setPreScan(getPreScan());
    }

    /** Compare state of this analyzer to given state info */
    public int compareState(StateInfo stateInfo) {
        if (stateInfo != null) {
            return ((stateInfo.getState() == state) && stateInfo.getPreScan() == getPreScan())
                   ? EQUAL_STATE : DIFFERENT_STATE;
        } else {
            return DIFFERENT_STATE;
        }
    }

    /** Create state info appropriate for particular analyzer */
    public StateInfo createStateInfo() {
        return new BaseStateInfo();
    }

    /** Get the state name as a string. It can be used for debugging purposes
    * by the developer of a new syntax analyzer. The states that this function
    * recognizes can include all constants used in the analyzer so that it can
    * be used everywhere in the analyzer to convert numbers to more practical strings.
    */
    public String getStateName(int stateNumber) {
        switch (stateNumber) {
        case INIT:
            return "INIT"; // NOI18N

        default:
            return "Unknown state " + stateNumber; // NOI18N
        }
    }

    /** Syntax information as String */
    public String toString() {
        return "tokenOffset=" + tokenOffset // NOI18N
               + ", offset=" + offset // NOI18N
               + ", state=" + getStateName(state) // NOI18N
               + ", stopOffset=" + stopOffset // NOI18N
               + ", lastBuffer=" + lastBuffer; // NOI18N
    }


    /** Interface that stores two basic pieces of information about
    * the state of the whole lexical analyzer - its internal state and preScan.
    */
    public interface StateInfo {

        /** Get the internal state */
        public int getState();

        /** Store the internal state */
        public void setState(int state);

        /** Get the preScan value */
        public int getPreScan();

        /** Store the preScan value */
        public void setPreScan(int preScan);

    }


    /** Base implementation of the StateInfo interface */
    public static class BaseStateInfo implements StateInfo {

        /** analyzer state */
        private int state;

        /** Pre-scan length */
        private int preScan;

        public int getState() {
            return state;
        }

        public void setState(int state) {
            this.state = state;
        }

        public int getPreScan() {
            return preScan;
        }

        public void setPreScan(int preScan) {
            this.preScan = preScan;
        }

        public String toString(Syntax syntax) {
            return "state=" // NOI18N
                + ((syntax != null)
                    ? syntax.getStateName(getState())
                    : Integer.toString(getState()))
                + ", preScan=" + getPreScan(); // NOI18N
        }

        public String toString() {
            return toString(null);
        }

    }

}