// Origin: org.netbeans.lib.lexer.inc.TokenListUpdater (NetBeans lexer module, open-source mirror)


/*
 * The contents of this file are subject to the terms of the Common Development
 * and Distribution License (the License). You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
 * or http://www.netbeans.org/cddl.txt.
 *
 * When distributing Covered Code, include this CDDL Header Notice in each file
 * and include the License file at http://www.netbeans.org/cddl.txt.
 * If applicable, add the following below the CDDL Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2007 Sun
 * Microsystems, Inc. All Rights Reserved.
 */

19
20 package org.netbeans.lib.lexer.inc;
21
22 import org.netbeans.api.lexer.TokenId;
23 import org.netbeans.lib.lexer.EmbeddedTokenList;
24 import org.netbeans.lib.lexer.LanguageOperation;
25 import org.netbeans.lib.lexer.LexerInputOperation;
26 import org.netbeans.lib.lexer.LexerUtilsConstants;
27 import org.netbeans.lib.lexer.token.AbstractToken;
28 import org.netbeans.spi.lexer.TokenValidator;
29
30
/**
 * Token updater fixes a list of tokens constructed for a document
 * after text of the document gets modified.
 * <br>
 * Subclasses need to define all the abstract methods
 * so that the updating method can work on real token sequences.
 *
 * <p>
 * Updater looks similar to a list iterator
 * but there are differences in the semantics
 * of the iterator's modification operations.
 * <p>
 * The algorithm used in {@link #update(MutableTokenList, TokenHierarchyEventInfo, TokenListChange)}
 * is based on "General Incremental Lexical Analysis" written
 * by Tim A. Wagner and Susan L. Graham, University
 * of California, Berkeley. It's available online
 * at <a href="http://www.cs.berkeley.edu/Research/Projects/harmonia/papers/twagner-lexing.pdf">
 * twagner-lexing.pdf</a>.
 * <br>
 * An ending <code>EOF</code> token is not used but the lookahead
 * of the ending token(s) is increased by one (past the end of the input)
 * if they have reached the EOF.
 * <br>
 * Non-startable tokens are not supported.
 * <br>
 * When updating a token with lookback one as a result
 * of modification, the lookahead of the preceding token is inspected
 * to find out whether the modification has really affected it.
 * This can often save the previous token from being relexed.
 * <br>
 * Currently the algorithm computes the lookback values on the fly
 * and does not store the lookback in the tokens. For typical languages
 * the lookback is reasonably small (0, 1 or 2) so it's usually not worth
 * consuming extra space in token instances for storing of the lookback.
 * There would also be an additional overhead of updating the lookback
 * values in the tokens after the modification, and the algorithm code would
 * be somewhat less readable.
 *
 * <p>
 * The algorithm removes the affected tokens in the natural order as they
 * follow in the token stream. That can be used when the removed tokens
 * need to be collected (e.g. in an array).
 * <br>
 * If the offset and state after token recognition match
 * the end offset and state after recognition of the originally present
 * token, then the relexing is stopped because a match was found and the newly
 * produced tokens would match the present ones.
 * <br>
 * Otherwise the token(s) in the list are removed and replaced
 * by the relexed token and the relexing continues until a match is reached.
 *
 * @author Miloslav Metelka
 * @version 1.00
 */

85
86 public final class TokenListUpdater {
87
88     /**
89      * Use incremental algorithm to update the list of tokens
90      * after a modification done in the underlying storage.
91      */

92     public static <T extends TokenId> void update(MutableTokenList<T> tokenList,
93     TokenHierarchyEventInfo eventInfo, TokenListChange<T> change) {
94         // Ensure the offsets in token list are up-to-date
95
if (tokenList.getClass() == EmbeddedTokenList.class) {
96             ((EmbeddedTokenList<? extends TokenId>)tokenList).updateStartOffset();
97         }
98
99         // Fetch offset where the modification occurred
100
int modOffset = eventInfo.modificationOffset();
101         LanguageOperation<T> languageOperation = LexerUtilsConstants.mostEmbeddedLanguageOperation(
102                 tokenList.languagePath());
103
104         int tokenCount = tokenList.tokenCountCurrent(); // presently created token count
105
// Now determine which token is the first to be relexed.
106
// If it would be either modified token or previous-of-modified token
107
// (for modification right at the begining of modified token)
108
// then the token will be attempted to be validated (without running
109
// a lexer).
110
AbstractToken<T> modToken;
111         // modTokenOffset holds begining of the token in which the modification occurred.
112
int modTokenOffset;
113         // index points to the modified token
114
int index;
115
116         if (tokenCount == 0) { // no tokens yet or all removed
117
if (!tokenList.isFullyLexed()) {
118                 // No tokens created yet (they get created lazily).
119
return; // Do nothing in this case
120
}
121             // If fully lexed the tokens must mirror the input source -> offset == 0
122
assert (modOffset == 0);
123             modToken = null;
124             modTokenOffset = 0;
125             index = 0;
126
127         } else { // at least one token exists
128
// Check whether the modification at modOffset might affect existing tokens
129
// Get index of the token in which the modification occurred
130
// Get the offset of the last token into modTokenOffset variable
131
index = tokenCount - 1;
132             modTokenOffset = tokenList.tokenOffset(index);
133             if (modOffset >= modTokenOffset) { // inside or above the last token?
134
modToken = token(tokenList, index);
135                 int modTokenEndOffset = modTokenOffset + modToken.length();
136                 if (modOffset >= modTokenEndOffset) { // above last token
137
// Modification was right at the end boundary of the last token
138
// or above it (token list can be created lazily so that is valid case).
139
// Check whether the last token could be affected at all
140
// by checking the last token's lookahead.
141
// For fully lexed inputs the characters added to the end
142
// must be properly lexed and notified (even if the last present
143
// token has zero lookahead).
144
if (!tokenList.isFullyLexed()
145                         && modOffset >= modTokenEndOffset + tokenList.lookahead(index)
146                     )
147                         return; // not affected at all
148

149                     index++;
150                     modToken = null;
151                     modTokenOffset = modTokenEndOffset;
152                 } // else -> modification inside the last token
153

154             } else { // modification in non-last token
155
// Find modified token by binary search
156
int low = 0; // use index as 'high'
157
while (low < index) {
158                     int mid = (low + index) / 2;
159                     int midStartOffset = tokenList.tokenOffset(mid);
160
161                     if (midStartOffset < modOffset) {
162                         low = mid + 1;
163                     } else if (midStartOffset > modOffset) {
164                         index = mid - 1 < 0 ? 0 : mid - 1;
165                     } else {
166                         // Token starting exactly at modOffset found
167
index = mid;
168                         modTokenOffset = midStartOffset;
169                         break;
170                     }
171                 }
172                 if (index <= low) { // no token starting right at 'modOffset'
173
modTokenOffset = tokenList.tokenOffset(index);
174                 }
175                 modToken = token(tokenList, index);
176             }
177         }
178
179         // Store the index that points to the modified token
180
// i.e. modification at its begining or inside.
181
// Index variable can later be modified but present value is important
182
// for moving of the offset gap later.
183
change.setOffsetGapIndex(index);
184
185         // Index and offset from which the relexing will start.
186
int relexIndex;
187         int relexOffset;
188         // Whether the token validation should be attempted or not.
189
boolean attemptValidation = false;
190
191         if (index == 0) { // modToken is first in the list
192
relexIndex = index;
193             relexOffset = modTokenOffset;
194             // Can validate modToken if removal does not span whole token
195
if (modToken != null && eventInfo.removedLength() < modToken.length()) {
196                 attemptValidation = true;
197             }
198
199         } else { // Previous token exists
200
// Check for insert-only right at the end of the previous token
201
if (modOffset == modTokenOffset && eventInfo.removedLength() == 0) {
202                 index--; // move to previous token
203
modToken = token(tokenList, index);
204                 modTokenOffset -= modToken.length();
205             }
206
207             // Check whether modification affected previous token
208
if (index == 0 || modTokenOffset + tokenList.lookahead(index - 1) <= modOffset) {
209                 // Modification did not affect previous token
210
relexIndex = index;
211                 relexOffset = modTokenOffset;
212                 // Check whether modification was localized to modToken only
213
if (modOffset + eventInfo.removedLength() < modTokenOffset + modToken.length()) {
214                     attemptValidation = true;
215                 }
216
217             } else { // at least previous token affected
218
relexOffset = modTokenOffset - token(tokenList, index - 1).length();
219                 relexIndex = index - 2; // Start with token below previous token
220

221                 // Go back and mark all affected tokens for removals
222
while (relexIndex >= 0) {
223                     AbstractToken<T> token = token(tokenList, relexIndex);
224                     // Check if token was not affected by modification
225
if (relexOffset + tokenList.lookahead(relexIndex) <= modOffset) {
226                         break;
227                     }
228                     relexIndex--;
229                     relexOffset -= token.length();
230                 }
231                 relexIndex++; // Next token will be relexed
232
}
233         }
234         
235         // The lowest offset at which the relexing can end
236
// (the relexing may end at higher offset if the relexed
237
// tokens will end at different boundaries than the original
238
// tokens or if the states after the tokens' recognition
239
// will differ from the original states in the original tokens.
240
int matchOffset;
241
242         // Perform token validation of modToken if possible.
243
// The index variable will hold the token index right before the matching point.
244
if (attemptValidation) {
245             matchOffset = modTokenOffset + modToken.length();
246             TokenValidator tokenValidator = languageOperation.tokenValidator(modToken.id());
247             if (tokenValidator != null
248                 && (tokenList.getClass() != IncTokenList.class
249                     || eventInfo.tokenHierarchyOperation().canModifyToken(index, modToken))
250             ) {
251                     
252 // if (tokenValidator.validateToken(modToken, modOffset - modTokenOffset, modRelOffset,
253
// removedLength, insertedLength)
254
// ) {
255
// // Update positions
256
// change.initRemovedAddedOffsets()
257

258 // return; // validated successfully
259
// }
260
}
261
262         } else { // Validation cannot be attempted
263
// Need to compute matchOffset and matchIndex
264
// by iterating forward
265
if (index < tokenCount) {
266                 matchOffset = modTokenOffset + modToken.length();
267                 int removeEndOffset = modOffset + eventInfo.removedLength();
268                 while (matchOffset < removeEndOffset && index + 1 < tokenCount) {
269                     index++;
270                     matchOffset += token(tokenList, index).length();
271                 }
272
273             } else // After last token
274
matchOffset = modTokenOffset;
275         }
276
277         // State from which the lexer can be started
278
Object JavaDoc relexState = (relexIndex > 0) ? tokenList.state(relexIndex - 1) : null;
279         // Update the matchOffset so that it corresponds to the state
280
// after the modification
281
matchOffset += eventInfo.insertedLength() - eventInfo.removedLength();
282         change.setOffset(relexOffset);
283         
284         // Variables' values:
285
// 'index' - points to modified token. Or index == tokenCount for modification
286
// past the last token.
287
// 'tokenCount' - token count in the original token list.
288
// 'relexIndex' - points to the token that will be relexed as first.
289
// 'relexOffset' - points to begining of the token that will be relexed as first.
290
// 'matchOffset' - points to end of token after which the fixed token list could
291
// possibly match the original token list. Points to end of token at 'index'
292
// variable if 'index < tokenCount' and to the end of the last token
293
// if 'index == tokenCount'.
294

295         // Check whether relexing is necessary.
296
// Necessary condition for no-relexing is that the matchToken
297
// has zero lookahead (if lookahead would be >0
298
// then the matchToken would be affected and relexOffset != matchOffset).
299
// The states before relex token must match the state after the modified token
300
// In case of removal starting and ending at token boundaries
301
// the relexing might not be necessary.
302
boolean relex = (relexOffset != matchOffset)
303                 || index >= tokenCount
304                 || !LexerUtilsConstants.statesEqual(relexState, tokenList.state(index));
305
306         // There is an extra condition that the lookahead of the matchToken
307
// must not span the next (retained) token. This condition helps to ensure
308
// that the lookaheads will be the same like during regular batch lexing.
309
// As the empty tokens are not allowed the situation may only occur
310
// for lookahead > 1.
311
int lookahead;
312         if (!relex && (lookahead = tokenList.lookahead(index)) > 1 && index + 1 < tokenCount) {
313             relex = (lookahead > token(tokenList, index + 1).length()); // check next token
314
}
315
316         if (relex) { // Start relexing
317
LexerInputOperation<T> lexerInputOperation
318                     = tokenList.createLexerInputOperation(relexIndex, relexOffset, relexState);
319
320             do { // Fetch new tokens from lexer as necessary
321
AbstractToken<T> token = lexerInputOperation.nextToken();
322                 if (token == null) {
323                     attemptValidation = false;
324                     break;
325                 }
326
327                 lookahead = lexerInputOperation.lookahead();
328                 Object JavaDoc state = lexerInputOperation.lexerState();
329                 change.addToken(token, lookahead, state);
330
331                 relexOffset += token.length();
332                 // Remove obsolete tokens that would cover the area of just lexed token
333
if (relexOffset > matchOffset && index < tokenCount) {
334                     attemptValidation = false;
335                     do {
336                         index++;
337                         if (index == tokenCount) {
338                             // Make sure the number of removed tokens will be computed properly later
339
modToken = null;
340                             // Check whether it should lex till the end
341
// or whether 'Match at anything' should be done
342
if (tokenList.isFullyLexed()) {
343                                 // Will lex till the end of input
344
matchOffset = Integer.MAX_VALUE;
345                             } else {
346                                 // Force stop lexing
347
relex = false;
348                             }
349                             break;
350                         }
351                         matchOffset += token(tokenList, index).length();
352                     } while (relexOffset > matchOffset);
353                 }
354
355                 // Check whether the new token ends at matchOffset with the same state
356
// like the original which typically means end of relexing
357
if (relexOffset == matchOffset
358                     && (index < tokenCount)
359                     && LexerUtilsConstants.statesEqual(state, tokenList.state(index))
360                 ) {
361                     // Check whether lookaheads stored in the subsequent tokens in the chain
362
// reflect the lookahead of the just relexed token. Otherwise the algorithm
363
// would not work properly.
364
// See initial part of SimpleRandomTest.test() verifies this.
365
//
366
// The algorithm also attempts to have the same lookaheads in tokens
367
// like the regular batch scanning would produce.
368
// Although this is probably not strictly necessary requirement
369
// it helps to simplify the debugging in case the lexer does not work
370
// well in the incremental setup.
371
// The following code checks that the lookahead of the original match token
372
// (i.e. the token right before matchOffset) does "end" inside
373
// the next token - if not then relexing the next token is done.
374
// The second part of SimpleRandomTest.test() verifies this.
375
int matchTokenLookahead = tokenList.lookahead(index);
376                     boolean lookaheadOK = true;
377                     // When assuming non-empty tokens the lookahead 1
378
// just reaches the end of the next token
379
// so lookhead < 1 is always fine from this point of view.
380
if (matchTokenLookahead > 1 || lookahead > 1) {
381                         int laCheckIndex = index + 1;
382                         int laCheckTokenOffset = matchOffset;
383                         while (laCheckIndex < tokenCount) {
384                             int laCheckTokenLength = token(tokenList, laCheckIndex).length();
385                             lookahead -= laCheckTokenLength;
386                             matchTokenLookahead -= laCheckTokenLength;
387                             laCheckTokenOffset += laCheckTokenLength;
388                             if (lookahead <= 0 && matchTokenLookahead <=0) {
389                                 break; // No more work
390
}
391                             if (lookahead != tokenList.lookahead(laCheckIndex)
392                                     || matchTokenLookahead > 0
393                             ) {
394                                 // This token must be relexed
395
index = laCheckIndex;
396                                 matchOffset = laCheckTokenOffset;
397                                 lookaheadOK = false;
398                                 // Continue - further tokens may be affected
399
}
400                             laCheckIndex++;
401                         }
402                     }
403
404                     if (lookaheadOK) {
405                         if (attemptValidation) {
406 // if (modToken.id() == token.id()
407
// && tokenList.lookahead(index) == lookahead
408
// && !modToken.isFlyweight()
409
// && !token.isFlyweight()
410
// && (tokenList.getClass() != IncTokenList.class
411
// || change.tokenHierarchyOperation().canModifyToken(index, modToken))
412
// && LexerSpiTokenPackageAccessor.get().restoreToken(
413
// languageOperation.tokenHandler(),
414
// modToken, token)
415
// ) {
416
// // Restored successfully
417
// // TODO implement - fix token's length and return
418
// // now default in fact to failed validation
419
// }
420
attemptValidation = false;
421                         }
422                         break;
423                     }
424                 }
425             } while (relex);
426             lexerInputOperation.release();
427         }
428
429         // Now ensure that the original tokens will be replaced by the relexed ones.
430
change.setIndex(relexIndex);
431         change.setAddedEndOffset(relexOffset);
432         tokenList.replaceTokens(eventInfo, change,
433             (modToken != null) ? (index - relexIndex + 1) : (index - relexIndex));
434     }
435     
436     private static <T extends TokenId> AbstractToken<T> token(MutableTokenList<T> tokenList, int index) {
437         Object JavaDoc tokenOrEmbeddingContainer = tokenList.tokenOrEmbeddingContainerUnsync(index); // Unsync impl suffices
438
return LexerUtilsConstants.token(tokenOrEmbeddingContainer);
439     }
440
441 }
442
Popular Tags