TokenizerImpl


1   /*
2    * The contents of this file are subject to the terms 
3    * of the Common Development and Distribution License 
4    * (the License).  You may not use this file except in
5    * compliance with the License.
6    * 
7    * You can obtain a copy of the license at 
8    * https://glassfish.dev.java.net/public/CDDLv1.0.html or
9    * glassfish/bootstrap/legal/CDDLv1.0.txt.
10   * See the License for the specific language governing 
11   * permissions and limitations under the License.
12   * 
13   * When distributing Covered Code, include this CDDL 
14   * Header Notice in each file and include the License file 
15   * at glassfish/bootstrap/legal/CDDLv1.0.txt.  
16   * If applicable, add the following below the CDDL Header, 
17   * with the fields enclosed by brackets [] replaced by
18   * you own identifying information: 
19   * "Portions Copyrighted [year] [name of copyright owner]"
20   * 
21   * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
22   */
23  package com.sun.appserv.management.util.misc;
24  
25  import java.text.StringCharacterIterator  ;
26  import java.util.List  ;
27  import java.util.ArrayList  ;
28  import java.util.Iterator  ;
29  import java.util.Arrays  ;
30  
31  
32  class IllegalEscapeSequenceException extends TokenizerException
33  {
34      static final long serialVersionUID = 6579038898242625567L;
35      public  IllegalEscapeSequenceException( String   msg )    { super( msg ); }
36  }
37  
38  final class UnterminatedLiteralStringException extends TokenizerException
39  {
40      static final long serialVersionUID = -1327166469948605347L;
41      public  UnterminatedLiteralStringException( String   msg )    { super( msg ); }
42  }
43  
44  final class MalformedUnicodeSequenceException extends IllegalEscapeSequenceException
45  {
46      static final long serialVersionUID = 6604956430084180525L;
47      public  MalformedUnicodeSequenceException( String   msg ) { super( msg ); }
48  }
49  
50  /**
51   */
52  public final class TokenizerImpl implements Tokenizer
53  {
54      final String  []      mTokens;
55      
56          public
57      TokenizerImpl( String     input )
58          throws TokenizerException
59      {
60          this( input, new TokenizerParams() );
61      }
62      
63      private static final char   QUOTE_CHAR  = '\"';
64      
65          public
66      TokenizerImpl(
67          String            input,
68          TokenizerParams params )
69          throws TokenizerException
70      {
71          final TokenizerInternal worker = new TokenizerInternal( input, params );
72      
73          List  <Object  >    allTokens   = worker.parseTokens( );
74  
75          if ( params.mMultipleDelimsCountAsOne )
76          {
77              allTokens   = removeMultipleDelims( allTokens );
78          }
79          
80          mTokens = interpretTokenList( allTokens );
81      }
82      
83          final static List  <Object  >
84      removeMultipleDelims( List  <Object  > list )
85      {
86          final List  <Object  >      resultList  = new ArrayList  <Object  >();
87          
88          boolean lastWasDelim    = false;
89          for( final Object   value : list )
90          {
91              if ( value instanceof String   )
92              {
93                  resultList.add( value );
94                  lastWasDelim    = false;
95              }
96              else if ( ! lastWasDelim )
97              {
98                  // add the delimiter
99                  resultList.add( value );
100                 lastWasDelim    = true;
101             }
102         }
103         
104         return( resultList );
105     }
106     
107     /**
108         Interpret the parsed token list, which consists of a series of strings
109         and tokens.  We need to handle the special cases where the list starts
110         with a delimiter and/or ends with a delimiter.  Examples:
111         
112         ""  => {}
113         "." => { "", "" }
114         "..."   => { "", "", "", "" }
115         "x."    => { "x", "" }
116         ".x"    => { "", "x" }
117         "y.x"   => { "y", "x" }
118      */
119         static String  []
120     interpretTokenList( List  <Object  > list )
121     {
122         final List  <String  >      resultList  = new ArrayList  <String  >();
123 
124         boolean lastWasDelim    = true;
125 
126         for( final Object   value : list )
127         {
128             if ( value instanceof String   )
129             {
130                 resultList.add( (String  )value );
131                 lastWasDelim    = false;
132             }
133             else
134             {
135                 if ( lastWasDelim )
136                 {
137                     // this one's a delimiter, and so was the last one
138                     // insert the implicit empty string
139                     resultList.add( "" );
140                 }
141                 else
142                 {
143                     lastWasDelim    = true;
144                 }
145             }
146         }
147         
148         // a trailing delimiter implies an empty string after it
149         if ( lastWasDelim && list.size() != 0 )
150         {
151             resultList.add( "" );
152         }
153         
154         return( (String  [])resultList.toArray( new String  [ resultList.size() ] ) );
155     }
156     
157         public String   []
158     getTokens()
159     {
160         return( mTokens );
161     }
162 }
163 
164 
165 
166 final class TokenizerInternal
167 {
168     final String                      mInput;
169     final TokenizerParams           mParams;
170     final StringCharacterIterator     mIter;
171     
172     // a distinct object used to denote a delimiter
173     private static final class Delim
174     {
175         private Delim() {}
176         public static Delim getInstance()   { return( new Delim() ); }
177         public String     toString() { return( "<DELIM>" ); }
178     }
179     final static Delim  DELIM   = Delim.getInstance();
180     
181         
182     TokenizerInternal(
183         String            input,
184         TokenizerParams params )
185     {
186         mInput          = input;
187         mParams         = params;
188         mIter       = new StringCharacterIterator  ( input );
189     }
190     
191         private static boolean
192     isSpecialEscapeChar( char theChar )
193     {
194         // carriage return or newline
195         return( theChar == 'n' || theChar == 'r' || theChar == 't' ||theChar == QUOTE_CHAR );
196     }
197     
198         private boolean
199     isCallerProvidedEscapableChar( char theChar )
200     {
201         return( mParams.mEscapableChars.indexOf( theChar ) >= 0 ||
202             theChar == mParams.mEscapeChar );
203     }
204     
205         private boolean
206     isEscapableChar( char theChar )
207     {
208         return( isCallerProvidedEscapableChar( theChar ) || isSpecialEscapeChar( theChar ) );
209     }
210     
211         private boolean
212     isDelim( String   delims, char theChar )
213     {
214         return( delims.indexOf( theChar ) >= 0 || theChar == mIter.DONE );
215     }
216     
217         private static boolean
218     isDigit( char theChar )
219     {
220         return( (theChar >= '0' && theChar <= '9') );
221     }
222     
223 
224         private static boolean
225     isHexDigit( char theChar )
226     {
227         return( isDigit( theChar ) || (theChar >= 'a' && theChar <= 'f') || isUpper( theChar ) );
228     }
229     
230         private static boolean
231     isUpper( char c )
232     {
233         return( (c >= 'A' && c <= 'F') );
234     }
235     
236         private boolean
237     hasMoreChars()
238     {
239         return( mIter.current() != mIter.DONE );
240     }
241     
242         private int
243     getIndex()
244     {
245         return( mIter.getIndex() );
246     }
247     
248         private char
249     setIndex( int index )
250     {
251         return( mIter.setIndex( index ) );
252     }
253 
254         private char
255     nextChar()
256     {
257         final char  theChar = mIter.current();
258         mIter.next();
259         
260         return( theChar );
261     }
262     
263     private static final char   QUOTE_CHAR  = '\"';
264     private static final char   TAB_CHAR    = '\t';
265     
266         private char
267     decodeUnicodeSequence()
268         throws MalformedUnicodeSequenceException
269     {
270         int     value   = 0;
271         
272         try
273         {
274             for( int i = 0; i < 4; ++i )
275             {
276                 value   = (value << 4 ) | hexValue( nextChar() );
277             }
278         }
279         catch( Exception   e )
280         {
281             throw new MalformedUnicodeSequenceException( "" );
282         }
283         
284         return( (char)value );
285     }
286     
287         private static int
288     hexValue( char c )
289     {
290         if ( ! isHexDigit( c ) )
291         {
292             throw new IllegalArgumentException  ();
293         }
294         
295         int value   = 0;
296 
297         if ( isDigit( c ) )
298         {
299             value   = (int)c - (int)'0';
300         }
301         else if ( isUpper( c ) )
302         {
303             value   = (int)c - (int)'A';
304         }
305         else
306         {
307             value   = (int)c - (int)'a';
308         }
309         return( value );
310     }
311     
312         private char
313     getEscapedChar( final char inputChar )
314         throws MalformedUnicodeSequenceException,IllegalEscapeSequenceException
315     {
316         char    outChar = 0;
317         
318         if ( isCallerProvidedEscapableChar( inputChar ) )
319         {
320             outChar = inputChar;
321         }
322         else
323         {
324             switch( inputChar )
325             {
326                 default:    throw new IllegalEscapeSequenceException( "" + inputChar );
327                 case 'n':   outChar = '\n';     break;
328                 case 'r':   outChar = '\r';     break;
329                 case 't':   outChar = '\t';     break;
330                 case QUOTE_CHAR:    outChar = QUOTE_CHAR;   break;
331                 case 'u':   outChar = decodeUnicodeSequence();  break;
332             }
333         }
334         
335         return( outChar );
336     }
337     
338     
339         private String  
340     processEscapeSequence()
341     {
342         // index of the character following the escape character
343         String    s   = null;
344         
345         final char  theChar = nextChar();
346         final int   continuePos = mIter.getIndex();
347         try
348         {
349             s   = "" + getEscapedChar( theChar );
350         }
351         catch( TokenizerException e )
352         {
353             // emit the escape character and the character following it]
354             // literally, then proceed.
355             s   = mParams.mEscapeChar + "" + theChar;
356             mIter.setIndex( continuePos );
357         }
358         
359         return( s );
360     }
361     
362         ArrayList  <Object  >
363     parseTokens(   )
364         throws UnterminatedLiteralStringException,
365             MalformedUnicodeSequenceException, IllegalEscapeSequenceException
366     {
367         final StringBuffer    tok = new StringBuffer  ();
368         final ArrayList  <Object  >     tokens  = new ArrayList  <Object  >();
369         boolean             insideStringLiteral = false;
370         
371         /**
372             Escape sequences are always processed regardless of whether we're inside a
373             quoted string or not.  A quote string really only alters whether delimiters
374             are treated as literal characters, or not.
375          */
376         while ( hasMoreChars()  )
377         {
378             final char  theChar = nextChar();
379             
380             if ( theChar == mParams.mEscapeChar )
381             {
382                 if ( mParams.mEmitInvalidEscapeSequencesLiterally )
383                 {
384                     tok.append( processEscapeSequence() );
385                 }
386                 else
387                 {
388                     tok.append( getEscapedChar( nextChar() ) );
389                 }
390             }
391             else if ( theChar == Tokenizer.LITERAL_STRING_DELIM )
392             {
393                 // special cases of "", """", """""", etc require forcing an empty string out
394                 // these case have no delimiter or regular characters to cause a string to
395                 // be emitted
396                 if ( insideStringLiteral && tok.length() == 0 && tokens.size() == 0)
397                 {
398                     tokens.add( "" );
399                 }
400                 
401                 insideStringLiteral = ! insideStringLiteral;
402             }
403             else if ( insideStringLiteral )
404             {
405                 tok.append( theChar );
406             }
407             else if ( isDelim( mParams.mDelimiters, theChar ) )
408             {
409                 // we've hit a delimiter...if characters have accumulated, spit them out
410                 // then spit out the delimiter token.
411                 if ( tok.length() != 0 )
412                 {
413                     tokens.add( tok.toString() );
414                     tok.setLength( 0 );
415                 }
416                 tokens.add( DELIM );
417             }
418             else
419             {
420                 tok.append( theChar );
421             }
422         }
423         
424         if ( tok.length() != 0 )
425         {
426             tokens.add( tok.toString() );
427         }
428         
429         if ( insideStringLiteral )
430         {
431             throw new UnterminatedLiteralStringException( tok.toString() );
432         }
433         
434         return( tokens );
435     }
436 }
437 
438
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags