HeaderTokenizer


1   /*
2    * The contents of this file are subject to the terms 
3    * of the Common Development and Distribution License 
4    * (the "License").  You may not use this file except 
5    * in compliance with the License.
6    * 
7    * You can obtain a copy of the license at 
8    * glassfish/bootstrap/legal/CDDLv1.0.txt or 
9    * https://glassfish.dev.java.net/public/CDDLv1.0.html. 
10   * See the License for the specific language governing 
11   * permissions and limitations under the License.
12   * 
13   * When distributing Covered Code, include this CDDL 
14   * HEADER in each file and include the License file at 
15   * glassfish/bootstrap/legal/CDDLv1.0.txt.  If applicable, 
16   * add the following below this CDDL HEADER, with the 
17   * fields enclosed by brackets "[]" replaced with your 
18   * own identifying information: Portions Copyright [yyyy] 
19   * [name of copyright owner]
20   */
21  
22  /*
23   * @(#)HeaderTokenizer.java 1.10 05/08/29
24   *
25   * Copyright 1997-2005 Sun Microsystems, Inc. All Rights Reserved.
26   */
27  
28  package javax.mail.internet;
29  
30  import java.util.*;
31  
32  /**
33   * This class tokenizes RFC822 and MIME headers into the basic
34   * symbols specified by RFC822 and MIME. <p>
35   *
36   * This class handles folded headers (ie headers with embedded
37   * CRLF SPACE sequences). The folds are removed in the returned
38   * tokens. 
39   *
40   * @version 1.10, 05/08/29
41   * @author  John Mani
42   */
43  
44  public class HeaderTokenizer {
45  
46      /**
47       * The Token class represents tokens returned by the 
48       * HeaderTokenizer.
49       */
50      public static class Token {
51  
52      private int type;
53      private String   value;
54  
55      /**
56       * Token type indicating an ATOM.
57       */
58      public static final int ATOM        = -1;
59  
60      /**
61       * Token type indicating a quoted string. The value 
62       * field contains the string without the quotes.
63       */
64      public static final int QUOTEDSTRING    = -2;
65  
66      /**
67       * Token type indicating a comment. The value field 
68       * contains the comment string without the comment 
69       * start and end symbols.
70       */
71      public static final int COMMENT     = -3;
72  
73      /**
74       * Token type indicating end of input.
75       */
76      public static final int  EOF        = -4;
77  
78      /**
79       * Constructor.
80       * @param   type    Token type
81       * @param   value   Token value
82       */
83      public Token(int type, String   value) {
84           this.type = type;
85           this.value = value;
86      }
87  
88      /**
89       * Return the type of the token. If the token represents a
90       * delimiter or a control character, the type is that character
91       * itself, converted to an integer. Otherwise, it's value is 
92       * one of the following:
93       * <ul>
94       * <li><code>ATOM</code> A sequence of ASCII characters 
95       *  delimited by either SPACE, CTL, "(", <"> or the 
96       *  specified SPECIALS
97       * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
98       *  within quotes
99       * <li><code>COMMENT</code> A sequence of ASCII characters 
100      *  within "(" and ")".
101      * <li><code>EOF</code> End of header
102      * </ul>
103      */
104     public int getType() {
105         return type;
106     }
107 
108     /**
109      * Returns the value of the token just read. When the current
110      * token is a quoted string, this field contains the body of the
111      * string, without the quotes. When the current token is a comment,
112      * this field contains the body of the comment.
113      *
114      * @return  token value
115      */
116     public String   getValue() {
117         return value;
118     }
119     }
120 
121     private String   string; // the string to be tokenized
122     private boolean skipComments; // should comments be skipped ?
123     private String   delimiters; // delimiter string
124     private int currentPos; // current parse position
125     private int maxPos; // string length
126     private int nextPos; // track start of next Token for next()
127     private int peekPos; // track start of next Token for peek()
128 
129     /**
130      * RFC822 specials
131      */
132     public final static String   RFC822 = "()<>@,;:\\\"\t .[]";
133 
134     /**
135      * MIME specials
136      */
137     public final static String   MIME = "()<>@,;:\\\"\t []/?=";
138 
139     // The EOF Token
140     private final static Token EOFToken = new Token(Token.EOF, null);
141 
142     /**
143      * Constructor that takes a rfc822 style header.
144      *
145      * @param   header  The rfc822 header to be tokenized
146      * @param   delimiters      Set of delimiter characters 
147      *              to be used to delimit ATOMS. These
148      *              are usually <code>RFC822</code> or 
149      *              <code>MIME</code>
150      * @param   skipComments  If true, comments are skipped and
151      *              not returned as tokens
152      */
153     public HeaderTokenizer(String   header, String   delimiters,
154                    boolean skipComments) {
155     string = (header == null) ? "" : header; // paranoia ?!
156     this.skipComments = skipComments;
157     this.delimiters = delimiters;
158     currentPos = nextPos = peekPos = 0;
159     maxPos = string.length();
160     }
161 
162     /**
163      * Constructor. Comments are ignored and not returned as tokens
164      *
165      * @param   header  The header that is tokenized
166      * @param   delimiters  The delimiters to be used
167      */
168     public HeaderTokenizer(String   header, String   delimiters) {
169     this(header, delimiters, true);
170     }
171 
172     /**
173      * Constructor. The RFC822 defined delimiters - RFC822 - are
174      * used to delimit ATOMS. Also comments are skipped and not
175      * returned as tokens
176      */
177     public HeaderTokenizer(String   header)  {
178     this(header, RFC822);
179     }
180 
181     /**
182      * Parses the next token from this String. <p>
183      *
184      * Clients sit in a loop calling next() to parse successive
185      * tokens until an EOF Token is returned.
186      *
187      * @return      the next Token
188      * @exception   ParseException if the parse fails
189      */
190     public Token next() throws ParseException   { 
191     Token tk;
192 
193     currentPos = nextPos; // setup currentPos
194     tk = getNext();
195     nextPos = peekPos = currentPos; // update currentPos and peekPos
196     return tk;
197     }
198 
199     /**
200      * Peek at the next token, without actually removing the token
201      * from the parse stream. Invoking this method multiple times
202      * will return successive tokens, until <code>next()</code> is
203      * called. <p>
204      *
205      * @return      the next Token
206      * @exception   ParseException if the parse fails
207      */
208     public Token peek() throws ParseException   {
209     Token tk;
210 
211     currentPos = peekPos; // setup currentPos
212     tk = getNext();
213     peekPos = currentPos; // update peekPos
214     return tk;
215     }
216 
217     /**
218      * Return the rest of the Header.
219      *
220      * @return String   rest of header. null is returned if we are
221      *          already at end of header
222      */
223     public String   getRemainder() {
224     return string.substring(nextPos);
225     }
226 
227     /*
228      * Return the next token starting from 'currentPos'. After the
229      * parse, 'currentPos' is updated to point to the start of the 
230      * next token.
231      */
232     private Token getNext() throws ParseException   {
233     // If we're already at end of string, return EOF
234     if (currentPos >= maxPos)
235         return EOFToken;
236 
237     // Skip white-space, position currentPos beyond the space
238     if (skipWhiteSpace() == Token.EOF)
239         return EOFToken;
240 
241     char c; 
242     int start; 
243     boolean filter = false;
244     
245     c = string.charAt(currentPos);
246 
247     // Check or Skip comments and position currentPos
248     // beyond the comment
249     while (c == '(') {
250         // Parsing comment ..
251         int nesting;
252         for (start = ++currentPos, nesting = 1; 
253          nesting > 0 && currentPos < maxPos;
254          currentPos++) {
255         c = string.charAt(currentPos);
256         if (c == '\\') {  // Escape sequence
257             currentPos++; // skip the escaped character
258             filter = true;
259         } else if (c == '\r')
260             filter = true;
261         else if (c == '(')
262             nesting++;
263         else if (c == ')')
264             nesting--;
265         }
266         if (nesting != 0)
267         throw new ParseException  ("Unbalanced comments");
268 
269         if (!skipComments) {
270         // Return the comment, if we are asked to.
271         // Note that the comment start & end markers are ignored.
272         String   s;
273         if (filter) // need to go thru the token again.
274             s = filterToken(string, start, currentPos-1);
275         else
276             s = string.substring(start,currentPos-1);
277 
278         return new Token(Token.COMMENT, s);
279         }
280 
281         // Skip any whitespace after the comment.
282         if (skipWhiteSpace() == Token.EOF)
283         return EOFToken;
284         c = string.charAt(currentPos);
285     }
286 
287     // Check for quoted-string and position currentPos 
288     //  beyond the terminating quote
289     if (c == '"') {
290         for (start = ++currentPos; currentPos < maxPos; currentPos++) {
291         c = string.charAt(currentPos);
292         if (c == '\\') { // Escape sequence
293             currentPos++;
294             filter = true;
295         } else if (c == '\r')
296             filter = true;
297         else if (c == '"') {
298             currentPos++;
299             String   s;
300 
301             if (filter)
302             s = filterToken(string, start, currentPos-1);
303             else
304             s = string.substring(start,currentPos-1);
305 
306             return new Token(Token.QUOTEDSTRING, s);
307         }
308         }
309         throw new ParseException  ("Unbalanced quoted string");
310     }
311     
312     // Check for SPECIAL or CTL
313     if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
314         currentPos++; // re-position currentPos
315         char ch[] = new char[1];
316         ch[0] = c;
317         return new Token((int)c, new String  (ch));
318     }
319 
320     // Check for ATOM
321     for (start = currentPos; currentPos < maxPos; currentPos++) {
322         c = string.charAt(currentPos);
323         // ATOM is delimited by either SPACE, CTL, "(", <"> 
324         // or the specified SPECIALS
325         if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
326         c == '"' || delimiters.indexOf(c) >= 0)
327         break;
328     }
329     return new Token(Token.ATOM, string.substring(start, currentPos));
330     }
331 
332     // Skip SPACE, HT, CR and NL
333     private int skipWhiteSpace() {
334     char c;
335     for (; currentPos < maxPos; currentPos++)
336         if (((c = string.charAt(currentPos)) != ' ') && 
337         (c != '\t') && (c != '\r') && (c != '\n'))
338         return currentPos;
339     return Token.EOF;
340     }
341 
342     /* Process escape sequences and embedded LWSPs from a comment or
343      * quoted string.
344      */
345     private static String   filterToken(String   s, int start, int end) {
346     StringBuffer   sb = new StringBuffer  ();
347     char c;
348     boolean gotEscape = false;
349     boolean gotCR = false;
350 
351     for (int i = start; i < end; i++) {
352         c = s.charAt(i);
353         if (c == '\n' && gotCR) {
354         // This LF is part of an unescaped 
355         // CRLF sequence (i.e, LWSP). Skip it.
356         gotCR = false;
357         continue;
358         }
359 
360         gotCR = false;
361         if (!gotEscape) {
362         // Previous character was NOT '\'
363         if (c == '\\') // skip this character
364             gotEscape = true;
365         else if (c == '\r') // skip this character
366             gotCR = true;
367         else // append this character
368             sb.append(c);
369         } else {
370         // Previous character was '\'. So no need to 
371         // bother with any special processing, just 
372         // append this character
373         sb.append(c);
374         gotEscape = false;
375         }
376     }
377     return sb.toString();
378     }
379 }
380
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags