SimpleSlopParser


1   /*
2    * Copyright 1999-2002,2004-2005 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.apache.cocoon.slop.parsing;
17  
18  import org.xml.sax.ContentHandler  ;
19  import org.xml.sax.SAXException  ;
20  import org.xml.sax.helpers.AttributesImpl  ;
21  import org.apache.cocoon.ProcessingException;
22  import org.apache.cocoon.xml.XMLUtils;
23  import org.apache.cocoon.slop.interfaces.SlopParser;
24  import org.apache.cocoon.slop.interfaces.SlopConstants;
25  
26  /**
27   * Simplistic SLOP parser, recognizes the following constructs:
28   *
29   *      Field: a line starting with letters and : is considered a field
30   *
31   *      Empty lines are detected.
32   *      Other lines are output as line elements
33   *
34   * This is sufficient for basic parsing of RFC 822 headers,
35   * but a configurable rfc822 mode would be good to differentiate
36   * between the header and body of the email message and parse them
37   * with different rules.
38   *
39   * @author <a HREF="mailto:bdelacretaz@apache.org">Bertrand Delacretaz</a>
40   * @version $Id: SimpleSlopParser.java 164808 2005-04-26 16:07:03Z vgritsenko $
41   */
42  public class SimpleSlopParser implements SlopParser,SlopConstants {
43  
44      private ContentHandler   contentHandler;
45  
46      /** chars that can be part of a field name (other than letters) */
47      private final static String   DEFAULT_TAGNAME_CHARS = "-_";
48      private String   tagnameChars = DEFAULT_TAGNAME_CHARS;
49  
50      /** valid characters in an XML element name (in addition to letters and digits) */
51      final static String   VALID_TAGNAME_CHARS = "_-";
52      final static String   TAGNAME_REPLACEMENT_CHAR = "_";
53  
54      /** optionally preserve whitespace in input */
55      private boolean preserveSpace = false;
56  
57      /** count lines */
58      private int lineCounter;
59  
60      /** result of parsing a line */
61      static class ParsedLine {
62          final String   name;
63          final String   contents;
64  
65          ParsedLine(String   elementName, String   elementContents) {
66              name = filterElementName(elementName);
67              contents = elementContents;
68          }
69      }
70  
71      /** make sure element names are valid XML */
72      static String   filterElementName(String   str) {
73          final StringBuffer   sb = new StringBuffer  ();
74          for(int i=0; i < str.length(); i++) {
75              final char c = str.charAt(i);
76              if(Character.isLetter(c)) {
77                  sb.append(c);
78              } else if(Character.isDigit(c) && i > 0) {
79                  sb.append(c);
80              } else if(VALID_TAGNAME_CHARS.indexOf(c) >= 0) {
81                  sb.append(c);
82              } else {
83                  sb.append(TAGNAME_REPLACEMENT_CHAR);
84              }
85          }
86          return sb.toString();
87      }
88  
89      /** set the list of valid chars for tag names (in addition to letters) */
90      public void setValidTagnameChars(String   str) {
91          tagnameChars = (str == null ? DEFAULT_TAGNAME_CHARS : str.trim());
92      }
93  
94      /** optionally preserve whitespace in input */
95      public void setPreserveWhitespace(boolean b) {
96          preserveSpace = b;
97      }
98  
99      /** must be called before any call to processLine() */
100     public void startDocument(ContentHandler   destination)
101     throws SAXException  , ProcessingException {
102         contentHandler = destination;
103         contentHandler.startDocument();
104         contentHandler.startPrefixMapping("", SLOP_NAMESPACE_URI);
105         contentHandler.startElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT, XMLUtils.EMPTY_ATTRIBUTES);
106     }
107 
108     /** must be called once all calls to processLine() are done */
109     public void endDocument()
110     throws SAXException  , ProcessingException {
111         contentHandler.endElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT);
112         contentHandler.endPrefixMapping("");
113         contentHandler.endDocument();
114         contentHandler = null;
115     }
116 
117     /** add simple name-value attribute to attr */
118     private void setAttribute(AttributesImpl   attr,String   name,String   value) {
119         final String   ATTR_TYPE = "NMTOKEN";
120         attr.addAttribute("",name,name,ATTR_TYPE,value);
121     }
122 
123     /** call this to process input lines, does the actual parsing */
124     public void processLine(String   line)
125     throws SAXException  , ProcessingException {
126         if(contentHandler == null) {
127             throw new ProcessingException("SimpleSlopParser content handler is null (startDocument not called?)");
128         }
129 
130         // find out which element name to use, based on the contents of the line
131         final ParsedLine p = parseLine(line);
132 
133         // generate the element and its contents
134         lineCounter++;
135         final AttributesImpl   atts = new AttributesImpl  ();
136         setAttribute(atts,SLOP_ATTR_LINENUMBER,String.valueOf(lineCounter));
137         contentHandler.startElement(SLOP_NAMESPACE_URI, p.name, p.name, atts);
138         contentHandler.characters(p.contents.toCharArray(),0,p.contents.length());
139         contentHandler.endElement(SLOP_NAMESPACE_URI, p.name, p.name);
140     }
141 
142     /** parse a line, extract element name and contents */
143     protected ParsedLine parseLine(String   line) {
144         ParsedLine result = null;
145 
146         // empty lines
147         if(line == null || line.trim().length()==0) {
148             result = new ParsedLine(SLOP_EMPTY_LINE_ELEMENT,"");
149         }
150 
151         // simple extraction of field names, lines starting with alpha chars followed
152         // by a colon are parsed as follows:
153         //
154         //  input:
155         //      field-name: this line is a field
156         //  output:
157         //      <field-name>this line is a field</field-name>
158         if(result == null) {
159             final int colonPos = line.indexOf(':');
160             if(colonPos > 0) {
161                 boolean fieldFound = true;
162                 for(int i=0; i < colonPos; i++) {
163                     final char c = line.charAt(i);
164                     final boolean isFieldChar = Character.isLetter(c) || tagnameChars.indexOf(c) >= 0;
165                     if(!isFieldChar) {
166                         fieldFound = false;
167                         break;
168                     }
169                 }
170 
171                 if(fieldFound) {
172                     String   contents = "";
173                     if(line.length() > colonPos + 1) {
174                         final String   str = line.substring(colonPos+1);
175                         contents = (preserveSpace ? str : str.trim());
176                     }
177                     result = new ParsedLine(line.substring(0,colonPos),contents);
178                 }
179             }
180         }
181 
182         // default: output a line element
183         if(result == null) {
184             final String   str = (preserveSpace ? line : line.trim());
185             result = new ParsedLine(SLOP_LINE_ELEMENT,str);
186         }
187 
188         return result;
189     }
190 }
191
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags