ScriptScanner


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2003 Somik Raha
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/03/12 17:53:10 $
10  // $Revision: 1.63 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.scanners;
28  
29  import java.util.Vector  ;
30  
31  import org.htmlparser.Attribute;
32  import org.htmlparser.Node;
33  import org.htmlparser.NodeFactory;
34  import org.htmlparser.PrototypicalNodeFactory;
35  import org.htmlparser.Remark;
36  import org.htmlparser.Tag;
37  import org.htmlparser.Text;
38  import org.htmlparser.lexer.Cursor;
39  import org.htmlparser.lexer.Lexer;
40  import org.htmlparser.lexer.Page;
41  import org.htmlparser.scanners.ScriptDecoder;
42  import org.htmlparser.tags.ScriptTag;
43  import org.htmlparser.util.NodeList;
44  import org.htmlparser.util.ParserException;
45  
46  /**
47   * The ScriptScanner handles script CDATA.
48   */
49  public class ScriptScanner
50      extends
51          CompositeTagScanner
52  {
53      /**
54       * Strict parsing of CDATA flag.
55       * If this flag is set true, the parsing of script is performed without
56       * regard to quotes. This means that erroneous script such as:
57       * <pre>
58       * document.write("&lt;/script&gt");
59       * </pre>
60       * will be parsed in strict accordance with appendix
61       * <a HREF="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
62       * B.3.2 Specifying non-HTML data</a> of the
63       * <a HREF="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a> and
64       * hence will be split into two or more nodes. Correct javascript would
65       * escape the ETAGO:
66       * <pre>
67       * document.write("&lt;\/script&gt");
68       * </pre>
69       * If true, CDATA parsing will stop at the first ETAGO ("&lt;/") no matter
70       * whether it is quoted or not. If false, balanced quotes (either single or
71       * double) will shield an ETAGO. Beacuse of the possibility of quotes within
72       * single or multiline comments, these are also parsed. In most cases,
73       * users prefer non-strict handling since there is so much broken script
74       * out in the wild.
75       */
76      public static boolean STRICT = false;
77  
78      /**
79       * Create a script scanner.
80       */
81      public ScriptScanner()
82      {
83      }
84  
85      /**
86       * Scan for script.
87       * Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
88       * @param tag The tag this scanner is responsible for.
89       * @param lexer The source of CDATA.
90       * @param stack The parse stack, <em>not used</em>.
91       */
92      public Tag scan (Tag tag, Lexer lexer, NodeList stack)
93          throws ParserException
94      {
95          String   language;
96          String   code;
97          Node content;
98          int position;
99          Node node;
100         Attribute attribute;
101         Vector   vector;
102 
103         if (tag instanceof ScriptTag)
104         {
105             language = ((ScriptTag)tag).getLanguage ();
106             if ((null != language) &&
107                 (language.equalsIgnoreCase ("JScript.Encode") ||
108                  language.equalsIgnoreCase ("VBScript.Encode")))
109             {
110                 code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ());
111                 ((ScriptTag)tag).setScriptCode (code);
112             }
113         }
114         content = lexer.parseCDATA (!STRICT);
115         position = lexer.getPosition ();
116         node = lexer.nextNode (false);
117         if (null != node)
118             if (!(node instanceof Tag) || !(   ((Tag)node).isEndTag ()
119                 && ((Tag)node).getTagName ().equals (tag.getIds ()[0])))
120             {
121                 lexer.setPosition (position);
122                 node = null;
123             }
124 
125         // build new end tag if required
126         if (null == node)
127         {
128             attribute = new Attribute ("/script", null);
129             vector = new Vector   ();
130             vector.addElement (attribute);
131             node = lexer.getNodeFactory ().createTagNode (
132                 lexer.getPage (), position, position, vector);
133         }
134         tag.setEndTag ((Tag)node);
135         if (null != content)
136         {
137             tag.setChildren (new NodeList (content));
138             content.setParent (tag);
139         }
140         node.setParent (tag);
141         tag.doSemanticAction ();
142 
143         return (tag);
144     }
145 }
146
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags