CompositeTagScanner


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:32 $
// $Revision: 1.89 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
26  
27  package org.htmlparser.scanners;
28  
29  import java.util.Vector  ;
30  
31  import org.htmlparser.Attribute;
32  import org.htmlparser.Node;
33  import org.htmlparser.Tag;
34  import org.htmlparser.lexer.Lexer;
35  import org.htmlparser.lexer.Page;
36  import org.htmlparser.scanners.Scanner;
37  import org.htmlparser.util.NodeList;
38  import org.htmlparser.util.ParserException;
39  
40  /**
41   * The main scanning logic for nested tags.
42   * When asked to scan, this class gathers nodes into a heirarchy of tags.
43   */
44  public class CompositeTagScanner extends TagScanner
45  {
46      /**
47       * Determine whether to use JVM or NodeList stack.
48       * This can be set to true to get the original behaviour of
49       * recursion into composite tags on the JVM stack.
50       * This may lead to StackOverFlowException problems in some cases
51       * i.e. Windows.
52       */
53      private static final boolean mUseJVMStack = false;
54  
55      /**
56       * Determine whether unexpected end tags should cause stack roll-up.
57       * This can be set to true to get the original behaviour of gathering
58       * end tags into whatever tag is open.
59       * This can be expensive, but should only be needed in the presence of
60       * bad HTML.
61       */
62      private static final boolean mLeaveEnds = false;
63  
64      /**
65       * Create a composite tag scanner.
66       */
67      public CompositeTagScanner ()
68      {
69      }
70  
71      /**
72       * Collect the children.
73       * <p>An initial test is performed for an empty XML tag, in which case
74       * the start tag and end tag of the returned tag are the same and it has
75       * no children.<p>
76       * If it's not an empty XML tag, the lexer is repeatedly asked for
77       * subsequent nodes until an end tag is found or a node is encountered
78       * that matches the tag ender set or end tag ender set.
79       * In the latter case, a virtual end tag is created.
80       * Each node found that is not the end tag is added to
81       * the list of children. The end tag is special and not a child.<p>
82       * Nodes that also have a CompositeTagScanner as their scanner are
83       * recursed into, which provides the nested structure of an HTML page.
84       * This method operates in two possible modes, depending on a private boolean.
85       * It can recurse on the JVM stack, which has caused some overflow problems
86       * in the past, or it can use the supplied stack argument to nest scanning
87       * of child tags within itself. The former is left as an option in the code,
88       * mostly to help subsequent modifiers visualize what the internal nesting
89       * is doing.
90       * @param tag The tag this scanner is responsible for.
91       * @param lexer The source of subsequent nodes.
92       * @param stack The parse stack. May contain pending tags that enclose
93       * this tag.
94       * @return The resultant tag (may be unchanged).
95       */
96      public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException
97      {
98          Node node;
99          Tag next;
100         String   name;
101         Scanner scanner;
102         Tag ret;
103         
104         ret = tag;
105 
106         if (ret.isEmptyXmlTag ())
107             ret.setEndTag (ret);
108         else
109             do
110             {
111                 node = lexer.nextNode (false);
112                 if (null != node)
113                 {
114                     if (node instanceof Tag)
115                     {
116                         next = (Tag)node;
117                         name = next.getTagName ();
118                         // check for normal end tag
119                         if (next.isEndTag () && name.equals (ret.getTagName ()))
120                         {
121                             ret.setEndTag (next);
122                             node = null;
123                         }
124                         else if (isTagToBeEndedFor (ret, next)) // check DTD
125                         {
126                             // backup one node. insert a virtual end tag later
127                             lexer.setPosition (next.getStartPosition ());
128                             node = null;
129                         }
130                         else if (!next.isEndTag ())
131                         {
132                             // now recurse if there is a scanner for this type of tag
133                             scanner = next.getThisScanner ();
134                             if (null != scanner)
135                             {
136                                 if (mUseJVMStack)
137                                 {   // JVM stack recursion
138                                     node = scanner.scan (next, lexer, stack);
139                                     addChild (ret, node);
140                                 }
141                                 else
142                                 {
143                                     // fake recursion:
144                                     if (scanner == this)
145                                     {
146                                         if (next.isEmptyXmlTag ())
147                                         {
148                                             next.setEndTag (next);
149                                             finishTag (next, lexer);
150                                             addChild (ret, next);
151                                         }
152                                         else
153                                         {
154                                             stack.add (ret);
155                                             ret = next;
156                                         }
157                                     }
158                                     else
159                                     {   // normal recursion if switching scanners
160                                         node = scanner.scan (next, lexer, stack);
161                                         addChild (ret, node);
162                                     }
163                                 }
164                             }
165                             else
166                                 addChild (ret, next);
167                         }
168                         else
169                         {
170                             if (!mUseJVMStack && !mLeaveEnds)
171                             {
172                                 // Since all non-end tags are consumed by the
173                                 // previous clause, we're here because we have an
174                                 // end tag with no opening tag... this could be bad.
175                                 // There are two cases...
176                                 // 1) The tag hasn't been registered, in which case
177                                 // we just add it as a simple child, like it's
178                                 // opening tag
179                                 // 2) There may be an opening tag further up the
180                                 // parse stack that needs closing.
181                                 // So, we ask the factory for a node like this one
182                                 // (since end tags never have scanners) and see
183                                 // if it's scanner is a composite tag scanner.
184                                 // If it is we walk up the parse stack looking for
185                                 // something that needs this end tag to finish it.
186                                 // If there is something, we close off all the tags
187                                 // walked over and continue on as if nothing
188                                 // happened.
189                                 Vector   attributes = new Vector   ();
190                                 attributes.addElement (new Attribute (name, null));
191                                 Tag opener = lexer.getNodeFactory ().createTagNode (
192                                     lexer.getPage (), next.getStartPosition (), next.getEndPosition (),
193                                     attributes);
194 
195                                 scanner = opener.getThisScanner ();
196                                 if ((null != scanner) && (scanner == this))
197                                 {
198                                     // uh-oh
199                                     int index = -1;
200                                     for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--)
201                                     {
202                                         // short circuit here... assume everything on the stack has this as it's scanner
203                                         // we'll need to stop if either of those conditions isn't met
204                                         Tag boffo = (Tag)stack.elementAt (i);
205                                         if (name.equals (boffo.getTagName ()))
206                                             index = i;
207                                         else if (isTagToBeEndedFor (boffo, next)) // check DTD
208                                             index = i;
209                                     }
210                                     if (-1 != index)
211                                     {
212                                         // finish off the current one first
213                                         finishTag (ret, lexer);
214                                         addChild ((Tag)stack.elementAt (stack.size () - 1), ret);
215                                         for (int i = stack.size () - 1; i > index; i--)
216                                         {
217                                             Tag fred = (Tag)stack.remove (i);
218                                             finishTag (fred, lexer);
219                                             addChild ((Tag)stack.elementAt (i - 1), fred);
220                                         }
221                                         ret = (Tag)stack.remove (index);
222                                         node = null;
223                                     }
224                                     else
225                                         addChild (ret, next); // default behaviour
226                                 }
227                                 else
228                                     addChild (ret, next); // default behaviour
229                             }
230                             else
231                                 addChild (ret, next);
232                         }
233                     }
234                     else
235                         addChild (ret, node);
236                 }
237 
238                 if (!mUseJVMStack)
239                 {
240                     // handle coming out of fake recursion
241                     if (null == node)
242                     {
243                         int depth = stack.size ();
244                         if (0 != depth)
245                         {
246                             node = stack.elementAt (depth - 1);
247                             if (node instanceof Tag)
248                             {
249                                 Tag precursor = (Tag)node;
250                                 scanner = precursor.getThisScanner ();
251                                 if (scanner == this)
252                                 {
253                                     stack.remove (depth - 1);
254                                     finishTag (ret, lexer);
255                                     addChild (precursor, ret);
256                                     ret = precursor;
257                                 }
258                                 else
259                                     node = null; // normal recursion
260                             }
261                             else
262                                 node = null; // normal recursion
263                         }
264                     }
265                 }
266             }
267             while (null != node);
268 
269         finishTag (ret, lexer);
270 
271         return (ret);
272     }
273 
274     /**
275      * Add a child to the given tag.
276      * @param parent The parent tag.
277      * @param child The child node.
278      */
279     protected void addChild (Tag parent, Node child)
280     {
281         if (null == parent.getChildren ())
282             parent.setChildren (new NodeList ());
283         child.setParent (parent);
284         parent.getChildren ().add (child);
285     }
286 
287     /**
288      * Finish off a tag.
289      * Perhap add a virtual end tag.
290      * Set the end tag parent as this tag.
291      * Perform the semantic acton.
292      * @param tag The tag to finish off.
293      * @param lexer A lexer positioned at the end of the tag.
294      */
295     protected void finishTag (Tag tag, Lexer lexer)
296         throws
297             ParserException
298     {
299         if (null == tag.getEndTag ())
300             tag.setEndTag (createVirtualEndTag (tag, lexer, lexer.getPage (), lexer.getCursor ().getPosition ()));
301         tag.getEndTag ().setParent (tag);
302         tag.doSemanticAction ();
303     }
304 
305     /**
306      * Creates an end tag with the same name as the given tag.
307      * @param tag The tag to end.
308      * @param lexer The object containg the node factory.
309      * @param page The page the tag is on (virtually).
310      * @param position The offset into the page at which the tag is to
311      * be anchored.
312      * @return An end tag with the name '"/" + tag.getTagName()' and a start
313      * and end position at the given position. The fact these positions are
314      * equal may be used to distinguish it as a virtual tag later on.
315      */
316     protected Tag createVirtualEndTag (Tag tag, Lexer lexer, Page page, int position)
317         throws
318             ParserException
319     {
320         Tag ret;
321         String   name;
322         Vector   attributes;
323         
324         name = "/" + tag.getRawTagName ();
325         attributes = new Vector   ();
326         attributes.addElement (new Attribute (name, (String  )null));
327         ret = lexer.getNodeFactory ().createTagNode (
328                                     page, position, position, attributes);
329         
330         return (ret);
331     }
332 
333     /**
334      * Determine if the current tag should be terminated by the given tag.
335      * Examines the 'enders' or 'end tag enders' lists of the current tag
336      * for a match with the given tag. Which list is chosen depends on whether
337      * tag is an end tag ('end tag enders') or not ('enders').
338      * @param current The tag that might need to be ended.
339      * @param tag The candidate tag that might end the current one.
340      * @return <code>true</code> if the name of the given tag is a member of
341      * the appropriate list.
342      */
343     public final boolean isTagToBeEndedFor (Tag current, Tag tag)
344     {
345         String   name;
346         String  [] ends;
347         boolean ret;
348 
349         ret = false;
350 
351         name = tag.getTagName ();
352         if (tag.isEndTag ())
353             ends = current.getEndTagEnders ();
354         else
355             ends = current.getEnders ();
356         for (int i = 0; i < ends.length; i++)
357             if (name.equalsIgnoreCase (ends[i]))
358             {
359                 ret = true;
360                 break;
361             }
362         
363         return (ret);
364     }
365 }
366
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags