KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > scanners > CompositeTagScanner


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:32 $
// $Revision: 1.89 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.scanners;

import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.scanners.Scanner;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

40 /**
41  * The main scanning logic for nested tags.
42  * When asked to scan, this class gathers nodes into a heirarchy of tags.
43  */

44 public class CompositeTagScanner extends TagScanner
45 {
46     /**
47      * Determine whether to use JVM or NodeList stack.
48      * This can be set to true to get the original behaviour of
49      * recursion into composite tags on the JVM stack.
50      * This may lead to StackOverFlowException problems in some cases
51      * i.e. Windows.
52      */

53     private static final boolean mUseJVMStack = false;
54
55     /**
56      * Determine whether unexpected end tags should cause stack roll-up.
57      * This can be set to true to get the original behaviour of gathering
58      * end tags into whatever tag is open.
59      * This can be expensive, but should only be needed in the presence of
60      * bad HTML.
61      */

62     private static final boolean mLeaveEnds = false;
63
64     /**
65      * Create a composite tag scanner.
66      */

67     public CompositeTagScanner ()
68     {
69     }
70
71     /**
72      * Collect the children.
73      * <p>An initial test is performed for an empty XML tag, in which case
74      * the start tag and end tag of the returned tag are the same and it has
75      * no children.<p>
76      * If it's not an empty XML tag, the lexer is repeatedly asked for
77      * subsequent nodes until an end tag is found or a node is encountered
78      * that matches the tag ender set or end tag ender set.
79      * In the latter case, a virtual end tag is created.
80      * Each node found that is not the end tag is added to
81      * the list of children. The end tag is special and not a child.<p>
82      * Nodes that also have a CompositeTagScanner as their scanner are
83      * recursed into, which provides the nested structure of an HTML page.
84      * This method operates in two possible modes, depending on a private boolean.
85      * It can recurse on the JVM stack, which has caused some overflow problems
86      * in the past, or it can use the supplied stack argument to nest scanning
87      * of child tags within itself. The former is left as an option in the code,
88      * mostly to help subsequent modifiers visualize what the internal nesting
89      * is doing.
90      * @param tag The tag this scanner is responsible for.
91      * @param lexer The source of subsequent nodes.
92      * @param stack The parse stack. May contain pending tags that enclose
93      * this tag.
94      * @return The resultant tag (may be unchanged).
95      */

96     public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException
97     {
98         Node node;
99         Tag next;
100         String JavaDoc name;
101         Scanner scanner;
102         Tag ret;
103         
104         ret = tag;
105
106         if (ret.isEmptyXmlTag ())
107             ret.setEndTag (ret);
108         else
109             do
110             {
111                 node = lexer.nextNode (false);
112                 if (null != node)
113                 {
114                     if (node instanceof Tag)
115                     {
116                         next = (Tag)node;
117                         name = next.getTagName ();
118                         // check for normal end tag
119
if (next.isEndTag () && name.equals (ret.getTagName ()))
120                         {
121                             ret.setEndTag (next);
122                             node = null;
123                         }
124                         else if (isTagToBeEndedFor (ret, next)) // check DTD
125
{
126                             // backup one node. insert a virtual end tag later
127
lexer.setPosition (next.getStartPosition ());
128                             node = null;
129                         }
130                         else if (!next.isEndTag ())
131                         {
132                             // now recurse if there is a scanner for this type of tag
133
scanner = next.getThisScanner ();
134                             if (null != scanner)
135                             {
136                                 if (mUseJVMStack)
137                                 { // JVM stack recursion
138
node = scanner.scan (next, lexer, stack);
139                                     addChild (ret, node);
140                                 }
141                                 else
142                                 {
143                                     // fake recursion:
144
if (scanner == this)
145                                     {
146                                         if (next.isEmptyXmlTag ())
147                                         {
148                                             next.setEndTag (next);
149                                             finishTag (next, lexer);
150                                             addChild (ret, next);
151                                         }
152                                         else
153                                         {
154                                             stack.add (ret);
155                                             ret = next;
156                                         }
157                                     }
158                                     else
159                                     { // normal recursion if switching scanners
160
node = scanner.scan (next, lexer, stack);
161                                         addChild (ret, node);
162                                     }
163                                 }
164                             }
165                             else
166                                 addChild (ret, next);
167                         }
168                         else
169                         {
170                             if (!mUseJVMStack && !mLeaveEnds)
171                             {
172                                 // Since all non-end tags are consumed by the
173
// previous clause, we're here because we have an
174
// end tag with no opening tag... this could be bad.
175
// There are two cases...
176
// 1) The tag hasn't been registered, in which case
177
// we just add it as a simple child, like it's
178
// opening tag
179
// 2) There may be an opening tag further up the
180
// parse stack that needs closing.
181
// So, we ask the factory for a node like this one
182
// (since end tags never have scanners) and see
183
// if it's scanner is a composite tag scanner.
184
// If it is we walk up the parse stack looking for
185
// something that needs this end tag to finish it.
186
// If there is something, we close off all the tags
187
// walked over and continue on as if nothing
188
// happened.
189
Vector JavaDoc attributes = new Vector JavaDoc ();
190                                 attributes.addElement (new Attribute (name, null));
191                                 Tag opener = lexer.getNodeFactory ().createTagNode (
192                                     lexer.getPage (), next.getStartPosition (), next.getEndPosition (),
193                                     attributes);
194
195                                 scanner = opener.getThisScanner ();
196                                 if ((null != scanner) && (scanner == this))
197                                 {
198                                     // uh-oh
199
int index = -1;
200                                     for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--)
201                                     {
202                                         // short circuit here... assume everything on the stack has this as it's scanner
203
// we'll need to stop if either of those conditions isn't met
204
Tag boffo = (Tag)stack.elementAt (i);
205                                         if (name.equals (boffo.getTagName ()))
206                                             index = i;
207                                         else if (isTagToBeEndedFor (boffo, next)) // check DTD
208
index = i;
209                                     }
210                                     if (-1 != index)
211                                     {
212                                         // finish off the current one first
213
finishTag (ret, lexer);
214                                         addChild ((Tag)stack.elementAt (stack.size () - 1), ret);
215                                         for (int i = stack.size () - 1; i > index; i--)
216                                         {
217                                             Tag fred = (Tag)stack.remove (i);
218                                             finishTag (fred, lexer);
219                                             addChild ((Tag)stack.elementAt (i - 1), fred);
220                                         }
221                                         ret = (Tag)stack.remove (index);
222                                         node = null;
223                                     }
224                                     else
225                                         addChild (ret, next); // default behaviour
226
}
227                                 else
228                                     addChild (ret, next); // default behaviour
229
}
230                             else
231                                 addChild (ret, next);
232                         }
233                     }
234                     else
235                         addChild (ret, node);
236                 }
237
238                 if (!mUseJVMStack)
239                 {
240                     // handle coming out of fake recursion
241
if (null == node)
242                     {
243                         int depth = stack.size ();
244                         if (0 != depth)
245                         {
246                             node = stack.elementAt (depth - 1);
247                             if (node instanceof Tag)
248                             {
249                                 Tag precursor = (Tag)node;
250                                 scanner = precursor.getThisScanner ();
251                                 if (scanner == this)
252                                 {
253                                     stack.remove (depth - 1);
254                                     finishTag (ret, lexer);
255                                     addChild (precursor, ret);
256                                     ret = precursor;
257                                 }
258                                 else
259                                     node = null; // normal recursion
260
}
261                             else
262                                 node = null; // normal recursion
263
}
264                     }
265                 }
266             }
267             while (null != node);
268
269         finishTag (ret, lexer);
270
271         return (ret);
272     }
273
274     /**
275      * Add a child to the given tag.
276      * @param parent The parent tag.
277      * @param child The child node.
278      */

279     protected void addChild (Tag parent, Node child)
280     {
281         if (null == parent.getChildren ())
282             parent.setChildren (new NodeList ());
283         child.setParent (parent);
284         parent.getChildren ().add (child);
285     }
286
287     /**
288      * Finish off a tag.
289      * Perhap add a virtual end tag.
290      * Set the end tag parent as this tag.
291      * Perform the semantic acton.
292      * @param tag The tag to finish off.
293      * @param lexer A lexer positioned at the end of the tag.
294      */

295     protected void finishTag (Tag tag, Lexer lexer)
296         throws
297             ParserException
298     {
299         if (null == tag.getEndTag ())
300             tag.setEndTag (createVirtualEndTag (tag, lexer, lexer.getPage (), lexer.getCursor ().getPosition ()));
301         tag.getEndTag ().setParent (tag);
302         tag.doSemanticAction ();
303     }
304
305     /**
306      * Creates an end tag with the same name as the given tag.
307      * @param tag The tag to end.
308      * @param lexer The object containg the node factory.
309      * @param page The page the tag is on (virtually).
310      * @param position The offset into the page at which the tag is to
311      * be anchored.
312      * @return An end tag with the name '"/" + tag.getTagName()' and a start
313      * and end position at the given position. The fact these positions are
314      * equal may be used to distinguish it as a virtual tag later on.
315      */

316     protected Tag createVirtualEndTag (Tag tag, Lexer lexer, Page page, int position)
317         throws
318             ParserException
319     {
320         Tag ret;
321         String JavaDoc name;
322         Vector JavaDoc attributes;
323         
324         name = "/" + tag.getRawTagName ();
325         attributes = new Vector JavaDoc ();
326         attributes.addElement (new Attribute (name, (String JavaDoc)null));
327         ret = lexer.getNodeFactory ().createTagNode (
328                                     page, position, position, attributes);
329         
330         return (ret);
331     }
332
333     /**
334      * Determine if the current tag should be terminated by the given tag.
335      * Examines the 'enders' or 'end tag enders' lists of the current tag
336      * for a match with the given tag. Which list is chosen depends on whether
337      * tag is an end tag ('end tag enders') or not ('enders').
338      * @param current The tag that might need to be ended.
339      * @param tag The candidate tag that might end the current one.
340      * @return <code>true</code> if the name of the given tag is a member of
341      * the appropriate list.
342      */

343     public final boolean isTagToBeEndedFor (Tag current, Tag tag)
344     {
345         String JavaDoc name;
346         String JavaDoc[] ends;
347         boolean ret;
348
349         ret = false;
350
351         name = tag.getTagName ();
352         if (tag.isEndTag ())
353             ends = current.getEndTagEnders ();
354         else
355             ends = current.getEnders ();
356         for (int i = 0; i < ends.length; i++)
357             if (name.equalsIgnoreCase (ends[i]))
358             {
359                 ret = true;
360                 break;
361             }
362         
363         return (ret);
364     }
365 }
366
Popular Tags