KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > tags > CompositeTag


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:34 $
// $Revision: 1.79 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.tags;
28
29 import java.util.Locale JavaDoc;
30
31 import org.htmlparser.Node;
32 import org.htmlparser.NodeFilter;
33 import org.htmlparser.Text;
34 import org.htmlparser.nodes.AbstractNode;
35 import org.htmlparser.nodes.TagNode;
36 import org.htmlparser.Tag;
37 import org.htmlparser.scanners.CompositeTagScanner;
38 import org.htmlparser.util.NodeList;
39 import org.htmlparser.util.SimpleNodeIterator;
40 import org.htmlparser.visitors.NodeVisitor;
41
42 /**
43  * The base class for tags that have an end tag.
44  * Provided extra accessors for the children above and beyond what the basic
45  * {@link Tag} provides. Also handles the conversion of it's children for
46  * the {@link #toHtml toHtml} method.
47  */

48 public class CompositeTag extends TagNode
49 {
50     /**
51      * The tag that causes this tag to finish.
52      * May be a virtual tag generated by the scanning logic.
53      */

54     protected Tag mEndTag;
55
56     /**
57      * The default scanner for non-composite tags.
58      */

59     protected final static CompositeTagScanner mDefaultCompositeScanner = new CompositeTagScanner ();
60
61     public CompositeTag ()
62     {
63         setThisScanner (mDefaultCompositeScanner);
64     }
65     
66     /**
67      * Get an iterator over the children of this node.
68      * @return Am iterator over the children of this node.
69      */

70     public SimpleNodeIterator children ()
71     {
72         SimpleNodeIterator ret;
73
74         if (null != getChildren ())
75             ret = getChildren ().elements ();
76         else
77             ret = (new NodeList ()).elements ();
78
79         return (ret);
80     }
81
82     /**
83      * Get the child of this node at the given position.
84      * @param index The in the node list of the child.
85      * @return The child at that index.
86      */

87     public Node getChild (int index)
88     {
89         return (
90             (null == getChildren ()) ? null :
91             getChildren ().elementAt (index));
92     }
93
94     /**
95      * Get the children as an array of <code>Node</code> objects.
96      * @return The children in an array.
97      */

98     public Node [] getChildrenAsNodeArray ()
99     {
100         return (
101             (null == getChildren ()) ? new Node[0] :
102             getChildren ().toNodeArray ());
103     }
104
105     /**
106      * Remove the child at the position given.
107      * @param i The index of the child to remove.
108      */

109     public void removeChild (int i)
110     {
111         if (null != getChildren ())
112             getChildren ().remove (i);
113     }
114
115     /**
116      * Return the child tags as an iterator.
117      * Equivalent to calling getChildren ().elements ().
118      * @return An iterator over the children.
119      */

120     public SimpleNodeIterator elements()
121     {
122         return (
123             (null == getChildren ()) ? new NodeList ().elements () :
124             getChildren ().elements ());
125     }
126
127     public String JavaDoc toPlainTextString() {
128         StringBuffer JavaDoc stringRepresentation = new StringBuffer JavaDoc();
129         for (SimpleNodeIterator e=children();e.hasMoreNodes();) {
130             stringRepresentation.append(e.nextNode().toPlainTextString());
131         }
132         return stringRepresentation.toString();
133     }
134
135     protected void putChildrenInto(StringBuffer JavaDoc sb)
136     {
137         Node node;
138         for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
139         {
140             node = e.nextNode ();
141             // eliminate virtual tags
142
// if (!(node.getStartPosition () == node.getEndPosition ()))
143
sb.append (node.toHtml ());
144         }
145     }
146
147     protected void putEndTagInto(StringBuffer JavaDoc sb)
148     {
149         // eliminate virtual tags
150
// if (!(endTag.getStartPosition () == endTag.getEndPosition ()))
151
sb.append(getEndTag ().toHtml());
152     }
153
154     public String JavaDoc toHtml() {
155         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
156         sb.append (super.toHtml ());
157         if (!isEmptyXmlTag())
158         {
159             putChildrenInto(sb);
160             if (null != getEndTag ()) // this test if for link tags that refuse to scan because there's no HREF attribute
161
putEndTagInto(sb);
162         }
163         return sb.toString();
164     }
165
166     /**
167      * Searches all children who for a name attribute. Returns first match.
168      * @param name Attribute to match in tag
169      * @return Tag Tag matching the name attribute
170      */

171     public Tag searchByName(String JavaDoc name) {
172         Node node;
173         Tag tag = null;
174         boolean found = false;
175         for (SimpleNodeIterator e = children();e.hasMoreNodes() && !found;) {
176             node = e.nextNode();
177             if (node instanceof Tag)
178             {
179                 tag = (Tag)node;
180                 String JavaDoc nameAttribute = tag.getAttribute("NAME");
181                 if (nameAttribute!=null && nameAttribute.equals(name))
182                     found=true;
183             }
184         }
185         if (found)
186             return tag;
187         else
188             return null;
189     }
190
191     /**
192      * Searches for all nodes whose text representation contains the search string.
193      * Collects all nodes containing the search string into a NodeList.
194      * This search is <b>case-insensitive</b> and the search string and the
195      * node text are converted to uppercase using an English locale.
196      * For example, if you wish to find any textareas in a form tag containing
197      * "hello world", the code would be:
198      * <code>
199      * NodeList nodeList = formTag.searchFor("Hello World");
200      * </code>
201      * @param searchString Search criterion.
202      * @return A collection of nodes whose string contents or
203      * representation have the <code>searchString</code> in them.
204      */

205     public NodeList searchFor (String JavaDoc searchString)
206     {
207         return (searchFor (searchString, false));
208     }
209
210     /**
211      * Searches for all nodes whose text representation contains the search string.
212      * Collects all nodes containing the search string into a NodeList.
213      * For example, if you wish to find any textareas in a form tag containing
214      * "hello world", the code would be:
215      * <code>
216      * NodeList nodeList = formTag.searchFor("Hello World");
217      * </code>
218      * @param searchString Search criterion.
219      * @param caseSensitive If <code>true</code> this search should be case
220      * sensitive. Otherwise, the search string and the node text are converted
221      * to uppercase using an English locale.
222      * @return A collection of nodes whose string contents or
223      * representation have the <code>searchString</code> in them.
224      */

225     public NodeList searchFor (String JavaDoc searchString, boolean caseSensitive)
226     {
227         return (searchFor (searchString, caseSensitive, Locale.ENGLISH));
228     }
229
230     /**
231      * Searches for all nodes whose text representation contains the search string.
232      * Collects all nodes containing the search string into a NodeList.
233      * For example, if you wish to find any textareas in a form tag containing
234      * "hello world", the code would be:
235      * <code>
236      * NodeList nodeList = formTag.searchFor("Hello World");
237      * </code>
238      * @param searchString Search criterion.
239      * @param caseSensitive If <code>true</code> this search should be case
240      * sensitive. Otherwise, the search string and the node text are converted
241      * to uppercase using the locale provided.
242      * @param locale The locale for uppercase conversion.
243      * @return A collection of nodes whose string contents or
244      * representation have the <code>searchString</code> in them.
245      */

246     public NodeList searchFor (String JavaDoc searchString, boolean caseSensitive, Locale JavaDoc locale)
247     {
248         Node node;
249         String JavaDoc text;
250         NodeList ret;
251         
252         ret = new NodeList ();
253
254         if (!caseSensitive)
255             searchString = searchString.toUpperCase (locale);
256         for (SimpleNodeIterator e = children (); e.hasMoreNodes (); )
257         {
258             node = e.nextNode ();
259             text = node.toPlainTextString ();
260             if (!caseSensitive)
261                 text = text.toUpperCase (locale);
262             if (-1 != text.indexOf (searchString))
263                 ret.add (node);
264         }
265
266         return (ret);
267     }
268
269     /**
270      * Collect all objects that are of a certain type
271      * Note that this will not check for parent types, and will not
272      * recurse through child tags
273      * @param classType The class to search for.
274      * @param recursive If true, recursively search through the children.
275      * @return A list of children found.
276      */

277     public NodeList searchFor (Class JavaDoc classType, boolean recursive)
278     {
279         return (
280             (null == getChildren ()) ? new NodeList () :
281             getChildren ().searchFor (classType, recursive));
282     }
283
284     /**
285      * Returns the node number of the first node containing the given text.
286      * This can be useful to index into the composite tag and get other children.
287      * Text is compared without case sensitivity and conversion to uppercase
288      * uses an English locale.
289      * @param text The text to search for.
290      * @return int The node index in the children list of the node containing
291      * the text or -1 if not found.
292      */

293     public int findPositionOf (String JavaDoc text)
294     {
295         return (findPositionOf (text, Locale.ENGLISH));
296     }
297
298     /**
299      * Returns the node number of the first node containing the given text.
300      * This can be useful to index into the composite tag and get other children.
301      * Text is compared without case sensitivity and conversion to uppercase
302      * uses the supplied locale.
303      * @param text The text to search for.
304      * @return int The node index in the children list of the node containing
305      * the text or -1 if not found.
306      */

307     public int findPositionOf (String JavaDoc text, Locale JavaDoc locale)
308     {
309         Node node;
310         int loc;
311         
312         loc = 0;
313         text = text.toUpperCase (locale);
314         for (SimpleNodeIterator e = children (); e.hasMoreNodes (); )
315         {
316             node = e.nextNode ();
317             if (-1 != node.toPlainTextString ().toUpperCase (locale).indexOf (text))
318                 return loc;
319             loc++;
320         }
321         return -1;
322     }
323
324     /**
325      * Returns the node number of a child node given the node object.
326      * This would typically be used in conjuction with digUpStringNode,
327      * after which the string node's parent can be used to find the
328      * string node's position. Faster than calling findPositionOf(text)
329      * again. Note that the position is at a linear level alone - there
330      * is no recursion in this method.
331      * @param searchNode The child node to find.
332      * @return The offset of the child tag or -1 if it was not found.
333      */

334     public int findPositionOf(Node searchNode) {
335         Node node;
336         int loc = 0;
337         for (SimpleNodeIterator e=children();e.hasMoreNodes();) {
338             node = e.nextNode();
339             if (node==searchNode) {
340                 return loc;
341             }
342             loc++;
343         }
344         return -1;
345     }
346
347     /**
348      * Get child at given index
349      * @param index The index into the child node list.
350      * @return Node The child node at the given index or null if none.
351      */

352     public Node childAt (int index)
353     {
354         return (
355             (null == getChildren ()) ? null :
356             getChildren ().elementAt (index));
357     }
358
359     /**
360      * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node
361      * satisfies the filtering criteria.<P>
362      *
363      * This mechanism allows powerful filtering code to be written very easily,
364      * without bothering about collection of embedded tags separately.
365      * e.g. when we try to get all the links on a page, it is not possible to
366      * get it at the top-level, as many tags (like form tags), can contain
367      * links embedded in them. We could get the links out by checking if the
368      * current node is a {@link CompositeTag}, and going through its children.
369      * So this method provides a convenient way to do this.<P>
370      *
371      * Using collectInto(), programs get a lot shorter. Now, the code to
372      * extract all links from a page would look like:
373      * <pre>
374      * NodeList collectionList = new NodeList();
375      * NodeFilter filter = new TagNameFilter ("A");
376      * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
377      * e.nextNode().collectInto(collectionList, filter);
378      * </pre>
379      * Thus, collectionList will hold all the link nodes, irrespective of how
380      * deep the links are embedded.<P>
381      *
382      * Another way to accomplish the same objective is:
383      * <pre>
384      * NodeList collectionList = new NodeList();
385      * NodeFilter filter = new TagClassFilter (LinkTag.class);
386      * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
387      * e.nextNode().collectInto(collectionList, filter);
388      * </pre>
389      * This is slightly less specific because the LinkTag class may be
390      * registered for more than one node name, e.g. &lt;LINK&gt; tags too.
391      */

392     public void collectInto (NodeList list, NodeFilter filter)
393     {
394         super.collectInto (list, filter);
395         for (SimpleNodeIterator e = children(); e.hasMoreNodes ();)
396             e.nextNode ().collectInto (list, filter);
397         if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
398
getEndTag ().collectInto (list, filter);
399     }
400
401     public String JavaDoc getChildrenHTML() {
402         StringBuffer JavaDoc buff = new StringBuffer JavaDoc();
403         for (SimpleNodeIterator e = children();e.hasMoreNodes();) {
404             AbstractNode node = (AbstractNode)e.nextNode();
405             buff.append(node.toHtml());
406         }
407         return buff.toString();
408     }
409
410     /**
411      * Tag visiting code.
412      * Invokes <code>accept()</code> on the start tag and then
413      * walks the child list invoking <code>accept()</code> on each
414      * of the children, finishing up with an <code>accept()</code>
415      * call on the end tag. If <code>shouldRecurseSelf()</code>
416      * returns true it then asks the visitor to visit itself.
417      * @param visitor The <code>NodeVisitor</code> object to be signalled
418      * for each child and possibly this tag.
419      */

420     public void accept (NodeVisitor visitor)
421     {
422         SimpleNodeIterator children;
423         Node child;
424
425         if (visitor.shouldRecurseSelf ())
426             visitor.visitTag (this);
427         if (visitor.shouldRecurseChildren ())
428         {
429             if (null != getChildren ())
430             {
431                 children = children ();
432                 while (children.hasMoreNodes ())
433                 {
434                     child = children.nextNode ();
435                     child.accept (visitor);
436                 }
437             }
438             if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
439
getEndTag ().accept (visitor);
440         }
441     }
442
443     public int getChildCount()
444     {
445         NodeList children;
446         
447         children = getChildren ();
448
449         return ((null == children) ? 0 : children.size ());
450     }
451
452     public Tag getEndTag()
453     {
454         return (mEndTag);
455     }
456
457     public void setEndTag (Tag end)
458     {
459         mEndTag = end;
460     }
461
462     /**
463      * Finds a text node, however embedded it might be, and returns
464      * it. The text node will retain links to its parents, so
465      * further navigation is possible.
466      * @param searchText
467      * @return The list of text nodes (recursively) found.
468      */

469     public Text[] digupStringNode(String JavaDoc searchText) {
470         NodeList nodeList = searchFor(searchText);
471         NodeList stringNodes = new NodeList();
472         for (int i=0;i<nodeList.size();i++) {
473             Node node = nodeList.elementAt(i);
474             if (node instanceof Text) {
475                 stringNodes.add(node);
476             } else {
477                 if (node instanceof CompositeTag) {
478                     CompositeTag ctag = (CompositeTag)node;
479                     Text[] nodes = ctag.digupStringNode(searchText);
480                     for (int j=0;j<nodes.length;j++)
481                         stringNodes.add(nodes[j]);
482                 }
483             }
484         }
485         Text[] stringNode = new Text[stringNodes.size()];
486         for (int i=0;i<stringNode.length;i++) {
487             stringNode[i] = (Text)stringNodes.elementAt(i);
488         }
489         return stringNode;
490     }
491
492     public String JavaDoc toString ()
493     {
494         StringBuffer JavaDoc ret;
495         
496         ret = new StringBuffer JavaDoc (1024);
497         toString (0, ret);
498         
499         return (ret.toString ());
500     }
501
502     /**
503      * Return the text contained in this tag.
504      * @return The complete contents of the tag (within the angle brackets).
505      */

506     public String JavaDoc getText ()
507     {
508         String JavaDoc ret;
509         
510         ret = super.toHtml ();
511         ret = ret.substring (1, ret.length () - 1);
512         
513         return (ret);
514     }
515
516     /**
517      * Return the text between the start tag and the end tag.
518      * @return The contents of the CompositeTag.
519      */

520     public String JavaDoc getStringText ()
521     {
522         String JavaDoc ret;
523         int start = getEndPosition ();
524         int end = mEndTag.getStartPosition ();
525         ret = getPage ().getText (start, end);
526         
527         return (ret);
528     }
529
530     public void toString (int level, StringBuffer JavaDoc buffer)
531     {
532         Node node;
533
534         for (int i = 0; i < level; i++)
535             buffer.append (" ");
536         buffer.append (super.toString ());
537         buffer.append (System.getProperty ("line.separator"));
538         for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
539         {
540             node = e.nextNode ();
541             if (node instanceof CompositeTag)
542                 ((CompositeTag)node).toString (level + 1, buffer);
543             else
544             {
545                 for (int i = 0; i <= level; i++)
546                     buffer.append (" ");
547                 buffer.append (node);
548                 buffer.append (System.getProperty ("line.separator"));
549             }
550         }
551         
552         if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
553
// eliminate virtual tags
554
// if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ()))
555
{
556                 for (int i = 0; i <= level; i++)
557                     buffer.append (" ");
558                 buffer.append (getEndTag ().toString ());
559                 buffer.append (System.getProperty ("line.separator"));
560             }
561     }
562 }
563
Popular Tags