CompositeTag


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:34 $
// $Revision: 1.79 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
26  
27  package org.htmlparser.tags;
28  
29  import java.util.Locale  ;
30  
31  import org.htmlparser.Node;
32  import org.htmlparser.NodeFilter;
33  import org.htmlparser.Text;
34  import org.htmlparser.nodes.AbstractNode;
35  import org.htmlparser.nodes.TagNode;
36  import org.htmlparser.Tag;
37  import org.htmlparser.scanners.CompositeTagScanner;
38  import org.htmlparser.util.NodeList;
39  import org.htmlparser.util.SimpleNodeIterator;
40  import org.htmlparser.visitors.NodeVisitor;
41  
42  /**
43   * The base class for tags that have an end tag.
44   * Provided extra accessors for the children above and beyond what the basic
45   * {@link Tag} provides. Also handles the conversion of it's children for
46   * the {@link #toHtml toHtml} method.
47   */
48  public class CompositeTag extends TagNode
49  {
50      /**
51       * The tag that causes this tag to finish.
52       * May be a virtual tag generated by the scanning logic.
53       */
54      protected Tag mEndTag;
55  
56      /**
57       * The default scanner for non-composite tags.
58       */
59      protected final static CompositeTagScanner mDefaultCompositeScanner = new CompositeTagScanner ();
60  
61      public CompositeTag ()
62      {
63          setThisScanner (mDefaultCompositeScanner);
64      }
65      
66      /**
67       * Get an iterator over the children of this node.
68       * @return Am iterator over the children of this node.
69       */
70      public SimpleNodeIterator children ()
71      {
72          SimpleNodeIterator ret;
73  
74          if (null != getChildren ())
75              ret = getChildren ().elements ();
76          else
77              ret = (new NodeList ()).elements ();
78  
79          return (ret);
80      }
81  
82      /**
83       * Get the child of this node at the given position.
84       * @param index The in the node list of the child.
85       * @return The child at that index.
86       */
87      public Node getChild (int index)
88      {
89          return (
90              (null == getChildren ()) ? null :
91              getChildren ().elementAt (index));
92      }
93  
94      /**
95       * Get the children as an array of <code>Node</code> objects.
96       * @return The children in an array.
97       */
98      public Node [] getChildrenAsNodeArray ()
99      {
100         return (
101             (null == getChildren ()) ? new Node[0] :
102             getChildren ().toNodeArray ());
103     }
104 
105     /**
106      * Remove the child at the position given.
107      * @param i The index of the child to remove.
108      */
109     public void removeChild (int i)
110     {
111         if (null != getChildren ())
112             getChildren ().remove (i);
113     }
114 
115     /**
116      * Return the child tags as an iterator.
117      * Equivalent to calling getChildren ().elements ().
118      * @return An iterator over the children.
119      */
120     public SimpleNodeIterator elements()
121     {
122         return (
123             (null == getChildren ()) ? new NodeList ().elements () :
124             getChildren ().elements ());
125     }
126 
127     public String   toPlainTextString() {
128         StringBuffer   stringRepresentation = new StringBuffer  ();
129         for (SimpleNodeIterator e=children();e.hasMoreNodes();) {
130             stringRepresentation.append(e.nextNode().toPlainTextString());
131         }
132         return stringRepresentation.toString();
133     }
134 
135     protected void putChildrenInto(StringBuffer   sb)
136     {
137         Node node;
138         for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
139         {
140             node = e.nextNode ();
141             // eliminate virtual tags
142 //            if (!(node.getStartPosition () == node.getEndPosition ()))
143                 sb.append (node.toHtml ());
144         }
145     }
146 
147     protected void putEndTagInto(StringBuffer   sb)
148     {
149         // eliminate virtual tags
150 //        if (!(endTag.getStartPosition () == endTag.getEndPosition ()))
151             sb.append(getEndTag ().toHtml());
152     }
153 
154     public String   toHtml() {
155         StringBuffer   sb = new StringBuffer  ();
156         sb.append (super.toHtml ());
157         if (!isEmptyXmlTag())
158         {
159             putChildrenInto(sb);
160             if (null != getEndTag ()) // this test if for link tags that refuse to scan because there's no HREF attribute
161                 putEndTagInto(sb);
162         }
163         return sb.toString();
164     }
165 
166     /**
167      * Searches all children who for a name attribute. Returns first match.
168      * @param name Attribute to match in tag
169      * @return Tag Tag matching the name attribute
170      */
171     public Tag searchByName(String   name) {
172         Node node;
173         Tag tag = null;
174         boolean found = false;
175         for (SimpleNodeIterator e = children();e.hasMoreNodes() && !found;) {
176             node = e.nextNode();
177             if (node instanceof Tag)
178             {
179                 tag = (Tag)node;
180                 String   nameAttribute = tag.getAttribute("NAME");
181                 if (nameAttribute!=null && nameAttribute.equals(name))
182                     found=true;
183             }
184         }
185         if (found)
186             return tag;
187         else
188             return null;
189     }
190 
191     /**
192      * Searches for all nodes whose text representation contains the search string.
193      * Collects all nodes containing the search string into a NodeList.
194      * This search is <b>case-insensitive</b> and the search string and the
195      * node text are converted to uppercase using an English locale.
196      * For example, if you wish to find any textareas in a form tag containing
197      * "hello world", the code would be:
198      * <code>
199      * NodeList nodeList = formTag.searchFor("Hello World");
200      * </code>
201      * @param searchString Search criterion.
202      * @return A collection of nodes whose string contents or
203      * representation have the <code>searchString</code> in them.
204      */
205     public NodeList searchFor (String   searchString)
206     {
207         return (searchFor (searchString, false));
208     }
209 
210     /**
211      * Searches for all nodes whose text representation contains the search string.
212      * Collects all nodes containing the search string into a NodeList.
213      * For example, if you wish to find any textareas in a form tag containing
214      * "hello world", the code would be:
215      * <code>
216      * NodeList nodeList = formTag.searchFor("Hello World");
217      * </code>
218      * @param searchString Search criterion.
219      * @param caseSensitive If <code>true</code> this search should be case
220      * sensitive. Otherwise, the search string and the node text are converted
221      * to uppercase using an English locale.
222      * @return A collection of nodes whose string contents or
223      * representation have the <code>searchString</code> in them.
224      */
225     public NodeList searchFor (String   searchString, boolean caseSensitive)
226     {
227         return (searchFor (searchString, caseSensitive, Locale.ENGLISH));
228     }
229 
230     /**
231      * Searches for all nodes whose text representation contains the search string.
232      * Collects all nodes containing the search string into a NodeList.
233      * For example, if you wish to find any textareas in a form tag containing
234      * "hello world", the code would be:
235      * <code>
236      * NodeList nodeList = formTag.searchFor("Hello World");
237      * </code>
238      * @param searchString Search criterion.
239      * @param caseSensitive If <code>true</code> this search should be case
240      * sensitive. Otherwise, the search string and the node text are converted
241      * to uppercase using the locale provided.
242      * @param locale The locale for uppercase conversion.
243      * @return A collection of nodes whose string contents or
244      * representation have the <code>searchString</code> in them.
245      */
246     public NodeList searchFor (String   searchString, boolean caseSensitive, Locale   locale)
247     {
248         Node node;
249         String   text;
250         NodeList ret;
251         
252         ret = new NodeList ();
253 
254         if (!caseSensitive)
255             searchString = searchString.toUpperCase (locale);
256         for (SimpleNodeIterator e = children (); e.hasMoreNodes (); )
257         {
258             node = e.nextNode ();
259             text = node.toPlainTextString ();
260             if (!caseSensitive)
261                 text = text.toUpperCase (locale);
262             if (-1 != text.indexOf (searchString))
263                 ret.add (node);
264         }
265 
266         return (ret);
267     }
268 
269     /**
270      * Collect all objects that are of a certain type
271      * Note that this will not check for parent types, and will not
272      * recurse through child tags
273      * @param classType The class to search for.
274      * @param recursive If true, recursively search through the children.
275      * @return A list of children found.
276      */
277     public NodeList searchFor (Class   classType, boolean recursive)
278     {
279         return (
280             (null == getChildren ()) ? new NodeList () :
281             getChildren ().searchFor (classType, recursive));
282     }
283 
284     /**
285      * Returns the node number of the first node containing the given text.
286      * This can be useful to index into the composite tag and get other children.
287      * Text is compared without case sensitivity and conversion to uppercase
288      * uses an English locale.
289      * @param text The text to search for.
290      * @return int The node index in the children list of the node containing
291      * the text or -1 if not found.
292      */
293     public int findPositionOf (String   text)
294     {
295         return (findPositionOf (text, Locale.ENGLISH));
296     }
297 
298     /**
299      * Returns the node number of the first node containing the given text.
300      * This can be useful to index into the composite tag and get other children.
301      * Text is compared without case sensitivity and conversion to uppercase
302      * uses the supplied locale.
303      * @param text The text to search for.
304      * @return int The node index in the children list of the node containing
305      * the text or -1 if not found.
306      */
307     public int findPositionOf (String   text, Locale   locale)
308     {
309         Node node;
310         int loc;
311         
312         loc = 0;
313         text = text.toUpperCase (locale);
314         for (SimpleNodeIterator e = children (); e.hasMoreNodes (); )
315         {
316             node = e.nextNode ();
317             if (-1 != node.toPlainTextString ().toUpperCase (locale).indexOf (text))
318                 return loc;
319             loc++;
320         }
321         return -1;
322     }
323 
324     /**
325      * Returns the node number of a child node given the node object.
326      * This would typically be used in conjuction with digUpStringNode,
327      * after which the string node's parent can be used to find the
328      * string node's position. Faster than calling findPositionOf(text)
329      * again. Note that the position is at a linear level alone - there
330      * is no recursion in this method.
331      * @param searchNode The child node to find.
332      * @return The offset of the child tag or -1 if it was not found.
333      */
334     public int findPositionOf(Node searchNode) {
335         Node node;
336         int loc = 0;
337         for (SimpleNodeIterator e=children();e.hasMoreNodes();) {
338             node = e.nextNode();
339             if (node==searchNode) {
340                 return loc;
341             }
342             loc++;
343         }
344         return -1;
345     }
346 
347     /**
348      * Get child at given index
349      * @param index The index into the child node list.
350      * @return Node The child node at the given index or null if none.
351      */
352     public Node childAt (int index)
353     {
354         return (
355             (null == getChildren ()) ? null :
356             getChildren ().elementAt (index));
357     }
358 
359     /**
360      * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node
361      * satisfies the filtering criteria.<P>
362      *
363      * This mechanism allows powerful filtering code to be written very easily,
364      * without bothering about collection of embedded tags separately.
365      * e.g. when we try to get all the links on a page, it is not possible to
366      * get it at the top-level, as many tags (like form tags), can contain
367      * links embedded in them. We could get the links out by checking if the
368      * current node is a {@link CompositeTag}, and going through its children.
369      * So this method provides a convenient way to do this.<P>
370      *
371      * Using collectInto(), programs get a lot shorter. Now, the code to
372      * extract all links from a page would look like:
373      * <pre>
374      * NodeList collectionList = new NodeList();
375      * NodeFilter filter = new TagNameFilter ("A");
376      * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
377      *      e.nextNode().collectInto(collectionList, filter);
378      * </pre>
379      * Thus, collectionList will hold all the link nodes, irrespective of how
380      * deep the links are embedded.<P>
381      *
382      * Another way to accomplish the same objective is:
383      * <pre>
384      * NodeList collectionList = new NodeList();
385      * NodeFilter filter = new TagClassFilter (LinkTag.class);
386      * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
387      *      e.nextNode().collectInto(collectionList, filter);
388      * </pre>
389      * This is slightly less specific because the LinkTag class may be
390      * registered for more than one node name, e.g. &lt;LINK&gt; tags too.
391      */
392     public void collectInto (NodeList list, NodeFilter filter)
393     {
394         super.collectInto (list, filter);
395         for (SimpleNodeIterator e = children(); e.hasMoreNodes ();)
396             e.nextNode ().collectInto (list, filter);
397         if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
398             getEndTag ().collectInto (list, filter);
399     }
400 
401     public String   getChildrenHTML() {
402         StringBuffer   buff = new StringBuffer  ();
403         for (SimpleNodeIterator e = children();e.hasMoreNodes();) {
404             AbstractNode node = (AbstractNode)e.nextNode();
405             buff.append(node.toHtml());
406         }
407         return buff.toString();
408     }
409 
410     /**
411      * Tag visiting code.
412      * Invokes <code>accept()</code> on the start tag and then
413      * walks the child list invoking <code>accept()</code> on each
414      * of the children, finishing up with an <code>accept()</code>
415      * call on the end tag. If <code>shouldRecurseSelf()</code>
416      * returns true it then asks the visitor to visit itself.
417      * @param visitor The <code>NodeVisitor</code> object to be signalled
418      * for each child and possibly this tag.
419      */
420     public void accept (NodeVisitor visitor)
421     {
422         SimpleNodeIterator children;
423         Node child;
424 
425         if (visitor.shouldRecurseSelf ())
426             visitor.visitTag (this);
427         if (visitor.shouldRecurseChildren ())
428         {
429             if (null != getChildren ())
430             {
431                 children = children ();
432                 while (children.hasMoreNodes ())
433                 {
434                     child = children.nextNode ();
435                     child.accept (visitor);
436                 }
437             }
438             if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
439                 getEndTag ().accept (visitor);
440         }
441     }
442 
443     public int getChildCount()
444     {
445         NodeList children;
446         
447         children = getChildren ();
448 
449         return ((null == children) ? 0 : children.size ());
450     }
451 
452     public Tag getEndTag()
453     {
454         return (mEndTag);
455     }
456 
457     public void setEndTag (Tag end)
458     {
459         mEndTag = end;
460     }
461 
462     /**
463      * Finds a text node, however embedded it might be, and returns
464      * it. The text node will retain links to its parents, so
465      * further navigation is possible.
466      * @param searchText
467      * @return The list of text nodes (recursively) found.
468      */
469     public Text[] digupStringNode(String   searchText) {
470         NodeList nodeList = searchFor(searchText);
471         NodeList stringNodes = new NodeList();
472         for (int i=0;i<nodeList.size();i++) {
473             Node node = nodeList.elementAt(i);
474             if (node instanceof Text) {
475                 stringNodes.add(node);
476             } else {
477                 if (node instanceof CompositeTag) {
478                     CompositeTag ctag = (CompositeTag)node;
479                     Text[] nodes = ctag.digupStringNode(searchText);
480                     for (int j=0;j<nodes.length;j++)
481                         stringNodes.add(nodes[j]);
482                 }
483             }
484         }
485         Text[] stringNode = new Text[stringNodes.size()];
486         for (int i=0;i<stringNode.length;i++) {
487             stringNode[i] = (Text)stringNodes.elementAt(i);
488         }
489         return stringNode;
490     }
491 
492     public String   toString ()
493     {
494         StringBuffer   ret;
495         
496         ret = new StringBuffer   (1024);
497         toString (0, ret);
498         
499         return (ret.toString ());
500     }
501 
502     /**
503      * Return the text contained in this tag.
504      * @return The complete contents of the tag (within the angle brackets).
505      */
506     public String   getText ()
507     {
508         String   ret;
509         
510         ret = super.toHtml ();
511         ret = ret.substring (1, ret.length () - 1);
512         
513         return (ret);
514     }
515 
516     /**
517      * Return the text between the start tag and the end tag.
518      * @return The contents of the CompositeTag.
519      */
520     public String   getStringText ()
521     {
522         String   ret;
523         int start = getEndPosition ();
524         int end = mEndTag.getStartPosition ();
525         ret = getPage ().getText (start, end);
526         
527         return (ret);
528     }
529 
530     public void toString (int level, StringBuffer   buffer)
531     {
532         Node node;
533 
534         for (int i = 0; i < level; i++)
535             buffer.append ("  ");
536         buffer.append (super.toString ());
537         buffer.append (System.getProperty ("line.separator"));
538         for (SimpleNodeIterator e = children (); e.hasMoreNodes ();)
539         {
540             node = e.nextNode ();
541             if (node instanceof CompositeTag)
542                 ((CompositeTag)node).toString (level + 1, buffer);
543             else
544             {
545                 for (int i = 0; i <= level; i++)
546                     buffer.append ("  ");
547                 buffer.append (node);
548                 buffer.append (System.getProperty ("line.separator"));
549             }
550         }
551         
552         if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
553             // eliminate virtual tags
554 //            if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ()))
555             {
556                 for (int i = 0; i <= level; i++)
557                     buffer.append ("  ");
558                 buffer.append (getEndTag ().toString ());
559                 buffer.append (System.getProperty ("line.separator"));
560             }
561     }
562 }
563
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags