Node


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Joshua Kerievsky
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2004/07/02 00:49:26 $
10  // $Revision: 1.51 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser;
28  
29  import org.htmlparser.lexer.Page;
30  import org.htmlparser.util.NodeList;
31  import org.htmlparser.util.ParserException;
32  import org.htmlparser.visitors.NodeVisitor;
33  
34  /**
35   * Specifies the minimum requirements for nodes returned by the Lexer or Parser.
36   * There are three types of nodes in HTML: text, remarks and tags. You may wish
37   * to define your own nodes to be returned by the
38   * {@link org.htmlparser.lexer.Lexer} or {@link Parser}, but each of the types
39   * must support this interface. 
40   * More specific interface requirements for each of the node types are specified
41   * by the {@link Text}, {@link Remark} and {@link Tag} interfaces.
42   */
43  public interface Node
44      extends
45          Cloneable  
46  {
47      /**
48       * A string representation of the node.
49       * This is an important method; it allows a simple string transformation
50       * of a web page, regardless of the kind of node. For a Text node this is
51       * the textual contents itself. For a Remark node this is the remark
52       * contents (sic). For tags this is the text contents of its children
53       * (if any). Because multiple nodes are combined when presenting
54       * a page in a browser, this will not reflect what a user would see.
55       * See HTML specification section 9.1 White space
56       * <a HREF="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
57       * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.<br>
58       * Typical application code (for extracting only the text from a web page)
59       * would be:<br>
60       * <pre>
61       * for (NodeIterator e = parser.elements (); e.hasMoreNodes ();)
62       *     // or do whatever processing you wish with the plain text string
63       *     System.out.println (e.nextNode ().toPlainTextString ());
64       * </pre>
65       * @return The text of this node including its children.
66       */
67      public abstract String   toPlainTextString ();
68  
69      /**
70       * Return the HTML for this node.
71       * This should be the exact sequence of characters that were encountered by
72       * the parser that caused this node to be created. Where this breaks down is
73       * where broken nodes (tags and remarks) have been encountered and fixed. 
74       * Applications reproducing html can use this method on nodes which are to
75       * be used or transferred as they were received or created.
76       * @return The (exact) sequence of characters that would cause this node
77       * to be returned by the parser or lexer.
78       */
79      public abstract String   toHtml ();
80  
81      /**
82       * Return the string representation of the node.
83       * The return value may not be the entire contents of the node, and non-
84       * printable characters may be translated in order to make them visible. 
85       * This is typically to be used in
86       * the manner<br>
87       * <pre>
88       * System.out.println (node);
89       * </pre>
90       * or within a debugging environment.
91       * @return A string representation of this node suitable for printing,
92       * that isn't too large.
93       */
94      public abstract String   toString ();
95  
96      /**
97       * Collect this node and its child nodes (if applicable) into a list, provided the node
98       * satisfies the filtering criteria.<P>
99       *
100      * This mechanism allows powerful filtering code to be written very easily,
101      * without bothering about collection of embedded tags separately.
102      * e.g. when we try to get all the links on a page, it is not possible to
103      * get it at the top-level, as many tags (like form tags), can contain
104      * links embedded in them. We could get the links out by checking if the
105      * current node is a {@link org.htmlparser.tags.CompositeTag}, and going
106      * through its children. So this method provides a convenient way to do this.<P>
107      *
108      * Using collectInto(), programs get a lot shorter. Now, the code to
109      * extract all links from a page would look like:
110      * <pre>
111      * NodeList list = new NodeList ();
112      * NodeFilter filter = new TagNameFilter ("A");
113      * for (NodeIterator e = parser.elements (); e.hasMoreNodes ();)
114      *      e.nextNode ().collectInto (list, filter);
115      * </pre>
116      * Thus, <code>list</code> will hold all the link nodes, irrespective of how
117      * deep the links are embedded.<P>
118      *
119      * Another way to accomplish the same objective is:
120      * <pre>
121      * NodeList list = new NodeList ();
122      * NodeFilter filter = new TagClassFilter (LinkTag.class);
123      * for (NodeIterator e = parser.elements (); e.hasMoreNodes ();)
124      *      e.nextNode ().collectInto (list, filter);
125      * </pre>
126      * This is slightly less specific because the LinkTag class may be
127      * registered for more than one node name, e.g. &lt;LINK&gt; tags too.
128      * @param list The list to collect nodes into.
129      * @param filter The criteria to use when deciding if a node should
130      * be added to the list.
131      */
132     public abstract void collectInto (NodeList list, NodeFilter filter);
133 
134     /**
135      * Returns the beginning position of the tag.
136      * <br>deprecated Use {@link #getStartPosition}
137      */
138     public abstract int elementBegin ();
139 
140     /**
141      * Returns the ending position of the tag.
142      * <br>deprecated Use {@link #getEndPosition}
143      */
144     public abstract int elementEnd ();
145 
146     /**
147      * Gets the starting position of the node.
148      * This is the character (not byte) offset of this node in the page.
149      * @return The start position.
150      */
151     public abstract int getStartPosition ();
152 
153     /**
154      * Sets the starting position of the node.
155      * @param position The new start position.
156      */
157     public abstract void setStartPosition (int position);
158 
159     /**
160      * Gets the ending position of the node.
161      * This is the character (not byte) offset of the character following this
162      * node in the page.
163      * @return The end position.
164      */
165     public abstract int getEndPosition ();
166 
167     /**
168      * Sets the ending position of the node.
169      * @param position The new end position.
170      */
171     public abstract void setEndPosition (int position);
172 
173     /**
174      * Get the page this node came from.
175      * @return The page that supplied this node.
176      */
177     public Page getPage ();
178 
179     /**
180      * Set the page this node came from.
181      * @param page The page that supplied this node.
182      */
183     public void setPage (Page page);
184     /**
185      * Apply the visitor to this node.
186      * @param visitor The visitor to this node.
187      */
188     public abstract void accept (NodeVisitor visitor);
189 
190     /**
191      * Get the parent of this node.
192      * This will always return null when parsing with the
193      * {@link org.htmlparser.lexer.Lexer}.
194      * Currently, the object returned from this method can be safely cast to a
195      * {@link org.htmlparser.tags.CompositeTag}, but this behaviour should not
196      * be expected in the future.
197      * @return The parent of this node, if it's been set, <code>null</code>
198      * otherwise.
199      */
200     public abstract Node getParent ();
201 
202     /**
203      * Sets the parent of this node.
204      * @param node The node that contains this node.
205      */
206     public abstract void setParent (Node node);
207 
208     /**
209      * Get the children of this node.
210      * @return The list of children contained by this node, if it's been set,
211      * <code>null</code> otherwise.
212      */
213     public abstract NodeList getChildren ();
214 
215     /**
216      * Set the children of this node.
217      * @param children The new list of children this node contains.
218      */
219     public abstract void setChildren (NodeList children);
220 
221     /**
222      * Returns the text of the node.
223      * @return The contents of the string or remark node, and in the case of
224      * a tag, the contents of the tag less the enclosing angle brackets.
225      */
226     public String   getText ();
227 
228     /**
229      * Sets the string contents of the node.
230      * @param text The new text for the node.
231      */
232     public void setText (String   text);
233 
234     /**
235      * Perform the meaning of this tag.
236      * This is defined by the tag, for example the bold tag &lt;B&gt; may switch
237      * bold text on and off.
238      * Only a few tags have semantic meaning to the parser. These have to do
239      * with the character set to use (&lt;META&gt;) and the base URL to use
240      * (&lt;BASE&gt;). Other than that, the semantic meaning is up to the
241      * application and its custom nodes.<br>
242      * The semantic action is performed when the node has been parsed. For
243      * composite nodes (those that contain other nodes), the children will have
244      * already been parsed and will be available via {@link #getChildren}.
245      */
246     public void doSemanticAction ()
247         throws
248             ParserException;
249 
250     //
251     // Cloneable interface
252     //
253 
254     /**
255      * Allow cloning of nodes.
256      * Creates and returns a copy of this object.  The precise meaning 
257      * of "copy" may depend on the class of the object. The general 
258      * intent is that, for any object <tt>x</tt>, the expression:
259      * <blockquote>
260      * <pre>
261      * x.clone() != x</pre></blockquote>
262      * will be true, and that the expression:
263      * <blockquote>
264      * <pre>
265      * x.clone().getClass() == x.getClass()</pre></blockquote>
266      * will be <tt>true</tt>, but these are not absolute requirements. 
267      * While it is typically the case that:
268      * <blockquote>
269      * <pre>
270      * x.clone().equals(x)</pre></blockquote>
271      * will be <tt>true</tt>, this is not an absolute requirement. 
272      * <p>
273      * By convention, the returned object should be obtained by calling
274      * <tt>super.clone</tt>.  If a class and all of its superclasses (except
275      * <tt>Object</tt>) obey this convention, it will be the case that
276      * <tt>x.clone().getClass() == x.getClass()</tt>.
277      * <p>
278      * By convention, the object returned by this method should be independent
279      * of this object (which is being cloned).  To achieve this independence,
280      * it may be necessary to modify one or more fields of the object returned
281      * by <tt>super.clone</tt> before returning it.  Typically, this means
282      * copying any mutable objects that comprise the internal "deep structure"
283      * of the object being cloned and replacing the references to these
284      * objects with references to the copies.  If a class contains only
285      * primitive fields or references to immutable objects, then it is usually
286      * the case that no fields in the object returned by <tt>super.clone</tt>
287      * need to be modified.
288      * <p>
289      * The method <tt>clone</tt> for class <tt>Object</tt> performs a 
290      * specific cloning operation. First, if the class of this object does 
291      * not implement the interface <tt>Cloneable</tt>, then a 
292      * <tt>CloneNotSupportedException</tt> is thrown. Note that all arrays 
293      * are considered to implement the interface <tt>Cloneable</tt>. 
294      * Otherwise, this method creates a new instance of the class of this 
295      * object and initializes all its fields with exactly the contents of 
296      * the corresponding fields of this object, as if by assignment; the
297      * contents of the fields are not themselves cloned. Thus, this method 
298      * performs a "shallow copy" of this object, not a "deep copy" operation.
299      * <p>
300      * The class <tt>Object</tt> does not itself implement the interface 
301      * <tt>Cloneable</tt>, so calling the <tt>clone</tt> method on an object 
302      * whose class is <tt>Object</tt> will result in throwing an
303      * exception at run time.
304      *
305      * @return     a clone of this instance.
306      * @exception  CloneNotSupportedException  if the object's class does not
307      *               support the <code>Cloneable</code> interface. Subclasses
308      *               that override the <code>clone</code> method can also
309      *               throw this exception to indicate that an instance cannot
310      *               be cloned.
311      * @see java.lang.Cloneable
312      */
313     public Object   clone ()
314         throws
315             CloneNotSupportedException  ;
316 }
317
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags