KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > NodeReader


// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/NodeReader.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.

33
34 package org.htmlparser;
35
36 //////////////////
37
// Java Imports //
38
//////////////////
39
import java.io.BufferedReader JavaDoc;
40 import java.io.IOException JavaDoc;
41 import java.io.PrintWriter JavaDoc;
42 import java.io.Reader JavaDoc;
43 import java.io.StringWriter JavaDoc;
44
45 import org.htmlparser.parserHelper.StringParser;
46 import org.htmlparser.scanners.TagScanner;
47 import org.htmlparser.tags.EndTag;
48 import org.htmlparser.tags.Tag;
49 import org.htmlparser.util.NodeList;
50 import org.htmlparser.util.ParserException;
51
52 /**
53  * NodeReader builds on the BufferedReader, providing methods to read one element
54  * at a time
55  */

56 public class NodeReader extends BufferedReader JavaDoc
57 {
58     public static final String JavaDoc DECIPHER_ERROR =
59         "NodeReader.readElement() : Error occurred while trying to decipher the tag using scanners";
60     protected int posInLine = -1;
61     protected String JavaDoc line;
62     protected Node node = null;
63     protected TagScanner previousOpenScanner = null;
64     protected String JavaDoc url;
65     private Parser parser;
66     private int lineCount;
67     private String JavaDoc previousLine;
68     private StringParser stringParser = new StringParser();
69     private RemarkNodeParser remarkNodeParser = new RemarkNodeParser();
70     private NodeList nextParsedNode = new NodeList();
71     private boolean dontReadNextLine = false;
72     /**
73      * The constructor takes in a reader object, it's length and the url to be read.
74      */

75     public NodeReader(Reader JavaDoc in, int len, String JavaDoc url)
76     {
77         super(in, len);
78         this.url = url;
79         this.parser = null;
80         this.lineCount = 1;
81     }
82     /**
83      * This constructor basically overrides the existing constructor in the
84      * BufferedReader class.
85      * The URL defaults to an empty string.
86      * @see #NodeReader(Reader,int,String)
87      */

88
89     public NodeReader(Reader JavaDoc in, int len)
90     {
91         this(in, len, "");
92     }
93     /**
94      * The constructor takes in a reader object, and the url to be read.
95      * The buffer size defaults to 8192.
96      * @see #NodeReader(Reader,int,String)
97      */

98     public NodeReader(Reader JavaDoc in, String JavaDoc url)
99     {
100         this(in, 8192, url);
101     }
102
103     /**
104      * Get the url for this reader.
105      * @return The url specified in the constructor;
106      */

107     public String JavaDoc getURL()
108     {
109         return (url);
110     }
111
112     /**
113      * This method is intended to be called only by scanners, when a situation of dirty html has arisen,
114      * and action has been taken to correct the parsed tags. For e.g. if we have html of the form :
115      * <pre>
116      * <a HREF="somelink.html"><img SRC=...><td><tr><a HREF="someotherlink.html">...</a>
117      * </pre>
118      * Now to salvage the first link, we'd probably like to insert an end tag somewhere (typically before the
119      * second begin link tag). So that the parsing continues uninterrupted, we will need to change the existing
120      * line being parsed, to contain the end tag in it.
121      */

122     public void changeLine(String JavaDoc line)
123     {
124         this.line = line;
125     }
126     public String JavaDoc getCurrentLine()
127     {
128         return line;
129     }
130     /**
131      * Get the last line number that the reader has read
132      * @return int last line number read by the reader
133      */

134     public int getLastLineNumber()
135     {
136         return lineCount - 1;
137     }
138
139     /**
140      * This method is useful when designing your own scanners. You might need to find out what is the location where the
141      * reader has stopped last.
142      * @return int Last position read by the reader
143      */

144     public int getLastReadPosition()
145     {
146         if (node != null)
147             return node.elementEnd();
148         else
149             return 0;
150     }
151
152     /*
153      * Read the next line
154      * @return String containing the line
155      */

156     public String JavaDoc getNextLine()
157     {
158         try
159         {
160             previousLine = line;
161             line = readLine();
162             if (line != null)
163                 lineCount++;
164             posInLine = 0;
165             return line;
166         }
167         catch (IOException JavaDoc e)
168         {
169             System.err.println("I/O Exception occurred while reading!");
170         }
171         return null;
172     }
173     /**
174      * Returns the parser object for which this reader exists
175      * @return org.htmlparser.Parser
176      */

177     public Parser getParser()
178     {
179         return parser;
180     }
181     /**
182      * Gets the previousOpenScanner.
183      * @return Returns a TagScanner
184      */

185     public TagScanner getPreviousOpenScanner()
186     {
187         return previousOpenScanner;
188     }
189
190     /**
191      * Returns true if the text at <code>pos</code> in <code>line</code> should be scanned as a tag.
192      * Basically an open angle followed by a known special character or a letter.
193      * @param line The current line being parsed.
194      * @param pos The position in the line to examine.
195      * @return <code>true</code> if we think this is the start of a tag.
196      */

197     private boolean beginTag(String JavaDoc line, int pos)
198     {
199         char ch;
200         boolean ret;
201
202         ret = false;
203
204         if (pos + 2 <= line.length())
205             if ('<' == line.charAt(pos))
206             {
207                 ch = line.charAt(pos + 1);
208                 // the order of these tests might be optimized for speed
209
if ('/' == ch
210                     || '%' == ch
211                     || Character.isLetter(ch)
212                     || '!' == ch)
213                     ret = true;
214             }
215
216         return (ret);
217     }
218
219     /**
220      * Read the next element
221      * @return Node - The next node
222          */

223     public Node readElement() throws ParserException
224     {
225         return (readElement(false));
226     }
227
228     /**
229      * Read the next element
230      * @param balance_quotes If <code>true</code> string nodes are parsed
231      * paying attention to single and double quotes, such that tag-like
232      * strings are ignored if they are quoted.
233      * @return Node - The next node
234          */

235     public Node readElement(boolean balance_quotes) throws ParserException
236     {
237         try
238         {
239             if (nextParsedNode.size() > 0)
240             {
241                 node = nextParsedNode.elementAt(0);
242                 nextParsedNode.remove(0);
243                 return node;
244             }
245             if (readNextLine())
246             {
247                 do
248                 {
249                     line = getNextLine();
250                 }
251                 while (line != null && line.length() == 0);
252
253             }
254             else if (dontReadNextLine)
255             {
256                 dontReadNextLine = false;
257             }
258             else
259                 posInLine = getLastReadPosition() + 1;
260             if (line == null)
261                 return null;
262
263             if (beginTag(line, posInLine))
264             {
265                 node = remarkNodeParser.find(this, line, posInLine);
266                 if (node != null)
267                     return node;
268                 node = Tag.find(this, line, posInLine);
269                 if (node != null)
270                 {
271                     Tag tag = (Tag) node;
272                     try
273                     {
274                         node = tag.scan(parser.getScanners(), url, this);
275                         return node;
276                     }
277                     catch (Exception JavaDoc e)
278                     {
279                         StringBuffer JavaDoc msgBuffer = new StringBuffer JavaDoc();
280                         msgBuffer.append(
281                             DECIPHER_ERROR
282                                 + "\n"
283                                 + " Tag being processed : "
284                                 + tag.getTagName()
285                                 + "\n"
286                                 + " Current Tag Line : "
287                                 + tag.getTagLine());
288                         appendLineDetails(msgBuffer);
289                         ParserException ex =
290                             new ParserException(msgBuffer.toString(), e);
291
292                         parser.getFeedback().error(msgBuffer.toString(), ex);
293                         throw ex;
294                     }
295                 }
296
297                 node = EndTag.find(line, posInLine);
298                 if (node != null)
299                     return node;
300             }
301             else
302             {
303                 node = stringParser.find(this, line, posInLine, balance_quotes);
304                 if (node != null)
305                     return node;
306             }
307
308             return null;
309         }
310         catch (ParserException pe)
311         {
312             throw pe;
313         }
314         catch (Exception JavaDoc e)
315         {
316             StringBuffer JavaDoc msgBuffer =
317                 new StringBuffer JavaDoc("NodeReader.readElement() : Error occurred while trying to read the next element,");
318             StringWriter JavaDoc sw = new StringWriter JavaDoc();
319             e.printStackTrace(new PrintWriter JavaDoc(sw));
320             appendLineDetails(msgBuffer);
321             msgBuffer.append("\n Caused by:\n").append(
322                 sw.getBuffer().toString());
323             ParserException ex = new ParserException(msgBuffer.toString(), e);
324             parser.getFeedback().error(msgBuffer.toString(), ex);
325             throw ex;
326         }
327     }
328     public void appendLineDetails(StringBuffer JavaDoc msgBuffer)
329     {
330         msgBuffer.append("\nat Line ");
331         msgBuffer.append(getLineCount());
332         msgBuffer.append(" : ");
333         msgBuffer.append(getLine());
334         msgBuffer.append("\nPrevious Line ").append(getLineCount() - 1);
335         msgBuffer.append(" : ").append(getPreviousLine());
336     }
337     /**
338      * Do we need to read the next line ?
339      * @return true - yes/ false - no
340      */

341     protected boolean readNextLine()
342     {
343         if (dontReadNextLine)
344         {
345             return false;
346         }
347         if (posInLine == -1
348             || (line != null && node.elementEnd() + 1 >= line.length()))
349             return true;
350         else
351             return false;
352     }
353     /**
354      * The setParser method is used by the parser to put its own object into the reader. This happens internally,
355      * so this method is not generally for use by the developer or the user.
356      */

357     public void setParser(Parser newParser)
358     {
359         parser = newParser;
360     }
361     /**
362      * Sets the previousOpenScanner.
363      * @param previousOpenScanner The previousOpenScanner to set
364      */

365     public void setPreviousOpenScanner(TagScanner previousOpenScanner)
366     {
367         this.previousOpenScanner = previousOpenScanner;
368     }
369
370     /**
371      * @param lineSeparator New Line separator to be used
372      */

373     public static void setLineSeparator(String JavaDoc lineSeparator)
374     {
375         Node.setLineSeparator(lineSeparator);
376     }
377
378     /**
379      * Gets the line seperator that is being used
380      * @return String
381      */

382     public static String JavaDoc getLineSeparator()
383     {
384         return (Node.getLineSeparator());
385     }
386     /**
387      * Returns the lineCount.
388      * @return int
389      */

390     public int getLineCount()
391     {
392         return lineCount;
393     }
394
395     /**
396      * Returns the previousLine.
397      * @return String
398      */

399     public String JavaDoc getPreviousLine()
400     {
401         return previousLine;
402     }
403
404     /**
405      * Returns the line.
406      * @return String
407      */

408     public String JavaDoc getLine()
409     {
410         return line;
411     }
412
413     /**
414      * Sets the lineCount.
415      * @param lineCount The lineCount to set
416      */

417     public void setLineCount(int lineCount)
418     {
419         this.lineCount = lineCount;
420     }
421
422     /**
423      * Sets the posInLine.
424      * @param posInLine The posInLine to set
425      */

426     public void setPosInLine(int posInLine)
427     {
428         this.posInLine = posInLine;
429     }
430
431     public void reset() throws IOException JavaDoc
432     {
433         super.reset();
434         lineCount = 1;
435         posInLine = -1;
436     }
437
438     public StringParser getStringParser()
439     {
440         return stringParser;
441     }
442
443     /**
444      * Adds the given node on the front of an internal list of pre-parsed nodes.
445      * Used in recursive calls where downstream nodes have been recognized in
446      * order to parse the current node.
447      * @param nextParsedNode The node that will be returned next by the reader.
448      */

449     public void addNextParsedNode(Node nextParsedNode)
450     {
451         this.nextParsedNode.prepend(nextParsedNode);
452     }
453
454     public boolean isDontReadNextLine()
455     {
456         return dontReadNextLine;
457     }
458
459     public void setDontReadNextLine(boolean dontReadNextLine)
460     {
461         this.dontReadNextLine = dontReadNextLine;
462     }
463
464 }
465
Popular Tags