NodeReader


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/NodeReader.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  
34  package org.htmlparser;
35  
36  //////////////////
37  // Java Imports //
38  //////////////////
39  import java.io.BufferedReader  ;
40  import java.io.IOException  ;
41  import java.io.PrintWriter  ;
42  import java.io.Reader  ;
43  import java.io.StringWriter  ;
44  
45  import org.htmlparser.parserHelper.StringParser;
46  import org.htmlparser.scanners.TagScanner;
47  import org.htmlparser.tags.EndTag;
48  import org.htmlparser.tags.Tag;
49  import org.htmlparser.util.NodeList;
50  import org.htmlparser.util.ParserException;
51  
52  /**
53   * NodeReader builds on the BufferedReader, providing methods to read one element
54   * at a time
55   */
56  public class NodeReader extends BufferedReader  
57  {
58      public static final String   DECIPHER_ERROR =
59          "NodeReader.readElement() : Error occurred while trying to decipher the tag using scanners";
60      protected int posInLine = -1;
61      protected String   line;
62      protected Node node = null;
63      protected TagScanner previousOpenScanner = null;
64      protected String   url;
65      private Parser parser;
66      private int lineCount;
67      private String   previousLine;
68      private StringParser stringParser = new StringParser();
69      private RemarkNodeParser remarkNodeParser = new RemarkNodeParser();
70      private NodeList nextParsedNode = new NodeList();
71      private boolean dontReadNextLine = false;
72      /**
73       * The constructor takes in a reader object, it's length and the url to be read.
74       */
75      public NodeReader(Reader   in, int len, String   url)
76      {
77          super(in, len);
78          this.url = url;
79          this.parser = null;
80          this.lineCount = 1;
81      }
82      /**
83       * This constructor basically overrides the existing constructor in the
84       * BufferedReader class.
85       * The URL defaults to an empty string.
86       * @see #NodeReader(Reader,int,String)
87       */
88  
89      public NodeReader(Reader   in, int len)
90      {
91          this(in, len, "");
92      }
93      /**
94       * The constructor takes in a reader object, and the url to be read.
95       * The buffer size defaults to 8192.
96       * @see #NodeReader(Reader,int,String)
97       */
98      public NodeReader(Reader   in, String   url)
99      {
100         this(in, 8192, url);
101     }
102 
103     /**
104      * Get the url for this reader.
105      * @return The url specified in the constructor;
106      */
107     public String   getURL()
108     {
109         return (url);
110     }
111 
112     /**
113      * This method is intended to be called only by scanners, when a situation of dirty html has arisen, 
114      * and action has been taken to correct the parsed tags. For e.g. if we have html of the form :
115      * <pre>
116      * <a HREF="somelink.html"><img SRC=...><td><tr><a HREF="someotherlink.html">...</a>
117      * </pre>
118      * Now to salvage the first link, we'd probably like to insert an end tag somewhere (typically before the
119      * second begin link tag). So that the parsing continues uninterrupted, we will need to change the existing
120      * line being parsed, to contain the end tag in it. 
121      */
122     public void changeLine(String   line)
123     {
124         this.line = line;
125     }
126     public String   getCurrentLine()
127     {
128         return line;
129     }
130     /**
131      * Get the last line number that the reader has read
132      * @return int last line number read by the reader
133      */
134     public int getLastLineNumber()
135     {
136         return lineCount - 1;
137     }
138 
139     /**
140      * This method is useful when designing your own scanners. You might need to find out what is the location where the
141      * reader has stopped last.
142      * @return int Last position read by the reader
143      */
144     public int getLastReadPosition()
145     {
146         if (node != null)
147             return node.elementEnd();
148         else
149             return 0;
150     }
151 
152     /*
153      * Read the next line
154      * @return String containing the line
155      */
156     public String   getNextLine()
157     {
158         try
159         {
160             previousLine = line;
161             line = readLine();
162             if (line != null)
163                 lineCount++;
164             posInLine = 0;
165             return line;
166         }
167         catch (IOException   e)
168         {
169             System.err.println("I/O Exception occurred while reading!");
170         }
171         return null;
172     }
173     /**
174      * Returns the parser object for which this reader exists
175      * @return org.htmlparser.Parser
176      */
177     public Parser getParser()
178     {
179         return parser;
180     }
181     /**
182      * Gets the previousOpenScanner.
183      * @return Returns a TagScanner
184      */
185     public TagScanner getPreviousOpenScanner()
186     {
187         return previousOpenScanner;
188     }
189 
190     /**
191      * Returns true if the text at <code>pos</code> in <code>line</code> should be scanned as a tag.
192      * Basically an open angle followed by a known special character or a letter.
193      * @param line The current line being parsed.
194      * @param pos The position in the line to examine.
195      * @return <code>true</code> if we think this is the start of a tag.
196      */
197     private boolean beginTag(String   line, int pos)
198     {
199         char ch;
200         boolean ret;
201 
202         ret = false;
203 
204         if (pos + 2 <= line.length())
205             if ('<' == line.charAt(pos))
206             {
207                 ch = line.charAt(pos + 1);
208                 // the order of these tests might be optimized for speed
209                 if ('/' == ch
210                     || '%' == ch
211                     || Character.isLetter(ch)
212                     || '!' == ch)
213                     ret = true;
214             }
215 
216         return (ret);
217     }
218 
219     /**
220      * Read the next element
221      * @return Node - The next node
222          */
223     public Node readElement() throws ParserException
224     {
225         return (readElement(false));
226     }
227 
228     /**
229      * Read the next element
230      * @param balance_quotes If <code>true</code> string nodes are parsed
231      * paying attention to single and double quotes, such that tag-like
232      * strings are ignored if they are quoted.
233      * @return Node - The next node
234          */
235     public Node readElement(boolean balance_quotes) throws ParserException
236     {
237         try
238         {
239             if (nextParsedNode.size() > 0)
240             {
241                 node = nextParsedNode.elementAt(0);
242                 nextParsedNode.remove(0);
243                 return node;
244             }
245             if (readNextLine())
246             {
247                 do
248                 {
249                     line = getNextLine();
250                 }
251                 while (line != null && line.length() == 0);
252 
253             }
254             else if (dontReadNextLine)
255             {
256                 dontReadNextLine = false;
257             }
258             else
259                 posInLine = getLastReadPosition() + 1;
260             if (line == null)
261                 return null;
262 
263             if (beginTag(line, posInLine))
264             {
265                 node = remarkNodeParser.find(this, line, posInLine);
266                 if (node != null)
267                     return node;
268                 node = Tag.find(this, line, posInLine);
269                 if (node != null)
270                 {
271                     Tag tag = (Tag) node;
272                     try
273                     {
274                         node = tag.scan(parser.getScanners(), url, this);
275                         return node;
276                     }
277                     catch (Exception   e)
278                     {
279                         StringBuffer   msgBuffer = new StringBuffer  ();
280                         msgBuffer.append(
281                             DECIPHER_ERROR
282                                 + "\n"
283                                 + "    Tag being processed : "
284                                 + tag.getTagName()
285                                 + "\n"
286                                 + "    Current Tag Line : "
287                                 + tag.getTagLine());
288                         appendLineDetails(msgBuffer);
289                         ParserException ex =
290                             new ParserException(msgBuffer.toString(), e);
291 
292                         parser.getFeedback().error(msgBuffer.toString(), ex);
293                         throw ex;
294                     }
295                 }
296 
297                 node = EndTag.find(line, posInLine);
298                 if (node != null)
299                     return node;
300             }
301             else
302             {
303                 node = stringParser.find(this, line, posInLine, balance_quotes);
304                 if (node != null)
305                     return node;
306             }
307 
308             return null;
309         }
310         catch (ParserException pe)
311         {
312             throw pe;
313         }
314         catch (Exception   e)
315         {
316             StringBuffer   msgBuffer =
317                 new StringBuffer  ("NodeReader.readElement() : Error occurred while trying to read the next element,");
318             StringWriter   sw = new StringWriter  ();
319             e.printStackTrace(new PrintWriter  (sw));
320             appendLineDetails(msgBuffer);
321             msgBuffer.append("\n Caused by:\n").append(
322                 sw.getBuffer().toString());
323             ParserException ex = new ParserException(msgBuffer.toString(), e);
324             parser.getFeedback().error(msgBuffer.toString(), ex);
325             throw ex;
326         }
327     }
328     public void appendLineDetails(StringBuffer   msgBuffer)
329     {
330         msgBuffer.append("\nat Line ");
331         msgBuffer.append(getLineCount());
332         msgBuffer.append(" : ");
333         msgBuffer.append(getLine());
334         msgBuffer.append("\nPrevious Line ").append(getLineCount() - 1);
335         msgBuffer.append(" : ").append(getPreviousLine());
336     }
337     /**
338      * Do we need to read the next line ?
339      * @return true - yes/ false - no
340      */
341     protected boolean readNextLine()
342     {
343         if (dontReadNextLine)
344         {
345             return false;
346         }
347         if (posInLine == -1
348             || (line != null && node.elementEnd() + 1 >= line.length()))
349             return true;
350         else
351             return false;
352     }
353     /**
354      * The setParser method is used by the parser to put its own object into the reader. This happens internally,
355      * so this method is not generally for use by the developer or the user.
356      */
357     public void setParser(Parser newParser)
358     {
359         parser = newParser;
360     }
361     /**
362      * Sets the previousOpenScanner.
363      * @param previousOpenScanner The previousOpenScanner to set
364      */
365     public void setPreviousOpenScanner(TagScanner previousOpenScanner)
366     {
367         this.previousOpenScanner = previousOpenScanner;
368     }
369 
370     /**
371      * @param lineSeparator New Line separator to be used
372      */
373     public static void setLineSeparator(String   lineSeparator)
374     {
375         Node.setLineSeparator(lineSeparator);
376     }
377 
378     /**
379      * Gets the line seperator that is being used
380      * @return String
381      */
382     public static String   getLineSeparator()
383     {
384         return (Node.getLineSeparator());
385     }
386     /**
387      * Returns the lineCount.
388      * @return int
389      */
390     public int getLineCount()
391     {
392         return lineCount;
393     }
394 
395     /**
396      * Returns the previousLine.
397      * @return String
398      */
399     public String   getPreviousLine()
400     {
401         return previousLine;
402     }
403 
404     /**
405      * Returns the line.
406      * @return String
407      */
408     public String   getLine()
409     {
410         return line;
411     }
412 
413     /**
414      * Sets the lineCount.
415      * @param lineCount The lineCount to set
416      */
417     public void setLineCount(int lineCount)
418     {
419         this.lineCount = lineCount;
420     }
421 
422     /**
423      * Sets the posInLine.
424      * @param posInLine The posInLine to set
425      */
426     public void setPosInLine(int posInLine)
427     {
428         this.posInLine = posInLine;
429     }
430 
431     public void reset() throws IOException  
432     {
433         super.reset();
434         lineCount = 1;
435         posInLine = -1;
436     }
437 
438     public StringParser getStringParser()
439     {
440         return stringParser;
441     }
442 
443     /**
444      * Adds the given node on the front of an internal list of pre-parsed nodes.
445      * Used in recursive calls where downstream nodes have been recognized in
446      * order to parse the current node.
447      * @param nextParsedNode The node that will be returned next by the reader.
448      */
449     public void addNextParsedNode(Node nextParsedNode)
450     {
451         this.nextParsedNode.prepend(nextParsedNode);
452     }
453 
454     public boolean isDontReadNextLine()
455     {
456         return dontReadNextLine;
457     }
458 
459     public void setDontReadNextLine(boolean dontReadNextLine)
460     {
461         this.dontReadNextLine = dontReadNextLine;
462     }
463 
464 }
465
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags