KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > parserHelper > TagParser


// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserHelper/TagParser.java,v 1.2 2004/02/10 13:41:08 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving the Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.

package org.htmlparser.parserHelper;

import java.util.StringTokenizer;

import org.htmlparser.Node;
import org.htmlparser.NodeReader;
import org.htmlparser.tags.Tag;
import org.htmlparser.tags.data.TagData;
import org.htmlparser.util.ParserFeedback;
42
43 public class TagParser
44 {
45     public final static int TAG_BEFORE_PARSING_STATE = 1;
46     public final static int TAG_BEGIN_PARSING_STATE = 1 << 2;
47     public final static int TAG_FINISHED_PARSING_STATE = 1 << 3;
48     public final static int TAG_ILLEGAL_STATE = 1 << 4;
49     public final static int TAG_IGNORE_DATA_STATE = 1 << 5;
50     public final static int TAG_IGNORE_BEGIN_TAG_STATE = 1 << 6;
51     public final static int TAG_IGNORE_CHAR_SINGLE_QUOTE = 1 << 7;
52
53     public final static String JavaDoc ENCOUNTERED_QUERY_MESSAGE =
54         "TagParser : Encountered > after a query. Accepting without correction and continuing parsing";
55
56     private ParserFeedback feedback;
57
58     public TagParser(ParserFeedback feedback)
59     {
60         this.feedback = feedback;
61     }
62
63     public Tag find(NodeReader reader, String JavaDoc input, int position)
64     {
65         int state = TAG_BEFORE_PARSING_STATE;
66         int i = position;
67         char ch;
68         char[] ignorechar = new char[1];
69         // holds the character we're looking for when in TAG_IGNORE_DATA_STATE
70
Tag tag =
71             new Tag(
72                 new TagData(
73                     position,
74                     0,
75                     reader.getLastLineNumber(),
76                     0,
77                     "",
78                     input,
79                     "",
80                     false));
81
82         Bool encounteredQuery = new Bool(false);
83         while (i < tag.getTagLine().length()
84             && state != TAG_FINISHED_PARSING_STATE
85             && state != TAG_ILLEGAL_STATE)
86         {
87             ch = tag.getTagLine().charAt(i);
88             state =
89                 automataInput(
90                     encounteredQuery,
91                     i,
92                     state,
93                     ch,
94                     tag,
95                     i,
96                     ignorechar);
97             i = incrementCounter(i, reader, state, tag);
98         }
99         if (state == TAG_FINISHED_PARSING_STATE)
100         {
101             String JavaDoc tagLine = tag.getTagLine();
102             if (i > 1 && tagLine.charAt(i - 2) == '/')
103             {
104                 tag.setEmptyXmlTag(true);
105                 String JavaDoc tagContents = tag.getText();
106                 tag.setText(tagContents.substring(0, tagContents.length() - 1));
107             }
108             return tag;
109         }
110         else
111             return null;
112     }
113
114     private int automataInput(
115         Bool encounteredQuery,
116         int i,
117         int state,
118         char ch,
119         Tag tag,
120         int pos,
121         char[] ignorechar)
122     {
123         state = checkIllegalState(i, state, ch, tag);
124         state = checkFinishedState(encounteredQuery, i, state, ch, tag, pos);
125         state = toggleIgnoringState(state, ch, ignorechar);
126         if (state == TAG_BEFORE_PARSING_STATE && ch != '<')
127         {
128             state = TAG_ILLEGAL_STATE;
129         }
130         if (state == TAG_IGNORE_DATA_STATE && ch == '<')
131         {
132             // If the next tag char is is close tag, then
133
// this is legal, we should continue
134
if (!isWellFormedTag(tag, pos))
135                 state = TAG_IGNORE_BEGIN_TAG_STATE;
136         }
137         if (state == TAG_IGNORE_BEGIN_TAG_STATE && ch == '>')
138         {
139             state = TAG_IGNORE_DATA_STATE;
140         }
141         checkIfAppendable(encounteredQuery, state, ch, tag);
142         state = checkBeginParsingState(i, state, ch, tag);
143
144         return state;
145     }
146
147     private int checkBeginParsingState(int i, int state, char ch, Tag tag)
148     {
149         if (ch == '<'
150             && (state == TAG_BEFORE_PARSING_STATE || state == TAG_ILLEGAL_STATE))
151         {
152             // Transition from State 0 to State 1 - Record data till > is encountered
153
tag.setTagBegin(i);
154             state = TAG_BEGIN_PARSING_STATE;
155         }
156         return state;
157     }
158
159     private boolean isWellFormedTag(Tag tag, int pos)
160     {
161         String JavaDoc inputLine = tag.getTagLine();
162         int closeTagPos = inputLine.indexOf('>', pos + 1);
163         int openTagPos = inputLine.indexOf('<', pos + 1);
164         return openTagPos > closeTagPos
165             || (openTagPos == -1 && closeTagPos != -1);
166     }
167
168     private int checkFinishedState(
169         Bool encounteredQuery,
170         int i,
171         int state,
172         char ch,
173         Tag tag,
174         int pos)
175     {
176         if (ch == '>')
177         {
178             if (state == TAG_BEGIN_PARSING_STATE)
179             {
180                 state = TAG_FINISHED_PARSING_STATE;
181                 tag.setTagEnd(i);
182             }
183             else if (state == TAG_IGNORE_DATA_STATE)
184             {
185                 if (encounteredQuery.getBoolean())
186                 {
187                     encounteredQuery.setBoolean(false);
188                     feedback.info(ENCOUNTERED_QUERY_MESSAGE);
189                     return state;
190                 }
191                 // Now, either this is a valid > input, and should be ignored,
192
// or it is a mistake in the html, in which case we need to correct it *sigh*
193
if (isWellFormedTag(tag, pos))
194                     return state;
195
196                 state = TAG_FINISHED_PARSING_STATE;
197                 tag.setTagEnd(i);
198                 // Do Correction
199
// Correct the tag - assuming its grouped into name value pairs
200
// Remove all inverted commas.
201
correctTag(tag);
202
203                 StringBuffer JavaDoc msg = new StringBuffer JavaDoc();
204                 msg.append(
205                     "HTMLTagParser : Encountered > inside inverted commas in line \n");
206                 msg.append(tag.getTagLine());
207                 msg.append(", location ");
208                 msg.append(i);
209                 msg.append("\n");
210                 for (int j = 0; j < i; j++)
211                     msg.append(' ');
212                 msg.append('^');
213                 msg.append("\nAutomatically corrected.");
214                 feedback.warning(msg.toString());
215             }
216         }
217         else if (
218             ch == '<'
219                 && state == TAG_BEGIN_PARSING_STATE
220                 && tag.getText().charAt(0) != '%')
221         {
222             state = TAG_FINISHED_PARSING_STATE;
223             tag.setTagEnd(i - 1);
224             i--;
225         }
226         return state;
227     }
228
229     private void checkIfAppendable(
230         Bool encounteredQuery,
231         int state,
232         char ch,
233         Tag tag)
234     {
235         if (state == TAG_IGNORE_DATA_STATE
236             || state == TAG_BEGIN_PARSING_STATE
237             || state == TAG_IGNORE_BEGIN_TAG_STATE)
238         {
239             if (ch == '?')
240                 encounteredQuery.setBoolean(true);
241             tag.append(ch);
242         }
243     }
244
245     private int checkIllegalState(int i, int state, char ch, Tag tag)
246     {
247         if (ch == '/'
248             && i > 0
249             && tag.getTagLine().charAt(i - 1) == '<'
250             && state != TAG_IGNORE_DATA_STATE
251             && state != TAG_IGNORE_BEGIN_TAG_STATE)
252         {
253             state = TAG_ILLEGAL_STATE;
254         }
255
256         return state;
257     }
258
259     public void correctTag(Tag tag)
260     {
261         String JavaDoc tempText = tag.getText();
262         StringBuffer JavaDoc absorbedText = new StringBuffer JavaDoc();
263         char c;
264         for (int j = 0; j < tempText.length(); j++)
265         {
266             c = tempText.charAt(j);
267             if (c != '"')
268                 absorbedText.append(c);
269         }
270         // Go into the next stage.
271
StringBuffer JavaDoc result = insertInvertedCommasCorrectly(absorbedText);
272         tag.setText(result.toString());
273     }
274     public StringBuffer JavaDoc insertInvertedCommasCorrectly(StringBuffer JavaDoc absorbedText)
275     {
276         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
277         StringTokenizer JavaDoc tok =
278             new StringTokenizer JavaDoc(absorbedText.toString(), "=", false);
279         String JavaDoc token;
280         token = (String JavaDoc) tok.nextToken();
281         result.append(token + "=");
282         for (; tok.hasMoreTokens();)
283         {
284             token = (String JavaDoc) tok.nextToken();
285             token = pruneSpaces(token);
286             result.append('"');
287             int lastIndex = token.lastIndexOf(' ');
288             if (lastIndex != -1 && tok.hasMoreTokens())
289             {
290                 result.append(token.substring(0, lastIndex));
291                 result.append('"');
292                 result.append(token.substring(lastIndex, token.length()));
293             }
294             else
295                 result.append(token + '"');
296             if (tok.hasMoreTokens())
297                 result.append("=");
298         }
299         return result;
300     }
301     public static String JavaDoc pruneSpaces(String JavaDoc token)
302     {
303         int firstSpace;
304         int lastSpace;
305         firstSpace = token.indexOf(' ');
306         while (firstSpace == 0)
307         {
308             token = token.substring(1, token.length());
309             firstSpace = token.indexOf(' ');
310         }
311         lastSpace = token.lastIndexOf(' ');
312         while (lastSpace == token.length() - 1)
313         {
314             token = token.substring(0, token.length() - 1);
315             lastSpace = token.lastIndexOf(' ');
316         }
317         return token;
318     }
319
320     /**
321      * Check for quote character (" or ') and switch to TAG_IGNORE_DATA_STATE
322      * or back out to TAG_BEGIN_PARSING_STATE.
323      * @param state The current state.
324      * @param ch The character to test.
325      * @param ignorechar The character that caused entry to TAG_IGNORE_DATA_STATE.
326      */

327     private int toggleIgnoringState(int state, char ch, char[] ignorechar)
328     {
329         if (state == TAG_IGNORE_DATA_STATE)
330         {
331             if (ch == ignorechar[0])
332                 state = TAG_BEGIN_PARSING_STATE;
333         }
334         else if (state == TAG_BEGIN_PARSING_STATE)
335             if (ch == '"' || ch == '\'')
336             {
337                 state = TAG_IGNORE_DATA_STATE;
338                 ignorechar[0] = ch;
339             }
340
341         return (state);
342     }
343
344     public int incrementCounter(int i, NodeReader reader, int state, Tag tag)
345     {
346         String JavaDoc nextLine = null;
347         if ((state == TAG_BEGIN_PARSING_STATE
348             || state == TAG_IGNORE_DATA_STATE
349             || state == TAG_IGNORE_BEGIN_TAG_STATE)
350             && i == tag.getTagLine().length() - 1)
351         {
352             // The while loop below is a bug fix contributed by
353
// Annette Doyle - see testcase HTMLImageScannerTest.testImageTagOnMultipleLines()
354
// Further modified by Somik Raha, to remove bug - HTMLTagTest.testBrokenTag
355
int numLinesAdvanced = 0;
356             do
357             {
358                 nextLine = reader.getNextLine();
359                 numLinesAdvanced++;
360             }
361             while (nextLine != null && nextLine.length() == 0);
362             if (nextLine == null)
363             {
364                 // This means we have a broken tag. Fill in an end tag symbol here.
365
nextLine = ">";
366             }
367             else
368             {
369                 // This means this is just a new line, hence add the new line character
370
tag.append(Node.getLineSeparator());
371             }
372
373             // Ensure blank lines are included in tag's 'tagLines'
374
while (--numLinesAdvanced > 0)
375                 tag.setTagLine("");
376
377             // We need to continue parsing to the next line
378
tag.setTagLine(nextLine);
379             i = -1;
380         }
381         return ++i;
382     }
383     // Class provided for thread safety in TagParser
384
class Bool
385     {
386         private boolean boolValue;
387
388         Bool(boolean boolValue)
389         {
390             this.boolValue = boolValue;
391         }
392
393         public void setBoolean(boolean boolValue)
394         {
395             this.boolValue = boolValue;
396         }
397
398         public boolean getBoolean()
399         {
400             return boolValue;
401         }
402     }
403 }
404
Popular Tags