KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > objectweb > clif > protocol > http > lib > HtmlParser


1 /*
2 * CLIF is a Load Injection Framework
3 * Copyright (C) 2003 France Telecom R&D
4 * Copyright (C) 2003 INRIA
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * CLIF $Name: $
21 *
22 * Contact: clif@objectweb.org
23 *
24 * @authors: Julien Buret
25 * @authors: Nicolas Droze
26 */

27
28 package org.objectweb.clif.protocol.http.lib;
29
30 // Imports from htmlparser v1.2
31
import org.htmlparser.HTMLNode;
32 import org.htmlparser.HTMLParser;
33 import org.htmlparser.HTMLReader;
34 import org.htmlparser.scanners.HTMLLinkScanner;
35 import org.htmlparser.tags.HTMLFormTag;
36 import org.htmlparser.tags.HTMLInputTag;
37 import org.htmlparser.tags.HTMLLinkTag;
38 import org.htmlparser.util.DefaultHTMLParserFeedback;
39 import org.htmlparser.util.HTMLEnumeration;
40
41 import java.io.StringReader;
42 import java.util.Enumeration;
43 import java.util.Hashtable;
44 import java.util.Vector;
45
46 /**
47  * This class provides useful methods for parsing HTML pages and retrieve
48  * specific informations. The basic functionning of the following methods is:<br>
49  * - The method takes in argument the content of an HTML page, and return
50  * an ActionReport with the results.
51  * @author Julien Buret
52  * @author Nicolas Droze
53  */

54 public class HtmlParser {
55
56     private HTMLReader reader;
57     private HTMLParser parser;
58     private DefaultHTMLParserFeedback feedback;
59     private Vector links = new Vector();
60     private Vector formNodes = new Vector();
61     private Vector form_inputs = new Vector();
62     private HTMLEnumeration e;
63     private Enumeration enum;
64     private Enumeration enum2;
65     private HTMLNode node = null;
66     private HTMLLinkTag linkTag = null;
67     private HTMLNode htmlNode = null;
68     private HTMLFormTag formTag = null;
69     private HTMLInputTag inputTag = null;
70     private Hashtable inputParameters = new Hashtable();
71     private String formName = null;
72     private Vector fullResults = new Vector();
73     private HtmlGenericTag genericTag = null;
74     private Hashtable parameters = null;
75     private Vector param = new Vector();
76     private Object[] resultArray;
77
78     private static HtmlParser htmlParser=null;
79
80     private HtmlParser(){
81         feedback=new DefaultHTMLParserFeedback(DefaultHTMLParserFeedback.QUIET);
82     }
83
84     public static HtmlParser getInstance(){
85         if(htmlParser==null){
86             htmlParser=new HtmlParser();
87         }
88         return htmlParser;
89     }
90
91     /**
92      * This method retrieve all HTML links by parsing an HTML page.
93      * @see org.objectweb.clif.httpusersession.HttpClientWrapper#getLinks(java.lang.String)
94      * @param arg The HTML response body to parse
95      * @return An ActionReport containing: <br>
96      * - An array with all HTML links
97      * @throws e An Exception thrown by the Parser
98      */

99     public Object[] getLinks(String arg) throws Exception {
100
101         reader=new HTMLReader(new StringReader(arg), 50);
102         parser=new HTMLParser(reader, feedback);
103         parser.addScanner(new HTMLLinkScanner("-l"));
104
105         links = new Vector();
106
107         for (e = parser.elements(); e.hasMoreNodes();) {
108             node = e.nextHTMLNode();
109             if (node instanceof HTMLLinkTag) {
110                 linkTag = (HTMLLinkTag) node;
111                 links.add(linkTag.getLink());
112             }
113         }
114
115         return links.toArray();
116     }
117
118     /**
119      * This method retrieve all INPUT nodes from FORMS by parsing an HTML page.
120      * @see org.objectweb.clif.httpusersession.HttpClientWrapper#getFields(java.lang.String)
121      * @param arg The HTML response body to parse
122      * @return An ActionReport containing: <br>
123      * - An array with all INPUT from all FORMS
124      * @throws e An Exception thrown by the Parser
125      */

126     public Object[] getFields(String arg) throws Exception {
127
128         reader = new HTMLReader(new StringReader(arg), 50);
129         parser = new HTMLParser(reader, feedback);
130
131         parser.registerScanners();
132
133         formNodes = new Vector();
134
135         // Store all FORM tags and their content
136
for (e = parser.elements(); e.hasMoreNodes();) {
137             htmlNode = e.nextHTMLNode();
138             if (htmlNode instanceof HTMLFormTag) {
139                 htmlNode.collectInto(formNodes, "-f");
140             }
141         }
142
143
144         form_inputs = new Vector();
145         inputParameters = new Hashtable();
146         fullResults = new Vector();
147
148         // For each FORM tag found, store all INPUT fields
149
for (enum = formNodes.elements(); enum.hasMoreElements();) {
150             formTag = (HTMLFormTag) enum.nextElement();
151             form_inputs = formTag.getFormInputs();
152             formName = formTag.getFormName();
153
154             // For each INPUT tag found, store the datas for further response
155
for (enum2 = form_inputs.elements();
156             enum2.hasMoreElements();
157                 ) {
158                 inputTag = (HTMLInputTag) enum2.nextElement();
159                 inputParameters = inputTag.parseParameters();
160                 inputParameters.remove("$<TAGNAME>$");
161
162                 // If the form has no name, we get its location parameter instead
163
if (formName == null)
164                     formName = formTag.getFormLocation();
165
166                 // For this input we store the form name (or location)
167
inputParameters.put("FORM_NAME", formName);
168                 // And we add the input parameters
169
fullResults.add(inputParameters);
170             }
171         }
172
173         // Constructs the response
174
return fullResults.toArray();
175     }
176
177     /**
178      * This generic method retrieve informations about a specific HTML tag
179      * @see org.objectweb.clif.httpusersession.HtmlParser#getTags(java.lang.String[])
180      * @param arg An array containing: <br>
181      * + The HTML response body to parse <br>
182      * + The HTML tag to retrieve
183      * @return An ActionReport containing: - An array with all the parameters for each tag retrieved
184      * @throws e An Exception thrown by the Parser
185      */

186     public Object[] getTags(String[] arg) throws Exception {
187
188         reader = new HTMLReader(new StringReader(arg[0]), 50);
189         parser = new HTMLParser(reader, feedback);
190         parser.addScanner(new HtmlGenericScanner("", arg[1]));
191
192         param = new Vector();
193
194         for (e = parser.elements(); e.hasMoreNodes();) {
195             node = e.nextHTMLNode();
196             if (node instanceof HtmlGenericTag) {
197                 genericTag = (HtmlGenericTag) node;
198                 parameters = genericTag.parseParameters();
199                 // Remove the tag name from the hashtable
200
parameters.remove("$<TAGNAME>$");
201                 param.add(parameters);
202             }
203
204         }
205
206         // Constructs the response
207
resultArray = new Object[2];
208
209         // Store the parameters of the tag
210
resultArray[0] = param.toArray();
211
212         // Store the content of the tag
213
resultArray[1] = null; // Not implemented
214

215         return resultArray;
216     }
217 }
218
Popular Tags