KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > search > crawler > HTMLHandler


/*
 * Copyright 1999-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/* $Id: HTMLHandler.java 42598 2004-03-01 16:18:28Z gregor $ */

package org.apache.lenya.search.crawler;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

37 /**
38  * DOCUMENT ME!
39  */

40 public final class HTMLHandler extends ParserCallback implements ContentHandler {
41     private static final char space = ' ';
42     private static final char NONE = 0;
43     private static final char TITLE = 1;
44     private static final char HREF = 2;
45     private static final char SCRIPT = 3;
46     private static ParserDelegator JavaDoc pd = new ParserDelegator JavaDoc();
47
48     // Content
49
private String JavaDoc title;
50     private String JavaDoc description;
51     private String JavaDoc keywords;
52     private String JavaDoc categories;
53     private long published;
54     private String JavaDoc href;
55     private String JavaDoc author;
56     private StringBuffer JavaDoc contents;
57     private ArrayList JavaDoc links;
58
59     // Robot Instructions
60
private boolean robotIndex;
61     private boolean robotFollow;
62     private char state;
63     private SimpleDateFormat JavaDoc dateFormatter;
64
65     /**
66      * Constructor - initializes variables
67      */

68     public HTMLHandler() {
69         contents = new StringBuffer JavaDoc();
70
71         links = new ArrayList JavaDoc();
72
73         published = -1;
74
75         // 1996.07.10 15:08:56 PST
76
dateFormatter = new SimpleDateFormat JavaDoc("yyyy.MM.dd HH:mm:ss z");
77     }
78
79     /**
80      * Parse Content. [24] 320:1
81      *
82      * @return DOCUMENT ME!
83      */

84     public String JavaDoc getAuthor() {
85         return author;
86     }
87
88     /**
89      * Return categories (from META tags)
90      *
91      * @return DOCUMENT ME!
92      */

93     public String JavaDoc getCategories() {
94         return this.categories;
95     }
96
97     /**
98      * Return contents
99      *
100      * @return DOCUMENT ME!
101      */

102     public String JavaDoc getContents() {
103         return this.contents.toString();
104     }
105
106     /**
107      * Return description (from META tags)
108      *
109      * @return DOCUMENT ME!
110      */

111     public String JavaDoc getDescription() {
112         return this.description;
113     }
114
115     /**
116      * Return META HREF
117      *
118      * @return DOCUMENT ME!
119      */

120     public String JavaDoc getHREF() {
121         return this.href;
122     }
123
124     /**
125      * Return keywords (from META tags)
126      *
127      * @return DOCUMENT ME!
128      */

129     public String JavaDoc getKeywords() {
130         return this.keywords;
131     }
132
133     /**
134      * Return links
135      *
136      * @return DOCUMENT ME!
137      */

138     public List JavaDoc getLinks() {
139         return links;
140     }
141
142     /**
143      * Return published date (from META tag)
144      *
145      * @return DOCUMENT ME!
146      */

147     public long getPublished() {
148         return this.published;
149     }
150
151     /**
152      * Return boolean true if links are to be followed
153      *
154      * @return DOCUMENT ME!
155      */

156     public boolean getRobotFollow() {
157         return this.robotFollow;
158     }
159
160     /**
161      * Return boolean true if this is to be indexed
162      *
163      * @return DOCUMENT ME!
164      */

165     public boolean getRobotIndex() {
166         return this.robotIndex;
167     }
168
169     /**
170      * Return page title
171      *
172      * @return DOCUMENT ME!
173      */

174     public String JavaDoc getTitle() {
175         return this.title;
176     }
177
178     /**
179      * Handle Anchor <A HREF="~"></A> tags
180      *
181      * @param attribs DOCUMENT ME!
182      */

183     public void handleAnchor(MutableAttributeSet JavaDoc attribs) {
184         String JavaDoc href = new String JavaDoc();
185
186         href = (String JavaDoc) attribs.getAttribute(HTML.Attribute.HREF);
187
188         if (href == null) {
189             return;
190         }
191
192         links.add(href);
193
194         state = HREF;
195     }
196
197     /**
198      * Closing tag
199      *
200      * @param tag DOCUMENT ME!
201      * @param pos DOCUMENT ME!
202      */

203     public void handleEndTag(Tag tag, int pos) {
204         if (state == NONE) {
205             return;
206         }
207
208         // In order of precedence == > && > ||
209
if ((state == TITLE) && tag.equals(HTML.Tag.TITLE)) {
210             state = NONE;
211
212             return;
213         }
214
215         if ((state == HREF) && tag.equals(HTML.Tag.A)) {
216             //links.add(linktext);
217
state = NONE;
218
219             return;
220         }
221
222         if ((state == SCRIPT) && tag.equals(HTML.Tag.SCRIPT)) {
223             state = NONE;
224
225             return;
226         }
227     }
228
229     /**
230      * Handle META tags
231      *
232      * @param attribs DOCUMENT ME!
233      */

234     public void handleMeta(MutableAttributeSet JavaDoc attribs) {
235         String JavaDoc name = new String JavaDoc();
236
237         String JavaDoc content = new String JavaDoc();
238
239         name = (String JavaDoc) attribs.getAttribute(HTML.Attribute.NAME);
240
241         content = (String JavaDoc) attribs.getAttribute(HTML.Attribute.CONTENT);
242
243         if ((name == null) || (content == null)) {
244             return;
245         }
246
247         name = name.toUpperCase();
248
249         if (name.equals("DESCRIPTION")) {
250             description = content;
251
252             return;
253         }
254
255         if (name.equals("KEYWORDS")) {
256             keywords = content;
257
258             return;
259         }
260
261         if (name.equals("CATEGORIES")) {
262             categories = content;
263
264             return;
265         }
266
267         if (name.equals("PUBLISHED")) {
268             try {
269                 published = dateFormatter.parse(content).getTime();
270             } catch (ParseException JavaDoc e) {
271                 e.printStackTrace();
272             }
273
274             return;
275         }
276
277         if (name.equals("HREF")) {
278             href = content;
279
280             return;
281         }
282
283         if (name.equals("AUTHOR")) {
284             author = content;
285
286             return;
287         }
288
289         if (name.equals("ROBOTS")) {
290             if (content.indexOf("noindex") != -1) {
291                 robotIndex = false;
292             }
293
294             if (content.indexOf("nofollow") != -1) {
295                 robotFollow = false;
296             }
297
298             author = content;
299
300             return;
301         }
302     }
303
304     /**
305      * Handle standalone tags
306      *
307      * @param tag DOCUMENT ME!
308      * @param attribs DOCUMENT ME!
309      * @param pos DOCUMENT ME!
310      */

311     public void handleSimpleTag(Tag tag, MutableAttributeSet JavaDoc attribs, int pos) {
312         if (tag.equals(HTML.Tag.META)) {
313             handleMeta(attribs);
314         }
315     }
316
317     /**
318      * Opening tag
319      *
320      * @param tag DOCUMENT ME!
321      * @param attribs DOCUMENT ME!
322      * @param pos DOCUMENT ME!
323      */

324     public void handleStartTag(Tag tag, MutableAttributeSet JavaDoc attribs, int pos) {
325         if (tag.equals(HTML.Tag.TITLE)) {
326             state = TITLE;
327         } else if (tag.equals(HTML.Tag.A)) {
328             handleAnchor(attribs);
329         } else if (tag.equals(HTML.Tag.SCRIPT)) {
330             state = SCRIPT;
331         }
332     }
333
334     /**
335      * Handle page text
336      *
337      * @param text DOCUMENT ME!
338      * @param pos DOCUMENT ME!
339      */

340     public void handleText(char[] text, int pos) {
341         switch (state) {
342         case NONE:
343             contents.append(text);
344             contents.append(space);
345
346             break;
347
348         case TITLE:
349             title = new String JavaDoc(text);
350
351             break;
352
353         case HREF:
354             contents.append(text);
355             contents.append(space);
356
357             //linktext = new String(text);
358
break;
359         }
360     }
361
362     /**
363      * Parse Content.
364      *
365      * @param in DOCUMENT ME!
366      */

367     public void parse(InputStream JavaDoc in) {
368         try {
369             reset();
370
371             pd.parse(new BufferedReader JavaDoc(new InputStreamReader JavaDoc(in)), this, true);
372         } catch (Exception JavaDoc e) {
373             e.printStackTrace();
374         }
375     }
376
377     /**
378      * Return contents
379      */

380     private void reset() {
381         title = null;
382
383         description = null;
384
385         keywords = null;
386
387         categories = null;
388
389         href = null;
390
391         author = null;
392
393         contents.setLength(0);
394
395         links = new ArrayList JavaDoc();
396
397         published = -1;
398
399         // Robot Instructions
400
robotIndex = true;
401
402         robotFollow = true;
403
404         state = NONE;
405     }
406 }
407
Popular Tags