KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > scanners > LinkScanner


1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/LinkScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
2
/*
3  * ====================================================================
4  * Copyright 2002-2004 The Apache Software Foundation.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  */

19
// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.
32

33 package org.htmlparser.scanners;
34
35 //////////////////
36
// Java Imports //
37
//////////////////
38
import java.util.Hashtable;
39
40 import org.htmlparser.tags.LinkTag;
41 import org.htmlparser.tags.Tag;
42 import org.htmlparser.tags.data.CompositeTagData;
43 import org.htmlparser.tags.data.LinkData;
44 import org.htmlparser.tags.data.TagData;
45 import org.htmlparser.util.LinkProcessor;
46 import org.htmlparser.util.ParserException;
47 import org.htmlparser.util.ParserUtils;
48
49 /**
50  * Scans for the Link Tag. This is a subclass of TagScanner, and is called using a
51  * variant of the template method. If the evaluate() method returns true, that means the
52  * given string contains an image tag. Extraction is done by the scan method thereafter
53  * by the user of this class.
54  */

55 public class LinkScanner extends CompositeTagScanner
56 {
57     private static final String JavaDoc MATCH_NAME[] = { "A" };
58     public static final String JavaDoc LINK_SCANNER_ID = "A";
59     public static final String JavaDoc DIRTY_TAG_MESSAGE =
60         " is a dirty link tag - the tag was not closed. \nWe encountered an open tag, before the previous end tag was found.\nCorrecting this..";
61     private LinkProcessor processor;
62     private final static String JavaDoc ENDERS[] =
63         { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
64     private final static String JavaDoc ENDTAG_ENDERS[] =
65         { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
66
67     /**
68      * Overriding the default constructor
69      */

70     public LinkScanner()
71     {
72         this("");
73     }
74
75     /**
76      * Overriding the constructor to accept the filter
77      */

78     public LinkScanner(String JavaDoc filter)
79     {
80         super(filter, MATCH_NAME, ENDERS, ENDTAG_ENDERS, false);
81         processor = new LinkProcessor();
82     }
83
84     public Tag createTag(TagData tagData, CompositeTagData compositeTagData)
85         throws ParserException
86     {
87
88         String JavaDoc link =
89             extractLink(
90                 compositeTagData.getStartTag(),
91                 tagData.getUrlBeingParsed());
92         int mailto = link.indexOf("mailto");
93         boolean mailLink = false;
94         if (mailto == 0)
95         {
96             // yes it is
97
mailto = link.indexOf(":");
98             link = link.substring(mailto + 1);
99             mailLink = true;
100         }
101         int javascript = link.indexOf("javascript:");
102         boolean javascriptLink = false;
103         if (javascript == 0)
104         {
105             link = link.substring(11);
106             // this magic number is "javascript:".length()
107
javascriptLink = true;
108         }
109         String JavaDoc accessKey = getAccessKey(compositeTagData.getStartTag());
110         String JavaDoc myLinkText = compositeTagData.getChildren().toString();
111
112         LinkTag linkTag =
113             new LinkTag(
114                 tagData,
115                 compositeTagData,
116                 new LinkData(
117                     link,
118                     myLinkText,
119                     accessKey,
120                     mailLink,
121                     javascriptLink));
122         linkTag.setThisScanner(this);
123         return linkTag;
124     }
125
126     /**
127      * Template Method, used to decide if this scanner can handle the Link tag type. If
128      * the evaluation returns true, the calling side makes a call to scan().
129      * @param s The complete text contents of the Tag.
130      * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current
131      * scan has begun, and hence allows us to write scanners that can work with dirty html
132      */

133     public boolean evaluate(String JavaDoc s, TagScanner previousOpenScanner)
134     {
135         char ch;
136         boolean ret;
137
138         // eat up leading blanks
139
s = absorbLeadingBlanks(s);
140         if (5 > s.length())
141             ret = false;
142         else
143         {
144             ch = s.charAt(0);
145             if ((ch == 'a' || ch == 'A')
146                 && Character.isWhitespace(s.charAt(1)))
147                 ret = -1 != s.toUpperCase().indexOf("HREF");
148             else
149                 ret = false;
150         }
151
152         return (ret);
153     }
154
155     /**
156      * Extract the link from the given string. The URL of the actual html page is also
157      * provided.
158      */

159     public String JavaDoc extractLink(Tag tag, String JavaDoc url) throws ParserException
160     {
161         try
162         {
163             Hashtable JavaDoc table = tag.getAttributes();
164             String JavaDoc relativeLink = (String JavaDoc) table.get("HREF");
165             if (relativeLink != null)
166             {
167                 relativeLink = ParserUtils.removeChars(relativeLink, '\n');
168                 relativeLink = ParserUtils.removeChars(relativeLink, '\r');
169             }
170             return processor.extract(relativeLink, url);
171         }
172         catch (Exception JavaDoc e)
173         {
174             String JavaDoc msg;
175             if (tag != null)
176                 msg = tag.getText();
177             else
178                 msg = "null";
179             throw new ParserException(
180                 "HTMLLinkScanner.extractLink() : Error while extracting link from tag "
181                     + msg
182                     + ", url = "
183                     + url,
184                 e);
185         }
186     }
187
188     /**
189      * Extract the access key from the given tag.
190      * @param text Text to be parsed to pick out the access key.
191      * @return The value of the ACCESSKEY attribute.
192      */

193     private String JavaDoc getAccessKey(Tag tag)
194     {
195         return tag.getAttribute("ACCESSKEY");
196     }
197
198     public BaseHrefScanner createBaseHREFScanner(String JavaDoc filter)
199     {
200         return new BaseHrefScanner(filter, processor);
201     }
202
203     public ImageScanner createImageScanner(String JavaDoc filter)
204     {
205         return new ImageScanner(filter, processor);
206     }
207
208     /**
209      * @see org.htmlparser.scanners.TagScanner#getID()
210      */

211     public String JavaDoc[] getID()
212     {
213         return MATCH_NAME;
214     }
215
216 }
217
Popular Tags