KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > fr > jayasoft > ivy > url > ApacheURLLister


/*
 * This file is subject to the license found in LICENCE.TXT in the root directory of the project.
 *
 * #SNAPSHOT#
 */

6 package fr.jayasoft.ivy.url;
7
8 import fr.jayasoft.ivy.util.FileUtil;
9 import fr.jayasoft.ivy.util.Message;
10
11 import java.io.BufferedReader JavaDoc;
12 import java.io.IOException JavaDoc;
13 import java.io.InputStreamReader JavaDoc;
14 import java.net.URL JavaDoc;
15 import java.util.ArrayList JavaDoc;
16 import java.util.List JavaDoc;
17 import java.util.regex.Matcher JavaDoc;
18 import java.util.regex.Pattern JavaDoc;
19
20
21 /**
22  * Utility class which helps to list urls under a given url. This has been
23  * tested with Apache 1.3.33 server listing, as the one used at ibiblio, and
24  * with Apache 2.0.53 server listing, as the one on mirrors.sunsite.dk.
25  *
26  * @author Glen Marchesani
27  * @author Xavier Hanin
28  * @author <a HREF="mailto:johnmshields@yahoo.com">John M. Shields</a>
29  */

30 public class ApacheURLLister {
31     //~ Static variables/initializers ------------------------------------------
32

33     private static final Pattern JavaDoc PATTERN =
34         Pattern.compile("<a[^>]*href=\"([^\"]*)\"[^>]*>(?:<[^>]+>)*?([^<>]+?)(?:<[^>]+>)*?</a>",
35             Pattern.CASE_INSENSITIVE);
36
37     //~ Methods ----------------------------------------------------------------
38

39     /**
40      * Returns a list of sub urls of the given url. The returned list is a list
41      * of URL.
42      *
43      * @param url The base URL from which to retrieve the listing.
44      *
45      * @return a list of sub urls of the given url.
46      *
47      * @throws IOException If an error occures retrieving the HTML.
48      */

49     public List JavaDoc listAll(URL JavaDoc url) throws IOException JavaDoc {
50         return retrieveListing(url, true, true);
51     }
52
53
54     /**
55      * Returns a list of sub 'directories' of the given url. The returned list
56      * is a list of URL.
57      *
58      * @param url The base URL from which to retrieve the listing.
59      *
60      * @return a list of sub 'directories' of the given url.
61      *
62      * @throws IOException If an error occures retrieving the HTML.
63      */

64     public List JavaDoc listDirectories(URL JavaDoc url) throws IOException JavaDoc {
65         return retrieveListing(url, false, true);
66     }
67
68
69     /**
70      * Returns a list of sub 'files' (in opposition to directories) of the
71      * given url. The returned list is a list of URL.
72      *
73      * @param url The base URL from which to retrieve the listing.
74      *
75      * @return a list of sub 'files' of the given url.
76      *
77      * @throws IOException If an error occures retrieving the HTML.
78      */

79     public List JavaDoc listFiles(URL JavaDoc url) throws IOException JavaDoc {
80         return retrieveListing(url, true, false);
81     }
82
83
84     /**
85      * Retrieves a {@link List} of {@link URL}s corresponding to the files
86      * and/or directories found at the supplied base URL.
87      *
88      * @param url The base URL from which to retrieve the listing.
89      * @param includeFiles If true include files in the returned list.
90      * @param includeDirectories If true include directories in the returned
91      * list.
92      *
93      * @return A {@link List} of {@link URL}s.
94      *
95      * @throws IOException If an error occures retrieving the HTML.
96      */

97     public List JavaDoc retrieveListing(URL JavaDoc url, boolean includeFiles,
98         boolean includeDirectories) throws IOException JavaDoc {
99         List JavaDoc urlList = new ArrayList JavaDoc();
100
101         // add trailing slash for relative urls
102
if (!url.getPath().endsWith("/") && !url.getPath().endsWith(".html")) {
103             url = new URL JavaDoc(url.getProtocol(), url.getHost(), url.getPort(),
104                     url.getPath() + "/");
105         }
106
107         BufferedReader JavaDoc r =
108             new BufferedReader JavaDoc(new InputStreamReader JavaDoc(URLHandlerRegistry.getDefault()
109                                                                        .openStream(url)));
110
111         String JavaDoc htmlText = FileUtil.readEntirely(r);
112
113         Matcher JavaDoc matcher = PATTERN.matcher(htmlText);
114
115         while (matcher.find()) {
116             // get the href text and the displayed text
117
String JavaDoc href = matcher.group(1);
118             String JavaDoc text = matcher.group(2);
119             
120             if ((href == null) || (text == null)) {
121                 // the groups were not found (shouldn't happen, really)
122
continue;
123             }
124             
125             text = text.trim();
126
127             // absolute href: convert to relative one
128
if (href.startsWith("/")) {
129                 int slashIndex = href.substring(0, href.length() - 1).lastIndexOf('/');
130                 href= href.substring(slashIndex+1);
131             }
132
133             // exclude those where they do not match
134
// href will never be truncated, text may be truncated by apache
135
// may have a '.' from either the extension (.jar) or "..&gt;"
136
int dotIndex = text.indexOf('.');
137
138             if ( ((dotIndex != -1) && !href.startsWith(text.substring(0, dotIndex)))
139                 || ((dotIndex == -1) && !href.equalsIgnoreCase(text)) ) {
140                 // the href and the text do not "match"
141
continue;
142             }
143
144             boolean directory = href.endsWith("/");
145
146             if ((directory && includeDirectories)
147                 || (!directory && includeFiles)) {
148                 URL JavaDoc child = new URL JavaDoc(url, href);
149                 urlList.add(child);
150                 Message.debug("ApacheURLLister found URL=[" + child + "].");
151             }
152         }
153
154         return urlList;
155     }
156 }
157
Popular Tags