KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > web > rss > RssHunter


1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  * GNU Library General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software
14  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15  */

16 package web.rss;
17
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Properties;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.digester.Digester;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
47
48 /**
49  * 用于网站摘要信息获取的类
50  * 使用方法:
51  * Channel channel = RssHunter.parse("http://www.javayou.com/blog/rss1.jsp");
52  * @author Winter Lau
53  */

54 public abstract class RssHunter {
55     
56     static Log log;
57
58     static Hashtable JavaDoc hunters;
59     static{
60         log = LogFactory.getLog(RssHunter.class);
61         hunters = new Hashtable JavaDoc();
62         InputStream JavaDoc in = RssHunter.class.getResourceAsStream("rss.properties");
63         if(in!=null){
64             Properties JavaDoc props = new Properties JavaDoc();
65             try{
66                 props.load(in);
67             }catch(IOException JavaDoc e){
68                 log.error("load res.properties failed.", e);
69             }finally{
70                 if(in!=null){
71                     try{
72                     in.close();
73                     }catch(Exception JavaDoc e){}
74                 }
75             }
76             Enumeration JavaDoc keys = props.keys();
77             while(keys.hasMoreElements()){
78                 String JavaDoc key = (String JavaDoc)keys.nextElement();
79                 try{
80                     RssHunter hunter = (RssHunter)Class.forName(props.getProperty(key)).newInstance();
81                     hunters.put(key.toLowerCase(), hunter);
82                 }catch(Exception JavaDoc e){
83                     log.error("initialize RssHunter failure.",e);
84                 }
85             }
86         }
87     }
88     
89     protected RssHunter(){
90     }
91     
92     /**
93      * 获取指定协议对应的摘要信息加载类
94      * @param protocol
95      * @return
96      */

97     protected static RssHunter getHunter(String JavaDoc protocol){
98         return (RssHunter)hunters.get(protocol.toLowerCase());
99     }
100     
101     /**
102      * 从URL获取信息内容并解析到Channel对象中
103      * @param url
104      * @return
105      * @throws Exception
106      */

107     public static Channel parse(String JavaDoc url) throws Exception JavaDoc{
108         String JavaDoc[] result = getContent(url);
109         String JavaDoc encoding = getEncoding(result[0].substring(0,50));
110         try{
111             RssHunter hunter = getHunter(result[1]);
112             return hunter.parse(new StringReader JavaDoc(result[0].trim()));
113         }catch(Exception JavaDoc e){
114             System.out.println("in parse mode="+result[1]+",url="+url);
115             throw e;
116         }
117     }
118     
119     /**
120      * 解析XML所用的编码方式
121      * @param xml
122      * @return
123      * @throws ParseException
124      */

125     protected static String JavaDoc getEncoding(String JavaDoc xml) throws ParseException JavaDoc{
126         MessageFormat JavaDoc mf = new MessageFormat JavaDoc("{1}encoding=\"{0}\"{2}");
127         try{
128             return (String JavaDoc)(mf.parse(xml)[0]);
129         }catch(Exception JavaDoc e){
130             return "UTF-8";
131         }
132     }
133     
134     /**
135      * 从地址URL中抓取摘要信息以及内容格式
136      * @param url
137      * @return 两个元素的字符串数据,第一个元素为摘要内容,第二个元素为内容格式,例如rss
138      * @throws Exception
139      */

140     protected static String JavaDoc[] getContent(String JavaDoc url) throws Exception JavaDoc
141     {
142         StringBuffer JavaDoc content = new StringBuffer JavaDoc();
143         StringBuffer JavaDoc mode = new StringBuffer JavaDoc();
144         long lastReload = load(url, content, mode);
145         if(needReload(lastReload) || content.length()==0 || (mode!=null && mode.length()==0))
146         {
147             HttpClient client = new HttpClient();
148             GetMethod get = new GetMethod(url);
149             get.addRequestHeader("user-agent","DLOG4J(http://www.javayou.com) RssHunter 1.0");
150             try{
151                 client.executeMethod(get);
152                 String JavaDoc charset = get.getResponseCharSet();
153                 if(get.getStatusCode() == HttpStatus.SC_OK){
154                     String JavaDoc ct = get.getResponseBodyAsString().trim();
155                     String JavaDoc encoding = getEncoding(ct.substring(0,50));
156                     DocumentBuilderFactory JavaDoc dbf = DocumentBuilderFactory.newInstance();
157                     DocumentBuilder JavaDoc db = dbf.newDocumentBuilder();
158                     Document JavaDoc doc = db.parse(new ByteArrayInputStream JavaDoc(ct.getBytes(charset)));
159                     String JavaDoc sMode = null;
160                     Element JavaDoc elem = doc.getDocumentElement();
161                     if("feed".equals(elem.getNodeName()))
162                         sMode = "atom";
163                     else
164                     if("rss".equals(elem.getNodeName()))
165                         sMode = "rss";
166                     else
167                     if("rdf:RDF".equals(elem.getNodeName()))
168                         sMode = "rdf";
169                     else
170                         throw new IllegalArgumentException JavaDoc(url);
171                     
172                     mode = new StringBuffer JavaDoc(sMode);
173                     
174                     ct = new String JavaDoc(ct.getBytes(charset),encoding);
175                     save(url, ct, sMode);
176                     content = new StringBuffer JavaDoc(ct);
177                 }
178             }catch(Exception JavaDoc e){
179                 log.error("fetch content from " + url +" failed.", e);
180             }finally{
181                 get.releaseConnection();
182             }
183         }
184         return new String JavaDoc[]{content.toString().trim(),mode.toString()};
185     }
186     /**
187      * 加载指定站点上次的保存的时间
188      * @param url 输入参数,站点的URL
189      * @param content 输出参数,保存信息的内容
190      * @return
191      * @throws IOException
192      */

193     protected static long load(String JavaDoc url, StringBuffer JavaDoc content, StringBuffer JavaDoc mode) throws IOException JavaDoc{
194         String JavaDoc path = getCachePath();
195         BufferedReader JavaDoc reader = null;
196         long lastModified = 0L;
197         try{
198             File JavaDoc f = new File JavaDoc(path);
199             if(f.exists()){
200                 final String JavaDoc pattern = Math.abs(url.hashCode()) + ".";
201                 File JavaDoc[] fs = f.listFiles(new FilenameFilter JavaDoc(){
202                     public boolean accept(File JavaDoc dir, String JavaDoc name) {
203                         return name.startsWith(pattern);
204                     }});
205                 if(fs.length>0){
206                     mode.append(fs[0].getName().substring(pattern.length()));
207                     lastModified = fs[0].lastModified();
208                     reader = new BufferedReader JavaDoc(new FileReader JavaDoc(fs[0]));
209                     String JavaDoc lineSep = System.getProperty("line.separator");
210                     StringBuffer JavaDoc tmpContent = new StringBuffer JavaDoc();
211                     do{
212                         String JavaDoc line = reader.readLine();
213                         if(line==null)
214                             break;
215                         tmpContent.append(line);
216                         tmpContent.append(lineSep);
217                     }while(true);
218                     content.append(tmpContent.toString().trim());
219                 }
220             }
221         }catch(FileNotFoundException JavaDoc e){
222         }finally{
223             if(reader!=null)
224                 reader.close();
225         }
226         return lastModified;
227     }
228     /**
229      * 保存某个站点的最新信息
230      * @param url 输入参数,站点URL
231      * @param content 输入参数,站点摘要信息
232      * @throws IOException
233      */

234     protected static void save(String JavaDoc url, String JavaDoc content, String JavaDoc mode) throws IOException JavaDoc{
235         StringBuffer JavaDoc path = new StringBuffer JavaDoc(getCachePath());
236         path.append(Math.abs(url.hashCode()));
237         path.append('.');
238         path.append(mode);
239         BufferedWriter JavaDoc writer = null;
240         try{
241             File JavaDoc f = new File JavaDoc(path.toString());
242             if(!f.getParentFile().exists())
243                 f.getParentFile().mkdirs();
244             writer = new BufferedWriter JavaDoc(new FileWriter JavaDoc(f));
245             writer.write(content);
246         }finally{
247             if(writer!=null)
248                 writer.close();
249         }
250     }
251     /**
252      * 获取存放缓冲文件所在的目录,默认为临时目录下的dlog4j_cache子目录
253      * @return
254      */

255     protected static String JavaDoc getCachePath(){
256         String JavaDoc tmpDir = System.getProperty("java.io.tmpdir");
257         if(!tmpDir.endsWith(File.separator))
258             tmpDir += File.separator;
259         tmpDir += "dlog4j_cache";
260         tmpDir += File.separator;
261         return tmpDir;
262     }
263     
264     /**
265      * 默认的重新加载策略:上一次加载二十分钟后允许重新加载
266      * 子类可覆盖该方法进行策略的重新定义
267      * @param lastReload
268      * @return
269      */

270     protected static boolean needReload(long lastReload){
271         long currentTime = System.currentTimeMillis();
272         return (currentTime - lastReload) > 3600000;
273     }
274         
275     /**
276      * 子类负责将摘要内容解析成条目
277      * @param content
278      * @return
279      * @throws Exception
280      */

281     protected abstract Channel parse(Reader JavaDoc content) throws Exception JavaDoc;
282     
283     /**
284      * 获取XML解析器
285      * @param channel
286      * @return
287      */

288     protected Digester getDigester(){
289         Digester digester = new Digester();
290         digester.push(new Channel());
291         digester.setNamespaceAware(true);
292         digester.setValidating(false);
293         return digester;
294     }
295
296     /**
297      * 获取XML解析器
298      * @param channel
299      * @return
300      */

301     protected Digester getDigester(Channel channel){
302         Digester digester = new Digester();
303         digester.push(channel);
304         digester.setNamespaceAware(true);
305         digester.setValidating(false);
306         return digester;
307     }
308     
309     public static void main(String JavaDoc[] args) throws Exception JavaDoc{
310         Channel site = parse(args[0]);
311         System.out.println("site.title:"+site.getTitle());
312         System.out.println("site.link:"+site.getLink());
313         System.out.println("site.description:"+site.getDescription());
314         System.out.println("============ ITEMS ============");
315         for(int i=0;i<site.getItems().size();i++){
316             Item log = (Item)site.getItems().get(i);
317             System.out.println("log.title:"+log.getTitle());
318             System.out.println("log.link:"+log.getLink());
319             System.out.println("log.description:"+log.getDescription());
320             System.out.println("-----------------------------------");
321         }
322     }
323     
324 }
325
Popular Tags