RssHunter


1   /*
2    *  This program is free software; you can redistribute it and/or modify
3    *  it under the terms of the GNU General Public License as published by
4    *  the Free Software Foundation; either version 2 of the License, or
5    *  (at your option) any later version.
6    *
7    *  This program is distributed in the hope that it will be useful,
8    *  but WITHOUT ANY WARRANTY; without even the implied warranty of
9    *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10   *  GNU Library General Public License for more details.
11   *
12   *  You should have received a copy of the GNU General Public License
13   *  along with this program; if not, write to the Free Software
14   *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15   */
16  package web.rss;
17  
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Properties;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.digester.Digester;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
47  
48  /**
49   * ������վժҪ��Ϣ��ȡ����
50   * ʹ�÷�����
51   * Channel channel = RssHunter.parse("http://www.javayou.com/blog/rss1.jsp");
52   * @author Winter Lau
53   */
54  public abstract class RssHunter {
55      
56      static Log log;
57  
58      static Hashtable   hunters;   
59      static{
60          log = LogFactory.getLog(RssHunter.class);
61          hunters = new Hashtable  ();
62          InputStream   in = RssHunter.class.getResourceAsStream("rss.properties");
63          if(in!=null){
64              Properties   props = new Properties  ();
65              try{
66                  props.load(in);
67              }catch(IOException   e){
68                  log.error("load res.properties failed.", e);
69              }finally{
70                  if(in!=null){
71                      try{
72                      in.close();
73                      }catch(Exception   e){}
74                  }
75              }
76              Enumeration   keys = props.keys();
77              while(keys.hasMoreElements()){
78                  String   key = (String  )keys.nextElement();
79                  try{
80                      RssHunter hunter = (RssHunter)Class.forName(props.getProperty(key)).newInstance();
81                      hunters.put(key.toLowerCase(), hunter);
82                  }catch(Exception   e){
83                      log.error("initialize RssHunter failure.",e);
84                  }
85              }
86          }
87      }
88      
89      protected RssHunter(){      
90      }
91      
92      /**
93       * ��ȡָ��Э���Ӧ��ժҪ��Ϣ������
94       * @param protocol
95       * @return
96       */
97      protected static RssHunter getHunter(String   protocol){
98          return (RssHunter)hunters.get(protocol.toLowerCase());
99      }
100     
101     /**
102      * ��URL��ȡ��Ϣ���ݲ�������Channel������
103      * @param url
104      * @return
105      * @throws Exception
106      */
107     public static Channel parse(String   url) throws Exception  {
108         String  [] result = getContent(url);      
109         String   encoding = getEncoding(result[0].substring(0,50));
110         try{
111             RssHunter hunter = getHunter(result[1]);
112             return hunter.parse(new StringReader  (result[0].trim()));
113         }catch(Exception   e){
114             System.out.println("in parse mode="+result[1]+",url="+url);
115             throw e;
116         }
117     }
118     
119     /**
120      * ����XML���õı��뷽ʽ
121      * @param xml
122      * @return
123      * @throws ParseException
124      */
125     protected static String   getEncoding(String   xml) throws ParseException  {
126         MessageFormat   mf = new MessageFormat  ("{1}encoding=\"{0}\"{2}");
127         try{
128             return (String  )(mf.parse(xml)[0]);
129         }catch(Exception   e){
130             return "UTF-8";
131         }
132     }
133     
134     /**
135      * �ӵ�ַURL��ץȡժҪ��Ϣ�Լ����ݸ�ʽ
136      * @param url
137      * @return ����Ԫ�ص��ַ������ݣ���һ��Ԫ��ΪժҪ���ݣ��ڶ���Ԫ��Ϊ���ݸ�ʽ������rss
138      * @throws Exception
139      */
140     protected static String  [] getContent(String   url) throws Exception  
141     {
142         StringBuffer   content = new StringBuffer  ();
143         StringBuffer   mode = new StringBuffer  ();
144         long lastReload = load(url, content, mode);
145         if(needReload(lastReload) || content.length()==0 || (mode!=null && mode.length()==0))
146         {
147             HttpClient client = new HttpClient();
148             GetMethod get = new GetMethod(url);
149             get.addRequestHeader("user-agent","DLOG4J(http://www.javayou.com) RssHunter 1.0");
150             try{
151                 client.executeMethod(get);
152                 String   charset = get.getResponseCharSet();
153                 if(get.getStatusCode() == HttpStatus.SC_OK){
154                     String   ct = get.getResponseBodyAsString().trim();
155                     String   encoding = getEncoding(ct.substring(0,50));
156                     DocumentBuilderFactory   dbf = DocumentBuilderFactory.newInstance();
157                     DocumentBuilder   db = dbf.newDocumentBuilder();
158                     Document   doc = db.parse(new ByteArrayInputStream  (ct.getBytes(charset)));
159                     String   sMode = null;
160                     Element   elem = doc.getDocumentElement();
161                     if("feed".equals(elem.getNodeName()))
162                         sMode = "atom";
163                     else
164                     if("rss".equals(elem.getNodeName()))
165                         sMode = "rss";
166                     else
167                     if("rdf:RDF".equals(elem.getNodeName()))
168                         sMode = "rdf";
169                     else
170                         throw new IllegalArgumentException  (url);                        
171                     
172                     mode = new StringBuffer  (sMode);
173                     
174                     ct = new String  (ct.getBytes(charset),encoding);
175                     save(url, ct, sMode);
176                     content = new StringBuffer  (ct);
177                 }           
178             }catch(Exception   e){
179                 log.error("fetch content from " + url +" failed.", e);
180             }finally{
181                 get.releaseConnection();
182             }
183         }
184         return new String  []{content.toString().trim(),mode.toString()};
185     }
186     /**
187      * ����ָ��վ���ϴεı����ʱ��
188      * @param url ���������վ���URL
189      * @param content ���������������Ϣ������
190      * @return
191      * @throws IOException
192      */
193     protected static long load(String   url, StringBuffer   content, StringBuffer   mode) throws IOException  {
194         String   path = getCachePath();
195         BufferedReader   reader = null;
196         long lastModified = 0L;
197         try{
198             File   f = new File  (path);
199             if(f.exists()){
200                 final String   pattern = Math.abs(url.hashCode()) + ".";
201                 File  [] fs = f.listFiles(new FilenameFilter  (){
202                     public boolean accept(File   dir, String   name) {
203                         return name.startsWith(pattern);
204                     }});
205                 if(fs.length>0){
206                     mode.append(fs[0].getName().substring(pattern.length()));
207                     lastModified = fs[0].lastModified();
208                     reader = new BufferedReader  (new FileReader  (fs[0]));
209                     String   lineSep = System.getProperty("line.separator");
210                     StringBuffer   tmpContent = new StringBuffer  ();
211                     do{
212                         String   line = reader.readLine();
213                         if(line==null)
214                             break;
215                         tmpContent.append(line);
216                         tmpContent.append(lineSep);
217                     }while(true);           
218                     content.append(tmpContent.toString().trim());
219                 }
220             }           
221         }catch(FileNotFoundException   e){
222         }finally{
223             if(reader!=null)
224                 reader.close();
225         }
226         return lastModified;
227     }
228     /**
229      * ����ĳ��վ���������Ϣ
230      * @param url ���������վ��URL
231      * @param content ���������վ��ժҪ��Ϣ
232      * @throws IOException
233      */
234     protected static void save(String   url, String   content, String   mode) throws IOException  {
235         StringBuffer   path = new StringBuffer  (getCachePath());
236         path.append(Math.abs(url.hashCode()));
237         path.append('.');
238         path.append(mode);
239         BufferedWriter   writer = null;
240         try{
241             File   f = new File  (path.toString());
242             if(!f.getParentFile().exists())
243                 f.getParentFile().mkdirs();
244             writer = new BufferedWriter  (new FileWriter  (f));
245             writer.write(content);
246         }finally{
247             if(writer!=null)
248                 writer.close();
249         }
250     }
251     /**
252      * ��ȡ��Ż����ļ����ڵ�Ŀ¼,Ĭ��Ϊ��ʱĿ¼�µ�dlog4j_cache��Ŀ¼
253      * @return
254      */
255     protected static String   getCachePath(){
256         String   tmpDir = System.getProperty("java.io.tmpdir");
257         if(!tmpDir.endsWith(File.separator))
258             tmpDir += File.separator;
259         tmpDir += "dlog4j_cache";
260         tmpDir += File.separator;
261         return tmpDir;
262     }
263     
264     /**
265      * Ĭ�ϵ����¼��ز��ԣ���һ�μ��ض�ʮ���Ӻ��������¼���
266      * ����ɸ��Ǹ÷������в��Ե����¶���
267      * @param lastReload
268      * @return
269      */
270     protected static boolean needReload(long lastReload){
271         long currentTime = System.currentTimeMillis();
272         return (currentTime - lastReload) > 3600000;
273     }
274         
275     /**
276      * ���ฺ��ժҪ���ݽ�������Ŀ
277      * @param content
278      * @return
279      * @throws Exception
280      */
281     protected abstract Channel parse(Reader   content) throws Exception  ;
282     
283     /**
284      * ��ȡXML������
285      * @param channel
286      * @return
287      */
288     protected Digester getDigester(){
289         Digester digester = new Digester();
290         digester.push(new Channel());
291         digester.setNamespaceAware(true);
292         digester.setValidating(false);        
293         return digester;
294     }
295 
296     /**
297      * ��ȡXML������
298      * @param channel
299      * @return
300      */
301     protected Digester getDigester(Channel channel){
302         Digester digester = new Digester();
303         digester.push(channel);
304         digester.setNamespaceAware(true);
305         digester.setValidating(false);        
306         return digester;
307     }
308     
309     public static void main(String  [] args) throws Exception  {
310         Channel site = parse(args[0]);
311         System.out.println("site.title:"+site.getTitle());
312         System.out.println("site.link:"+site.getLink());
313         System.out.println("site.description:"+site.getDescription());
314         System.out.println("============ ITEMS ============");
315         for(int i=0;i<site.getItems().size();i++){
316             Item log = (Item)site.getItems().get(i);
317             System.out.println("log.title:"+log.getTitle());
318             System.out.println("log.link:"+log.getLink());
319             System.out.println("log.description:"+log.getDescription());
320             System.out.println("-----------------------------------");
321         }
322     }
323     
324 }
325
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags