KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > nava > informa > utils > FormatDetector


1 //
2
// Informa -- RSS Library for Java
3
// Copyright (c) 2002 by Niko Schmuck
4
//
5
// Niko Schmuck
6
// http://sourceforge.net/projects/informa
7
// mailto:niko_schmuck@users.sourceforge.net
8
//
9
// This library is free software.
10
//
11
// You may redistribute it and/or modify it under the terms of the GNU
12
// Lesser General Public License as published by the Free Software Foundation.
13
//
14
// Version 2.1 of the license should be included with this distribution in
15
// the file LICENSE. If the license is not included with this distribution,
16
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
17
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
18
// MA 02139 USA.
19
//
20
// This library is distributed in the hope that it will be useful,
21
// but WITHOUT ANY WARRANTY; without even the implied warranty of
22
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
// Lesser General Public License for more details.
24
//
25

26
27 // $Id: FormatDetector.java,v 1.23 2003/10/05 02:03:26 mharhen Exp $
28

29 package de.nava.informa.utils;
30
31 import java.io.BufferedInputStream JavaDoc;
32 import java.io.InputStream JavaDoc;
33 import java.io.IOException JavaDoc;
34 import java.net.URL JavaDoc;
35
36 import org.apache.commons.logging.Log;
37 import org.apache.commons.logging.LogFactory;
38
39 import de.nava.informa.core.ChannelFormat;
40 import de.nava.informa.core.UnsupportedFormatException;
41
42 /**
43  * Utility class for analysing the news channel syntax and mapping to
44  * known format to ease further processing.
45  *
46  * @author Niko Schmuck (niko@nava.de)
47  */

48 public final class FormatDetector {
49
50   private static Log logger = LogFactory.getLog(FormatDetector.class);
51
52   private static final int NR_FIRST_BYTES = 2048;
53
54
55   /**
56    * Guess the format of the specified news channel. For performance
57    * reason it is wise to minimize the number of format guesses.
58    *
59    * @param url a url to the news channel.
60    * @return The news channel synatx format, currently only RSS 0.91
61    * ({@link de.nava.informa.core.ChannelFormat#RSS_0_91})
62    * and RSS/RDF 1.0
63    * ({@link de.nava.informa.core.ChannelFormat#RSS_1_0})
64    * are recognized.
65    * @throws UnsupportedFormatException in case a news channel format
66    * could not be guessed.
67    * @throws IOException if the given url cannot be read in.
68    */

69   public static ChannelFormat getFormat(URL JavaDoc url)
70     throws IOException JavaDoc, UnsupportedFormatException {
71
72     logger.info("Trying to retrieve stream from " + url);
73     BufferedInputStream JavaDoc in = new BufferedInputStream JavaDoc(url.openStream(),
74                                                      NR_FIRST_BYTES);
75      return getFormat(in);
76    }
77
78    /**
79     * Guess the format of the specified news channel. For performance
80     * reason it is wise to minimize the number of format guesses.
81     *
82     * @param in an InputStream to the news channel.
83     * @return The news channel synatx format, currently only RSS 0.91
84     * ({@link de.nava.informa.core.ChannelFormat#RSS_0_91})
85     * and RSS/RDF 1.0
86     * ({@link de.nava.informa.core.ChannelFormat#RSS_1_0})
87     * are recognized.
88     * @throws UnsupportedFormatException in case a news channel format
89     * could not be guessed.
90     * @throws IOException if the given url cannot be read in.
91     */

92    public static ChannelFormat getFormat(InputStream JavaDoc in)
93        throws IOException JavaDoc, UnsupportedFormatException {
94
95     byte[] b = new byte[NR_FIRST_BYTES];
96
97     int bytesRead = 0;
98     while (bytesRead < NR_FIRST_BYTES) {
99       int bytes = in.read(b, bytesRead, NR_FIRST_BYTES - bytesRead);
100       if (bytes == -1) break;
101       bytesRead += bytes;
102     }
103
104     String JavaDoc rootElement = getRootElement(b);
105     logger.debug("Detected [" + rootElement + "].");
106     if (rootElement.startsWith("rss")) {
107       if (rootElement.indexOf("0.91") > 0) {
108         logger.info("Channel uses RSS root element (Version 0.91).");
109         return ChannelFormat.RSS_0_91;
110       } else if (rootElement.indexOf("0.92") > 0) {
111         logger.info("Channel uses RSS root element (Version 0.92).");
112         // FIXME: should really return ChannelFormat.RSS_0_92
113
// when aware of all subtle differences.
114
return ChannelFormat.RSS_0_92;
115       } else if (rootElement.indexOf("0.93") > 0) {
116         logger.info("Channel uses RSS root element (Version 0.93).");
117         logger.warn("RSS 0.93 not fully supported yet, fall back to 0.92.");
118         // FIXME: should really return ChannelFormat.RSS_0_93
119
// when aware of all subtle differences.
120
return ChannelFormat.RSS_0_92;
121       } else if (rootElement.indexOf("0.94") > 0) {
122         logger.info("Channel uses RSS root element (Version 0.94).");
123         logger.warn("RSS 0.94 not fully supported yet, fall back to 0.92.");
124         // FIXME: should really return ChannelFormat.RSS_0_94
125
// when aware of all subtle differences.
126
return ChannelFormat.RSS_0_92;
127       } else if (rootElement.indexOf("2.0") > 0) {
128         logger.info("Channel uses RSS root element (Version 2.0).");
129         return ChannelFormat.RSS_2_0;
130       } else {
131         throw new UnsupportedFormatException("Unsupported RSS version [" +
132                                              rootElement + "].");
133       }
134     } else if (rootElement.indexOf("rdf") >= 0) {
135       logger.info("Channel uses RDF root element.");
136       return ChannelFormat.RSS_1_0;
137     } else {
138       throw new UnsupportedFormatException("Not able to parse document " +
139                                            "with root element [" +
140                                            rootElement + "].");
141     }
142   }
143
144   /**
145    * Gets the name of the root element and the attributes (inclusive
146    * namespace declarations).
147    */

148   private static final String JavaDoc getRootElement(byte[] b) {
149     String JavaDoc s = new String JavaDoc(b);
150     int startPos = 0;
151     int endPos = 0;
152     boolean inComment = false;
153     for (int i = 0; i < s.length(); i++) {
154       if (s.charAt(i) == '<' && Character.isLetter(s.charAt(i+1))
155           && !inComment) {
156         startPos = i + 1;
157         for (int j = i + 1; j < s.length(); j++) {
158           if (s.charAt(j) == '>') {
159             endPos = j;
160             break;
161           }
162         }
163         break;
164       }
165       else if (!inComment && s.charAt(i) == '<' && s.charAt(i+1) == '!'
166           && s.charAt(i+2) == '-' && s.charAt(i+3) == '-')
167         inComment = true;
168       else if (inComment && s.charAt(i) == '-' && s.charAt(i+1) == '-'
169           && s.charAt(i+2) == '>')
170         inComment = false;
171     } // for i
172
if (startPos >= 0 && endPos >= 0 && endPos > startPos) {
173       return s.substring(startPos, endPos);
174     } else {
175       throw new IllegalArgumentException JavaDoc("Unable to retrieve root " +
176                                          "element from " + s);
177     }
178   }
179
180 }
181
Popular Tags