KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > nava > informa > parsers > RSS_2_0_Parser


1 //
2
// Informa -- RSS Library for Java
3
// Copyright (c) 2002 by Niko Schmuck
4
//
5
// Niko Schmuck
6
// http://sourceforge.net/projects/informa
7
// mailto:niko_schmuck@users.sourceforge.net
8
//
9
// This library is free software.
10
//
11
// You may redistribute it and/or modify it under the terms of the GNU
12
// Lesser General Public License as published by the Free Software Foundation.
13
//
14
// Version 2.1 of the license should be included with this distribution in
15
// the file LICENSE. If the license is not included with this distribution,
16
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
17
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
18
// MA 02139 USA.
19
//
20
// This library is distributed in the hope that it will be useful,
21
// but WITHOUT ANY WARRANTY; without even the implied warranty of
22
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
// Lesser General Public License for more details.
24
//
25

26 package de.nava.informa.parsers;
27
28 import java.net.URL JavaDoc;
29 import java.util.*;
30
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.jdom.Attribute;
34 import org.jdom.Element;
35 import org.jdom.Namespace;
36
37 import de.nava.informa.core.*;
38 import de.nava.informa.utils.ParserUtils;
39 import de.nava.informa.impl.basic.ChannelBuilder;
40
41 /**
42  * Parser which reads in document instances according to the RSS 2.0
43  * specification and generates a news channel object.
44  *
45  * @author Anthony Eden, Niko Schmuck
46  */

47 class RSS_2_0_Parser {
48
49   private static Log logger = LogFactory.getLog(RSS_2_0_Parser.class);
50
51     private static CategoryIF getCategoryList(CategoryIF parent, String JavaDoc title, Hashtable children) {
52         // Assuming category hierarchy for each category element
53
// is already mapped out into Hashtable tree; Hense the children Hashtable
54

55         // create channel builder to help create CategoryIF objects
56
ChannelBuilder builder = new ChannelBuilder();
57
58         // create current CategoryIF object; Parent may be null if at top level
59
CategoryIF cat = builder.createCategory(parent, title);
60         // iterate off list of keys from children list
61
Enumeration itChild = children.keys();
62         while (itChild.hasMoreElements()) {
63             String JavaDoc childKey = (String JavaDoc)itChild.nextElement();
64             // don't need to keep track of return CategoryIF since it will be added as child of another instance
65
getCategoryList(cat, childKey, (Hashtable)children.get(childKey));
66         }
67         return cat;
68     }
69
70   static ChannelIF parse(ChannelBuilderIF cBuilder, Element root)
71     throws ParseException {
72     if (cBuilder == null) {
73       throw new RuntimeException JavaDoc(
74         "Without builder no channel can " + "be created.");
75     }
76     Date dateParsed = new Date();
77     logger.debug("start parsing.");
78
79     Namespace defNS = ParserUtils.getDefaultNS(root);
80     if (defNS == null) {
81       defNS = Namespace.NO_NAMESPACE;
82       logger.info("No default namespace found.");
83     }
84     Namespace dcNS = ParserUtils.getNamespace(root, "dc");
85     // fall back to default name space
86
if (dcNS == null) {
87       dcNS = defNS;
88     }
89
90     // Get the channel element (only one occurs)
91
Element channel = root.getChild("channel", defNS);
92     if (channel == null) {
93       logger.warn("Channel element could not be retrieved from feed.");
94       throw new ParseException("No channel element found in feed.");
95     }
96
97     // --- read in channel information
98

99     // 1 title element
100
ChannelIF chnl =
101       cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS));
102
103     // set channel format
104
chnl.setFormat(ChannelFormat.RSS_2_0);
105
106     // 1 description element
107
chnl.setDescription(channel.getChildTextTrim("description", defNS));
108
109     // 1 link element
110
chnl.setSite(ParserUtils.getURL(channel.getChildTextTrim("link", defNS)));
111
112     // 1 language element
113
chnl.setLanguage(channel.getChildTextTrim("language", defNS));
114
115     // 1..n item elements
116
List items = channel.getChildren("item", defNS);
117     Iterator i = items.iterator();
118     while (i.hasNext()) {
119       Element item = (Element) i.next();
120
121       // get title element
122
Element elTitle = item.getChild("title", defNS);
123       String JavaDoc strTitle = "<No Title>";
124       if (elTitle != null) {
125         strTitle = elTitle.getTextTrim();
126       }
127       if (logger.isDebugEnabled()) {
128         logger.debug("Item element found (" + strTitle + ").");
129       }
130
131       // get link element
132
Element elLink = item.getChild("link", defNS);
133       String JavaDoc strLink = "";
134       if (elLink != null) {
135         strLink = elLink.getTextTrim();
136       }
137
138       // get description element
139
Element elDesc = item.getChild("description", defNS);
140       String JavaDoc strDesc = "";
141       if (elDesc != null) {
142         strDesc = elDesc.getTextTrim();
143       }
144
145       // generate new RSS item (link to article)
146
ItemIF rssItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
147                                            ParserUtils.getURL(strLink));
148
149       // get subject element
150
Element elSubject = item.getChild("subject", defNS);
151       if (elSubject == null) {
152         // fallback mechanism: get dc:subject element
153
elSubject = item.getChild("subject", dcNS);
154       }
155       if (elSubject != null) {
156         rssItem.setSubject(elSubject.getTextTrim());
157       }
158
159       // get category list
160
// get list of <category> elements
161
List listCategory = item.getChildren("category", defNS);
162       if (listCategory.size() < 1) {
163         // fallback mechanism: get dc:category element
164
listCategory = item.getChildren("category", dcNS);
165       }
166       if (listCategory.size() > 0) {
167         Hashtable catTable = new Hashtable();
168
169         // for each category, parse hierarchy
170
Iterator itCat = listCategory.iterator();
171         while (itCat.hasNext()) {
172           Hashtable currTable = catTable;
173           Element elCategory = (Element)itCat.next();
174           // get contents of category element
175
String JavaDoc [] titles = elCategory.getTextNormalize().split("/");
176           for (int x=0; x<titles.length; x++) {
177             // tokenize category string to extract out hierarchy
178
if (currTable.containsKey(titles[x]) == false) {
179               // if token does not exist in current map, add it with child Hashtable
180
currTable.put(titles[x], new Hashtable());
181             }
182             // reset current Hashtable to child's Hashtable then iterate to next token
183
currTable = (Hashtable)currTable.get(titles[x]);
184           }
185         }
186         ArrayList catList = new ArrayList();
187         // transform cat list & hierarchy into list of CategoryIF elements
188
Enumeration enumCategories = catTable.keys();
189         while (enumCategories.hasMoreElements()) {
190           String JavaDoc key = (String JavaDoc)enumCategories.nextElement();
191           // build category list: getCategoryList(parent, title, children)
192
CategoryIF cat = getCategoryList(null, key, (Hashtable)catTable.get(key));
193           catList.add(cat);
194         }
195         if (catList.size() > 0) {
196           // if categories were actually created, then add list to item node
197
rssItem.setCategories(catList);
198         }
199       }
200
201       // get publication date
202
Element elDate = item.getChild("pubDate", defNS);
203       if (elDate == null) {
204         // fallback mechanism: get dc:date element
205
elDate = item.getChild("date", dcNS);
206       }
207       if (elDate != null) {
208         rssItem.setDate(ParserUtils.getDate(elDate.getTextTrim()));
209       }
210
211       rssItem.setFound(dateParsed);
212
213       // get Author element
214
Element elAuthor = item.getChild("author", defNS);
215       if (elAuthor == null) {
216         // fallback mechanism: get dc:creator element
217
elAuthor = item.getChild("creator", dcNS);
218       }
219       if (elAuthor != null)
220         rssItem.setCreator(elAuthor.getTextTrim());
221
222       // get Comments element
223
Element elComments = item.getChild("comments", defNS);
224       String JavaDoc strComments = "";
225       if (elComments != null) {
226         strComments = elComments.getTextTrim();
227       }
228       rssItem.setComments(ParserUtils.getURL(strComments));
229
230       // get guid element
231
Element elGuid = item.getChild("guid", defNS);
232       if (elGuid != null) {
233         String JavaDoc guidUrl = elGuid.getTextTrim();
234         if (guidUrl != null) {
235           boolean permaLink = true;
236           Attribute permaLinkAttribute = elGuid.getAttribute("isPermaLink", defNS);
237           if (permaLinkAttribute != null) {
238             String JavaDoc permaLinkStr = permaLinkAttribute.getValue();
239             if (permaLinkStr != null) {
240               permaLink = Boolean.valueOf(permaLinkStr).booleanValue();
241             }
242           }
243           ItemGuidIF itemGuid =
244             cBuilder.createItemGuid(rssItem, guidUrl, permaLink);
245           rssItem.setGuid(itemGuid);
246         }
247       }
248
249       // get source element
250
Element elSource = item.getChild("source", defNS);
251       if (elSource != null) {
252         String JavaDoc sourceName = elSource.getTextTrim();
253         Attribute sourceAttribute = elSource.getAttribute("url", defNS);
254         if (sourceAttribute != null) {
255           String JavaDoc sourceLocation = sourceAttribute.getValue().trim();
256           ItemSourceIF itemSource = cBuilder.createItemSource(rssItem, sourceName,
257                                                               sourceLocation, null);
258           rssItem.setSource(itemSource);
259         }
260       }
261
262       // get enclosure element
263
Element elEnclosure = item.getChild("enclosure", defNS);
264       if (elEnclosure != null) {
265         URL JavaDoc location = null;
266         String JavaDoc type = null;
267         int length = -1;
268         Attribute urlAttribute = elEnclosure.getAttribute("url", defNS);
269         if (urlAttribute != null) {
270           location = ParserUtils.getURL(urlAttribute.getValue().trim());
271         }
272         Attribute typeAttribute = elEnclosure.getAttribute("type", defNS);
273         if (typeAttribute != null) {
274           type = typeAttribute.getValue().trim();
275         }
276         Attribute lengthAttribute = elEnclosure.getAttribute("length", defNS);
277         if (lengthAttribute != null) {
278           try {
279             length = Integer.parseInt(lengthAttribute.getValue().trim());
280           } catch (NumberFormatException JavaDoc e) {
281             logger.warn(e);
282           }
283         }
284         ItemEnclosureIF itemEnclosure =
285           cBuilder.createItemEnclosure(rssItem, location, type, length);
286         rssItem.setEnclosure(itemEnclosure);
287       }
288     }
289
290     // 0..1 image element
291
Element image = channel.getChild("image", defNS);
292     if (image != null) {
293       ImageIF rssImage =
294         cBuilder.createImage(
295           image.getChildTextTrim("title", defNS),
296           ParserUtils.getURL(image.getChildTextTrim("url", defNS)),
297           ParserUtils.getURL(image.getChildTextTrim("link", defNS)));
298       Element imgWidth = image.getChild("width", defNS);
299       if (imgWidth != null) {
300         try {
301           rssImage.setWidth(Integer.parseInt(imgWidth.getTextTrim()));
302         } catch (NumberFormatException JavaDoc e) {
303           logger.warn("Error parsing width: " + e.getMessage());
304         }
305       }
306       Element imgHeight = image.getChild("height", defNS);
307       if (imgHeight != null) {
308         try {
309           rssImage.setHeight(Integer.parseInt(imgHeight.getTextTrim()));
310         } catch (NumberFormatException JavaDoc e) {
311           logger.warn("Error parsing height: " + e.getMessage());
312         }
313       }
314       Element imgDescr = image.getChild("description", defNS);
315       if (imgDescr != null) {
316         rssImage.setDescription(imgDescr.getTextTrim());
317       }
318       chnl.setImage(rssImage);
319     }
320
321     // 0..1 textinput element
322
Element txtinp = channel.getChild("textinput", defNS);
323     if (txtinp != null) {
324       TextInputIF rssTextInput =
325         cBuilder.createTextInput(
326           txtinp.getChildTextTrim("title", defNS),
327           txtinp.getChildTextTrim("description", defNS),
328           txtinp.getChildTextTrim("name", defNS),
329           ParserUtils.getURL(txtinp.getChildTextTrim("link", defNS)));
330       chnl.setTextInput(rssTextInput);
331     }
332
333     // 0..1 copyright element
334
Element copyright = channel.getChild("copyright", defNS);
335     if (copyright != null) {
336       chnl.setCopyright(copyright.getTextTrim());
337     }
338
339     // 0..1 Rating element
340
Element rating = channel.getChild("rating", defNS);
341     if (rating != null) {
342       chnl.setRating(rating.getTextTrim());
343     }
344
345     // 0..1 Docs element
346
Element docs = channel.getChild("docs", defNS);
347     if (docs != null) {
348       chnl.setDocs(docs.getTextTrim());
349     }
350
351     // 0..1 Generator element
352
Element generator = channel.getChild("generator", defNS);
353     if (generator != null) {
354       chnl.setGenerator(generator.getTextTrim());
355     }
356
357     // 0..1 ttl element
358
Element ttl = channel.getChild("ttl", defNS);
359     if (ttl != null) {
360       chnl.setTtl(Integer.parseInt(ttl.getTextTrim()));
361     }
362
363     // 0..1 pubDate element
364
Element pubDate = channel.getChild("pubDate", defNS);
365     if (pubDate != null) {
366       chnl.setPubDate(ParserUtils.getDate(pubDate.getTextTrim()));
367     }
368
369     // 0..1 lastBuildDate element
370
Element lastBuildDate = channel.getChild("lastBuildDate", defNS);
371     if (lastBuildDate != null) {
372       chnl.setLastBuildDate(ParserUtils.getDate(lastBuildDate.getTextTrim()));
373     }
374
375     // get category list
376
// get list of <category> elements
377
List listCategory = channel.getChildren("category", defNS);
378     if (listCategory.size() < 1) {
379       // fallback mechanism: get dc:category element
380
listCategory = channel.getChildren("category", dcNS);
381     }
382     if (listCategory.size() > 0) {
383       Hashtable catTable = new Hashtable();
384            // for each category, parse hierarchy
385
Iterator itCat = listCategory.iterator();
386       while (itCat.hasNext()) {
387         Hashtable currTable = catTable;
388         Element elCategory = (Element)itCat.next();
389         // get contents of category element
390
String JavaDoc [] titles = elCategory.getTextNormalize().split("/");
391         for (int x=0; x<titles.length; x++) {
392           // tokenize category string to extract out hierarchy
393
if (currTable.containsKey(titles[x]) == false) {
394             // if token does not exist in current map, add it with child Hashtable
395
currTable.put(titles[x], new Hashtable());
396           }
397           // reset current Hashtable to child's Hashtable then iterate to next token
398
currTable = (Hashtable)currTable.get(titles[x]);
399         }
400       }
401       ArrayList catList = new ArrayList();
402       // transform cat list & hierarchy into list of CategoryIF elements
403
Enumeration enumCategories = catTable.keys();
404       while (enumCategories.hasMoreElements()) {
405         String JavaDoc key = (String JavaDoc)enumCategories.nextElement();
406         // build category list: getCategoryList(parent, title, children)
407
CategoryIF cat = getCategoryList(null, key, (Hashtable)catTable.get(key));
408         catList.add(cat);
409       }
410       if (catList.size() > 0) {
411         // if categories were actually created, then add list to item node
412
chnl.setCategories(catList);
413       }
414     }
415
416     // 0..1 managingEditor element
417
Element managingEditor = channel.getChild("managingEditor", defNS);
418     if (managingEditor != null) {
419       chnl.setCreator(managingEditor.getTextTrim());
420     }
421
422     // 0..1 webMaster element
423
Element webMaster = channel.getChild("webMaster", defNS);
424     if (webMaster != null) {
425       chnl.setPublisher(webMaster.getTextTrim());
426     }
427
428     // 0..1 cloud element
429
Element cloud = channel.getChild("cloud", defNS);
430     if (cloud != null) {
431       String JavaDoc _port = cloud.getAttributeValue("port", defNS);
432       int port = -1;
433       if (_port != null) {
434         try {
435           port = Integer.parseInt(_port);
436         } catch (NumberFormatException JavaDoc e) {
437           logger.warn(e);
438         }
439       }
440       chnl.setCloud(cBuilder.createCloud(cloud.getAttributeValue("domain", defNS),
441                                          port,
442                                          cloud.getAttributeValue("path", defNS),
443                                          cloud.getAttributeValue("registerProcedure", defNS),
444                                          cloud.getAttributeValue("protocol", defNS)));
445     }
446
447     chnl.setLastUpdated(dateParsed);
448
449     // 0..1 skipHours element
450
// 0..1 skipDays element
451

452     return chnl;
453   }
454
455 }
456
Popular Tags