KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > roller > util > LinkbackExtractor


1 package org.roller.util;
2
3 import java.io.BufferedReader JavaDoc;
4 import java.io.IOException JavaDoc;
5 import java.io.InputStream JavaDoc;
6 import java.io.InputStreamReader JavaDoc;
7 import java.io.StringReader JavaDoc;
8 import java.net.MalformedURLException JavaDoc;
9 import java.net.URL JavaDoc;
10 import java.util.Arrays JavaDoc;
11 import java.util.Iterator JavaDoc;
12 import java.util.List JavaDoc;
13
14 import javax.swing.text.MutableAttributeSet JavaDoc;
15 import javax.swing.text.html.HTML JavaDoc;
16 import javax.swing.text.html.HTMLEditorKit JavaDoc;
17 import javax.swing.text.html.HTML.Tag;
18 import javax.swing.text.html.HTMLEditorKit.Parser;
19 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
20
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23
24 import com.sun.syndication.feed.synd.SyndEntry;
25 import com.sun.syndication.feed.synd.SyndFeed;
26 import com.sun.syndication.io.FeedException;
27 import com.sun.syndication.io.SyndFeedInput;
28
29 /**
30  * Parses HTML file for referring linkback title and excerpt.
31  *
32  * @author David M Johnson
33  */

34 public class LinkbackExtractor
35 {
36     private static Log mLogger = LogFactory.getFactory().getInstance(
37                                               LinkbackExtractor.class);
38     private boolean mFound = false;
39     private String JavaDoc mTitle = "";
40     private String JavaDoc mRssLink = null;
41     private String JavaDoc mExcerpt = null;
42     private String JavaDoc mPermalink = null;
43     private int mStart = 0;
44     private int mEnd = 0;
45     private int mMaxExcerpt = 500; // characters
46
private String JavaDoc mRequestURL = null;
47     private String JavaDoc mRequestURLWWW = null;
48     private String JavaDoc mRefererURL;
49
50     //------------------------------------------------------------------------
51
/**
52      * Extract referring page title, excerpt, and permalink.
53      *
54      * @param refererUrl
55      * @param requestUrl
56      */

57     public LinkbackExtractor(String JavaDoc refererURL, String JavaDoc requestURL)
58             throws MalformedURLException JavaDoc, IOException JavaDoc
59     {
60         try
61         {
62             extractByParsingHtml(refererURL, requestURL);
63             if (mRssLink != null)
64             {
65                 extractByParsingRss(mRssLink, requestURL);
66             }
67         }
68         catch (Exception JavaDoc e)
69         {
70             if (mLogger.isDebugEnabled())
71             {
72                 mLogger.debug("Extracting linkback", e);
73             }
74         }
75     }
76
77     //------------------------------------------------------------------------
78
private void extractByParsingHtml(String JavaDoc refererURL, String JavaDoc requestURL)
79             throws MalformedURLException JavaDoc, IOException JavaDoc
80     {
81         URL JavaDoc url = new URL JavaDoc(refererURL);
82         InputStream JavaDoc is = url.openStream();
83
84         mRefererURL = refererURL;
85
86         if (requestURL.startsWith("http://www."))
87         {
88             mRequestURLWWW = requestURL;
89             mRequestURL = "http://" + mRequestURLWWW.substring(11);
90         }
91         else
92         {
93             mRequestURL = requestURL;
94             mRequestURLWWW = "http://www." + mRequestURL.substring(7);
95         }
96
97         // Trick gets Swing's HTML parser
98
Parser parser = (new HTMLEditorKit JavaDoc() {
99             public Parser getParser()
100             {
101                 return super.getParser();
102             }
103         }).getParser();
104
105         // Read HTML file into string
106
StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
107         InputStreamReader JavaDoc isr = new InputStreamReader JavaDoc(is);
108         BufferedReader JavaDoc br = new BufferedReader JavaDoc(isr);
109         try
110         {
111             String JavaDoc line = null;
112             while ((line = br.readLine()) != null)
113             {
114                 sb.append(line);
115             }
116         }
117         finally
118         {
119             br.close();
120         }
121
122         // Parse HTML string to find title and start and end position
123
// of the referring excerpt.
124
StringReader JavaDoc sr = new StringReader JavaDoc(sb.toString());
125         parser.parse(sr, new LinkbackCallback(), true);
126
127         if (mStart != 0 && mEnd != 0 && mEnd > mStart)
128         {
129             mExcerpt = sb.toString().substring(mStart, mEnd);
130             mExcerpt = Utilities.removeHTML(mExcerpt);
131
132             if (mExcerpt.length() > mMaxExcerpt)
133             {
134                 mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
135             }
136         }
137
138         if (mTitle.startsWith(">") && mTitle.length() > 1)
139         {
140             mTitle = mTitle.substring(1);
141         }
142     }
143
144     //------------------------------------------------------------------------
145
private void extractByParsingRss(String JavaDoc rssLink, String JavaDoc requestURL)
146             throws IllegalArgumentException JavaDoc, MalformedURLException JavaDoc, FeedException, IOException JavaDoc
147     {
148         SyndFeedInput feedInput = new SyndFeedInput();
149         SyndFeed feed = feedInput.build(
150             new InputStreamReader JavaDoc(new URL JavaDoc(rssLink).openStream()));
151         Iterator JavaDoc itemIter = feed.getEntries().iterator();
152         String JavaDoc feedTitle = feed.getTitle();
153
154         int count = 0;
155
156         if (mLogger.isDebugEnabled())
157         {
158             mLogger.debug("Feed parsed, title: " + feedTitle);
159         }
160
161         while (itemIter.hasNext())
162         {
163             count++;
164             SyndEntry item = (SyndEntry) itemIter.next();
165             if (item.getDescription().getValue().indexOf(requestURL) != -1)
166             {
167                 mFound = true;
168                 mPermalink = item.getLink().toString();
169                 if (feedTitle != null && feedTitle.trim().length() > 0)
170                 {
171                     mTitle = feedTitle + ": " + item.getTitle();
172                 }
173                 else
174                 {
175                     mTitle = item.getTitle();
176                 }
177                 mExcerpt = item.getDescription().getValue();
178                 mExcerpt = Utilities.removeHTML(mExcerpt);
179                 if (mExcerpt.length() > mMaxExcerpt)
180                 {
181                     mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
182                 }
183                 break;
184             }
185         }
186
187         if (mLogger.isDebugEnabled())
188         {
189             mLogger.debug("Parsed " + count + " articles, found linkback="
190                     + mFound);
191         }
192     }
193
194     //------------------------------------------------------------------------
195
/**
196      * Returns the excerpt.
197      *
198      * @return String
199      */

200     public String JavaDoc getExcerpt()
201     {
202         return mExcerpt;
203     }
204
205     //------------------------------------------------------------------------
206
/**
207      * Returns the title.
208      *
209      * @return String
210      */

211     public String JavaDoc getTitle()
212     {
213         return mTitle;
214     }
215
216     //------------------------------------------------------------------------
217
/**
218      * Returns the permalink.
219      *
220      * @return String
221      */

222     public String JavaDoc getPermalink()
223     {
224         return mPermalink;
225     }
226
227     //------------------------------------------------------------------------
228
/**
229      * Sets the permalink.
230      *
231      * @param permalink
232      * The permalink to set
233      */

234     public void setPermalink(String JavaDoc permalink)
235     {
236         mPermalink = permalink;
237     }
238
239     /////////////////////////////////////////////////////////////////////////
240

241     /**
242      * Parser callback that finds title and excerpt. As we walk through the HTML
243      * tags, we keep track of the most recently encountered divider tag in the
244      * mStart field. Once we find the referring permalink, we set the mFound
245      * flag. After that, we look for the next divider tag and save it's position
246      * in the mEnd field.
247      */

248     private final class LinkbackCallback extends ParserCallback
249     {
250         // Dividers
251
private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
252                                           Tag.BLOCKQUOTE, Tag.P, Tag.LI,
253                                           Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
254                                           Tag.H2, Tag.H3, Tag.H4, Tag.H5,
255                                           Tag.H6 };
256
257         private List JavaDoc mList = Arrays.asList(mDivTags);
258
259         private Tag mCurrentTag = null;
260
261         /**
262          * Look for divider tags and for the permalink.
263          *
264          * @param tag
265          * HTML tag
266          * @param atts
267          * Attributes of that tag
268          * @param pos
269          * Tag's position in file
270          */

271         public void handleStartTag(Tag tag, MutableAttributeSet JavaDoc atts, int pos)
272         {
273             if (mList.contains(tag) && !mFound)
274             {
275                 mStart = pos;
276             }
277             else if (mList.contains(tag) && mFound && mEnd == 0)
278             {
279                 mEnd = pos;
280             }
281             else if (tag.equals(Tag.A))
282             {
283                 String JavaDoc href = (String JavaDoc) atts.getAttribute(HTML.Attribute.HREF);
284                 if (href == null)
285                     return;
286                 int hashPos = href.lastIndexOf('#');
287                 if (hashPos != -1)
288                 {
289                     href = href.substring(0, hashPos);
290                 }
291                 if (href != null
292                         && (href.equals(mRequestURL) || href
293                                 .equals(mRequestURLWWW)))
294                 {
295                     mFound = true;
296                 }
297                 else
298                 {
299                     /*
300                      * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
301                      * "+href); }
302                      */

303                 }
304             }
305             mCurrentTag = tag;
306         }
307
308         /**
309          * Needed to handle SPAN tag.
310          */

311         public void handleSimpleTag(Tag tag, MutableAttributeSet JavaDoc atts, int pos)
312         {
313             if (mList.contains(tag) && mFound && mEnd == 0)
314             {
315                 mEnd = pos;
316             }
317             else if (tag.equals(Tag.LINK))
318             {
319                 // Look out for RSS autodiscovery link
320
String JavaDoc title = (String JavaDoc) atts.getAttribute(HTML.Attribute.TITLE);
321                 String JavaDoc type = (String JavaDoc) atts.getAttribute(HTML.Attribute.TYPE);
322                 if (title != null && type != null
323                         && type.equals("application/rss+xml")
324                         && title.equals("RSS"))
325                 {
326                     mRssLink = (String JavaDoc) atts.getAttribute(HTML.Attribute.HREF);
327
328                     if (mLogger.isDebugEnabled())
329                     {
330                         mLogger.debug("Found RSS link " + mRssLink);
331                     }
332
333                     if (mRssLink.startsWith("/") && mRssLink.length() > 1)
334                     {
335                         try
336                         {
337                             URL JavaDoc url = new URL JavaDoc(mRefererURL);
338                             mRssLink = url.getProtocol() + "://"
339                                     + url.getHost() + ":" + url.getPort()
340                                     + mRssLink;
341                         }
342                         catch (MalformedURLException JavaDoc e)
343                         {
344                             mRssLink = null;
345                             if (mLogger.isDebugEnabled())
346                             {
347                                 mLogger.debug("Determining RSS URL", e);
348                             }
349                         }
350                     }
351                     else if (!mRssLink.startsWith("http"))
352                     {
353                         int slash = mRefererURL.lastIndexOf("/");
354                         if (slash != -1)
355                         {
356                             mRssLink = mRefererURL.substring(0, slash) + "/"
357                                     + mRssLink;
358                         }
359                     }
360                     if (mLogger.isDebugEnabled())
361                     {
362                         mLogger.debug("Qualified RSS link is " + mRssLink);
363                     }
364                 }
365             }
366         }
367
368         /**
369          * Stop at the very first divider tag after the permalink.
370          *
371          * @param tag
372          * End tag
373          * @param pos
374          * Position in HTML file
375          */

376         public void handleEndTag(Tag tag, int pos)
377         {
378             if (mList.contains(tag) && mFound && mEnd == 0)
379             {
380                 mEnd = pos;
381             }
382             else if (mList.contains(tag) && !mFound)
383             {
384                 mStart = pos;
385             }
386             else
387             {
388                 mCurrentTag = null;
389             }
390         }
391
392         /**
393          * Get the page title
394          */

395         public void handleText(char[] data, int pos)
396         {
397             if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
398             {
399                 String JavaDoc newText = new String JavaDoc(data);
400                 if (mTitle.length() < 50)
401                 {
402                     mTitle += newText;
403                 }
404             }
405         }
406     }
407 }
408
409
Popular Tags