KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > roller > util > LinkbackExtractor


1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. The ASF licenses this file to You
4 * under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License. For additional information regarding
15 * copyright in this work, please see the NOTICE file in the top level
16 * directory of this distribution.
17 */

18 package org.apache.roller.util;
19
20 import java.io.BufferedReader JavaDoc;
21 import java.io.IOException JavaDoc;
22 import java.io.InputStream JavaDoc;
23 import java.io.InputStreamReader JavaDoc;
24 import java.io.StringReader JavaDoc;
25 import java.net.MalformedURLException JavaDoc;
26 import java.net.URL JavaDoc;
27 import java.util.Arrays JavaDoc;
28 import java.util.Iterator JavaDoc;
29 import java.util.List JavaDoc;
30
31 import javax.swing.text.MutableAttributeSet JavaDoc;
32 import javax.swing.text.html.HTML JavaDoc;
33 import javax.swing.text.html.HTMLEditorKit JavaDoc;
34 import javax.swing.text.html.HTML.Tag;
35 import javax.swing.text.html.HTMLEditorKit.Parser;
36 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
37
38 import org.apache.commons.logging.Log;
39 import org.apache.commons.logging.LogFactory;
40
41 import com.sun.syndication.feed.synd.SyndEntry;
42 import com.sun.syndication.feed.synd.SyndFeed;
43 import com.sun.syndication.io.FeedException;
44 import com.sun.syndication.io.SyndFeedInput;
45
46 /**
47  * Parses HTML file for referring linkback title and excerpt.
48  *
49  * @author David M Johnson
50  */

51 public class LinkbackExtractor
52 {
53     private static Log mLogger = LogFactory.getFactory().getInstance(
54                                               LinkbackExtractor.class);
55     private boolean mFound = false;
56     private String JavaDoc mTitle = "";
57     private String JavaDoc mRssLink = null;
58     private String JavaDoc mExcerpt = null;
59     private String JavaDoc mPermalink = null;
60     private int mStart = 0;
61     private int mEnd = 0;
62     private int mMaxExcerpt = 500; // characters
63
private String JavaDoc mRequestURL = null;
64     private String JavaDoc mRequestURLWWW = null;
65     private String JavaDoc mRefererURL;
66
67     //------------------------------------------------------------------------
68
/**
69      * Extract referring page title, excerpt, and permalink.
70      *
71      * @param refererUrl
72      * @param requestUrl
73      */

74     public LinkbackExtractor(String JavaDoc refererURL, String JavaDoc requestURL)
75             throws MalformedURLException JavaDoc, IOException JavaDoc
76     {
77         try
78         {
79             extractByParsingHtml(refererURL, requestURL);
80             if (mRssLink != null)
81             {
82                 extractByParsingRss(mRssLink, requestURL);
83             }
84         }
85         catch (Exception JavaDoc e)
86         {
87             if (mLogger.isDebugEnabled())
88             {
89                 mLogger.debug("Extracting linkback", e);
90             }
91         }
92     }
93
94     //------------------------------------------------------------------------
95
private void extractByParsingHtml(String JavaDoc refererURL, String JavaDoc requestURL)
96             throws MalformedURLException JavaDoc, IOException JavaDoc
97     {
98         URL JavaDoc url = new URL JavaDoc(refererURL);
99         InputStream JavaDoc is = url.openStream();
100
101         mRefererURL = refererURL;
102
103         if (requestURL.startsWith("http://www."))
104         {
105             mRequestURLWWW = requestURL;
106             mRequestURL = "http://" + mRequestURLWWW.substring(11);
107         }
108         else
109         {
110             mRequestURL = requestURL;
111             mRequestURLWWW = "http://www." + mRequestURL.substring(7);
112         }
113
114         // Trick gets Swing's HTML parser
115
Parser parser = (new HTMLEditorKit JavaDoc() {
116             public Parser getParser()
117             {
118                 return super.getParser();
119             }
120         }).getParser();
121
122         // Read HTML file into string
123
StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
124         InputStreamReader JavaDoc isr = new InputStreamReader JavaDoc(is);
125         BufferedReader JavaDoc br = new BufferedReader JavaDoc(isr);
126         try
127         {
128             String JavaDoc line = null;
129             while ((line = br.readLine()) != null)
130             {
131                 sb.append(line);
132             }
133         }
134         finally
135         {
136             br.close();
137         }
138
139         // Parse HTML string to find title and start and end position
140
// of the referring excerpt.
141
StringReader JavaDoc sr = new StringReader JavaDoc(sb.toString());
142         parser.parse(sr, new LinkbackCallback(), true);
143
144         if (mStart != 0 && mEnd != 0 && mEnd > mStart)
145         {
146             mExcerpt = sb.toString().substring(mStart, mEnd);
147             mExcerpt = Utilities.removeHTML(mExcerpt);
148
149             if (mExcerpt.length() > mMaxExcerpt)
150             {
151                 mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
152             }
153         }
154
155         if (mTitle.startsWith(">") && mTitle.length() > 1)
156         {
157             mTitle = mTitle.substring(1);
158         }
159     }
160
161     //------------------------------------------------------------------------
162
private void extractByParsingRss(String JavaDoc rssLink, String JavaDoc requestURL)
163             throws IllegalArgumentException JavaDoc, MalformedURLException JavaDoc, FeedException, IOException JavaDoc
164     {
165         SyndFeedInput feedInput = new SyndFeedInput();
166         SyndFeed feed = feedInput.build(
167             new InputStreamReader JavaDoc(new URL JavaDoc(rssLink).openStream()));
168         Iterator JavaDoc itemIter = feed.getEntries().iterator();
169         String JavaDoc feedTitle = feed.getTitle();
170
171         int count = 0;
172
173         if (mLogger.isDebugEnabled())
174         {
175             mLogger.debug("Feed parsed, title: " + feedTitle);
176         }
177
178         while (itemIter.hasNext())
179         {
180             count++;
181             SyndEntry item = (SyndEntry) itemIter.next();
182             if (item.getDescription().getValue().indexOf(requestURL) != -1)
183             {
184                 mFound = true;
185                 mPermalink = item.getLink().toString();
186                 if (feedTitle != null && feedTitle.trim().length() > 0)
187                 {
188                     mTitle = feedTitle + ": " + item.getTitle();
189                 }
190                 else
191                 {
192                     mTitle = item.getTitle();
193                 }
194                 mExcerpt = item.getDescription().getValue();
195                 mExcerpt = Utilities.removeHTML(mExcerpt);
196                 if (mExcerpt.length() > mMaxExcerpt)
197                 {
198                     mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
199                 }
200                 break;
201             }
202         }
203
204         if (mLogger.isDebugEnabled())
205         {
206             mLogger.debug("Parsed " + count + " articles, found linkback="
207                     + mFound);
208         }
209     }
210
211     //------------------------------------------------------------------------
212
/**
213      * Returns the excerpt.
214      *
215      * @return String
216      */

217     public String JavaDoc getExcerpt()
218     {
219         return mExcerpt;
220     }
221
222     //------------------------------------------------------------------------
223
/**
224      * Returns the title.
225      *
226      * @return String
227      */

228     public String JavaDoc getTitle()
229     {
230         return mTitle;
231     }
232
233     //------------------------------------------------------------------------
234
/**
235      * Returns the permalink.
236      *
237      * @return String
238      */

239     public String JavaDoc getPermalink()
240     {
241         return mPermalink;
242     }
243
244     //------------------------------------------------------------------------
245
/**
246      * Sets the permalink.
247      *
248      * @param permalink
249      * The permalink to set
250      */

251     public void setPermalink(String JavaDoc permalink)
252     {
253         mPermalink = permalink;
254     }
255
256     /////////////////////////////////////////////////////////////////////////
257

258     /**
259      * Parser callback that finds title and excerpt. As we walk through the HTML
260      * tags, we keep track of the most recently encountered divider tag in the
261      * mStart field. Once we find the referring permalink, we set the mFound
262      * flag. After that, we look for the next divider tag and save it's position
263      * in the mEnd field.
264      */

265     private final class LinkbackCallback extends ParserCallback
266     {
267         // Dividers
268
private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
269                                           Tag.BLOCKQUOTE, Tag.P, Tag.LI,
270                                           Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
271                                           Tag.H2, Tag.H3, Tag.H4, Tag.H5,
272                                           Tag.H6 };
273
274         private List JavaDoc mList = Arrays.asList(mDivTags);
275
276         private Tag mCurrentTag = null;
277
278         /**
279          * Look for divider tags and for the permalink.
280          *
281          * @param tag
282          * HTML tag
283          * @param atts
284          * Attributes of that tag
285          * @param pos
286          * Tag's position in file
287          */

288         public void handleStartTag(Tag tag, MutableAttributeSet JavaDoc atts, int pos)
289         {
290             if (mList.contains(tag) && !mFound)
291             {
292                 mStart = pos;
293             }
294             else if (mList.contains(tag) && mFound && mEnd == 0)
295             {
296                 mEnd = pos;
297             }
298             else if (tag.equals(Tag.A))
299             {
300                 String JavaDoc href = (String JavaDoc) atts.getAttribute(HTML.Attribute.HREF);
301                 if (href == null)
302                     return;
303                 int hashPos = href.lastIndexOf('#');
304                 if (hashPos != -1)
305                 {
306                     href = href.substring(0, hashPos);
307                 }
308                 if (href != null
309                         && (href.equals(mRequestURL) || href
310                                 .equals(mRequestURLWWW)))
311                 {
312                     mFound = true;
313                 }
314                 else
315                 {
316                     /*
317                      * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
318                      * "+href); }
319                      */

320                 }
321             }
322             mCurrentTag = tag;
323         }
324
325         /**
326          * Needed to handle SPAN tag.
327          */

328         public void handleSimpleTag(Tag tag, MutableAttributeSet JavaDoc atts, int pos)
329         {
330             if (mList.contains(tag) && mFound && mEnd == 0)
331             {
332                 mEnd = pos;
333             }
334             else if (tag.equals(Tag.LINK))
335             {
336                 // Look out for RSS autodiscovery link
337
String JavaDoc title = (String JavaDoc) atts.getAttribute(HTML.Attribute.TITLE);
338                 String JavaDoc type = (String JavaDoc) atts.getAttribute(HTML.Attribute.TYPE);
339                 if (title != null && type != null
340                         && type.equals("application/rss+xml")
341                         && title.equals("RSS"))
342                 {
343                     mRssLink = (String JavaDoc) atts.getAttribute(HTML.Attribute.HREF);
344
345                     if (mLogger.isDebugEnabled())
346                     {
347                         mLogger.debug("Found RSS link " + mRssLink);
348                     }
349
350                     if (mRssLink.startsWith("/") && mRssLink.length() > 1)
351                     {
352                         try
353                         {
354                             URL JavaDoc url = new URL JavaDoc(mRefererURL);
355                             mRssLink = url.getProtocol() + "://"
356                                     + url.getHost() + ":" + url.getPort()
357                                     + mRssLink;
358                         }
359                         catch (MalformedURLException JavaDoc e)
360                         {
361                             mRssLink = null;
362                             if (mLogger.isDebugEnabled())
363                             {
364                                 mLogger.debug("Determining RSS URL", e);
365                             }
366                         }
367                     }
368                     else if (!mRssLink.startsWith("http"))
369                     {
370                         int slash = mRefererURL.lastIndexOf("/");
371                         if (slash != -1)
372                         {
373                             mRssLink = mRefererURL.substring(0, slash) + "/"
374                                     + mRssLink;
375                         }
376                     }
377                     if (mLogger.isDebugEnabled())
378                     {
379                         mLogger.debug("Qualified RSS link is " + mRssLink);
380                     }
381                 }
382             }
383         }
384
385         /**
386          * Stop at the very first divider tag after the permalink.
387          *
388          * @param tag
389          * End tag
390          * @param pos
391          * Position in HTML file
392          */

393         public void handleEndTag(Tag tag, int pos)
394         {
395             if (mList.contains(tag) && mFound && mEnd == 0)
396             {
397                 mEnd = pos;
398             }
399             else if (mList.contains(tag) && !mFound)
400             {
401                 mStart = pos;
402             }
403             else
404             {
405                 mCurrentTag = null;
406             }
407         }
408
409         /**
410          * Get the page title
411          */

412         public void handleText(char[] data, int pos)
413         {
414             if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
415             {
416                 String JavaDoc newText = new String JavaDoc(data);
417                 if (mTitle.length() < 50)
418                 {
419                     mTitle += newText;
420                 }
421             }
422         }
423     }
424 }
425
426
Popular Tags