LinkbackExtractor


1   /*
2   * Licensed to the Apache Software Foundation (ASF) under one or more
3   *  contributor license agreements.  The ASF licenses this file to You
4   * under the Apache License, Version 2.0 (the "License"); you may not
5   * use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *     http://www.apache.org/licenses/LICENSE-2.0
9   *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.  For additional information regarding
15  * copyright in this work, please see the NOTICE file in the top level
16  * directory of this distribution.
17  */
18  package org.apache.roller.util;
19  
20  import java.io.BufferedReader  ;
21  import java.io.IOException  ;
22  import java.io.InputStream  ;
23  import java.io.InputStreamReader  ;
24  import java.io.StringReader  ;
25  import java.net.MalformedURLException  ;
26  import java.net.URL  ;
27  import java.util.Arrays  ;
28  import java.util.Iterator  ;
29  import java.util.List  ;
30  
31  import javax.swing.text.MutableAttributeSet  ;
32  import javax.swing.text.html.HTML  ;
33  import javax.swing.text.html.HTMLEditorKit  ;
34  import javax.swing.text.html.HTML.Tag;
35  import javax.swing.text.html.HTMLEditorKit.Parser;
36  import javax.swing.text.html.HTMLEditorKit.ParserCallback;
37  
38  import org.apache.commons.logging.Log;
39  import org.apache.commons.logging.LogFactory;
40  
41  import com.sun.syndication.feed.synd.SyndEntry;
42  import com.sun.syndication.feed.synd.SyndFeed;
43  import com.sun.syndication.io.FeedException;
44  import com.sun.syndication.io.SyndFeedInput;
45  
46  /**
47   * Parses HTML file for referring linkback title and excerpt.
48   * 
49   * @author David M Johnson
50   */
51  public class LinkbackExtractor
52  {
53      private static Log mLogger        = LogFactory.getFactory().getInstance(
54                                                LinkbackExtractor.class);
55      private boolean    mFound         = false;
56      private String       mTitle         = "";
57      private String       mRssLink       = null;
58      private String       mExcerpt       = null;
59      private String       mPermalink     = null;
60      private int        mStart         = 0;
61      private int        mEnd           = 0;
62      private int        mMaxExcerpt    = 500;                           // characters
63      private String       mRequestURL    = null;
64      private String       mRequestURLWWW = null;
65      private String       mRefererURL;
66  
67      //------------------------------------------------------------------------
68      /**
69       * Extract referring page title, excerpt, and permalink.
70       * 
71       * @param refererUrl
72       * @param requestUrl
73       */
74      public LinkbackExtractor(String   refererURL, String   requestURL)
75              throws MalformedURLException  , IOException  
76      {
77          try
78          {
79              extractByParsingHtml(refererURL, requestURL);
80              if (mRssLink != null)
81              {
82                  extractByParsingRss(mRssLink, requestURL);
83              }
84          }
85          catch (Exception   e)
86          {
87              if (mLogger.isDebugEnabled())
88              {
89                  mLogger.debug("Extracting linkback", e);
90              }
91          }
92      }
93  
94      //------------------------------------------------------------------------
95      private void extractByParsingHtml(String   refererURL, String   requestURL)
96              throws MalformedURLException  , IOException  
97      {
98          URL   url = new URL  (refererURL);
99          InputStream   is = url.openStream();
100 
101         mRefererURL = refererURL;
102 
103         if (requestURL.startsWith("http://www."))
104         {
105             mRequestURLWWW = requestURL;
106             mRequestURL = "http://" + mRequestURLWWW.substring(11);
107         }
108         else
109         {
110             mRequestURL = requestURL;
111             mRequestURLWWW = "http://www." + mRequestURL.substring(7);
112         }
113 
114         // Trick gets Swing's HTML parser
115         Parser parser = (new HTMLEditorKit  () {
116             public Parser getParser()
117             {
118                 return super.getParser();
119             }
120         }).getParser();
121 
122         // Read HTML file into string
123         StringBuffer   sb = new StringBuffer  ();
124         InputStreamReader   isr = new InputStreamReader  (is);
125         BufferedReader   br = new BufferedReader  (isr);
126         try
127         {
128             String   line = null;
129             while ((line = br.readLine()) != null)
130             {
131                 sb.append(line);
132             }
133         }
134         finally
135         {
136             br.close();
137         }
138 
139         // Parse HTML string to find title and start and end position
140         // of the referring excerpt.
141         StringReader   sr = new StringReader  (sb.toString());
142         parser.parse(sr, new LinkbackCallback(), true);
143 
144         if (mStart != 0 && mEnd != 0 && mEnd > mStart)
145         {
146             mExcerpt = sb.toString().substring(mStart, mEnd);
147             mExcerpt = Utilities.removeHTML(mExcerpt);
148 
149             if (mExcerpt.length() > mMaxExcerpt)
150             {
151                 mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
152             }
153         }
154 
155         if (mTitle.startsWith(">") && mTitle.length() > 1)
156         {
157             mTitle = mTitle.substring(1);
158         }
159     }
160 
161     //------------------------------------------------------------------------
162     private void extractByParsingRss(String   rssLink, String   requestURL)
163             throws IllegalArgumentException  , MalformedURLException  , FeedException, IOException  
164     {
165         SyndFeedInput feedInput = new SyndFeedInput();       
166         SyndFeed feed = feedInput.build(
167             new InputStreamReader  (new URL  (rssLink).openStream()));
168         Iterator   itemIter = feed.getEntries().iterator();
169         String   feedTitle = feed.getTitle();
170 
171         int count = 0;
172 
173         if (mLogger.isDebugEnabled())
174         {
175             mLogger.debug("Feed parsed, title: " + feedTitle);
176         }
177 
178         while (itemIter.hasNext())
179         {
180             count++;
181             SyndEntry item = (SyndEntry) itemIter.next();
182             if (item.getDescription().getValue().indexOf(requestURL) != -1)
183             {
184                 mFound = true;
185                 mPermalink = item.getLink().toString();
186                 if (feedTitle != null && feedTitle.trim().length() > 0)
187                 {
188                     mTitle = feedTitle + ": " + item.getTitle();
189                 }
190                 else
191                 {
192                     mTitle = item.getTitle();
193                 }
194                 mExcerpt = item.getDescription().getValue();
195                 mExcerpt = Utilities.removeHTML(mExcerpt);
196                 if (mExcerpt.length() > mMaxExcerpt)
197                 {
198                     mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
199                 }
200                 break;
201             }
202         }
203 
204         if (mLogger.isDebugEnabled())
205         {
206             mLogger.debug("Parsed " + count + " articles, found linkback="
207                     + mFound);
208         }
209     }
210 
211     //------------------------------------------------------------------------
212     /**
213      * Returns the excerpt.
214      * 
215      * @return String
216      */
217     public String   getExcerpt()
218     {
219         return mExcerpt;
220     }
221 
222     //------------------------------------------------------------------------
223     /**
224      * Returns the title.
225      * 
226      * @return String
227      */
228     public String   getTitle()
229     {
230         return mTitle;
231     }
232 
233     //------------------------------------------------------------------------
234     /**
235      * Returns the permalink.
236      * 
237      * @return String
238      */
239     public String   getPermalink()
240     {
241         return mPermalink;
242     }
243 
244     //------------------------------------------------------------------------
245     /**
246      * Sets the permalink.
247      * 
248      * @param permalink
249      *            The permalink to set
250      */
251     public void setPermalink(String   permalink)
252     {
253         mPermalink = permalink;
254     }
255 
256     /////////////////////////////////////////////////////////////////////////
257 
258     /**
259      * Parser callback that finds title and excerpt. As we walk through the HTML
260      * tags, we keep track of the most recently encountered divider tag in the
261      * mStart field. Once we find the referring permalink, we set the mFound
262      * flag. After that, we look for the next divider tag and save it's position
263      * in the mEnd field.
264      */
265     private final class LinkbackCallback extends ParserCallback
266     {
267         // Dividers
268         private Tag[] mDivTags    = { Tag.TD, Tag.DIV, Tag.SPAN,
269                                           Tag.BLOCKQUOTE, Tag.P, Tag.LI,
270                                           Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
271                                           Tag.H2, Tag.H3, Tag.H4, Tag.H5,
272                                           Tag.H6 };
273 
274         private List    mList       = Arrays.asList(mDivTags);
275 
276         private Tag   mCurrentTag = null;
277 
278         /**
279          * Look for divider tags and for the permalink.
280          * 
281          * @param tag
282          *            HTML tag
283          * @param atts
284          *            Attributes of that tag
285          * @param pos
286          *            Tag's position in file
287          */
288         public void handleStartTag(Tag tag, MutableAttributeSet   atts, int pos)
289         {
290             if (mList.contains(tag) && !mFound)
291             {
292                 mStart = pos;
293             }
294             else if (mList.contains(tag) && mFound && mEnd == 0)
295             {
296                 mEnd = pos;
297             }
298             else if (tag.equals(Tag.A))
299             {
300                 String   href = (String  ) atts.getAttribute(HTML.Attribute.HREF);
301                 if (href == null)
302                     return;
303                 int hashPos = href.lastIndexOf('#');
304                 if (hashPos != -1)
305                 {
306                     href = href.substring(0, hashPos);
307                 }
308                 if (href != null
309                         && (href.equals(mRequestURL) || href
310                                 .equals(mRequestURLWWW)))
311                 {
312                     mFound = true;
313                 }
314                 else
315                 {
316                     /*
317                      * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
318                      * "+href); }
319                      */
320                 }
321             }
322             mCurrentTag = tag;
323         }
324 
325         /**
326          * Needed to handle SPAN tag.
327          */
328         public void handleSimpleTag(Tag tag, MutableAttributeSet   atts, int pos)
329         {
330             if (mList.contains(tag) && mFound && mEnd == 0)
331             {
332                 mEnd = pos;
333             }
334             else if (tag.equals(Tag.LINK))
335             {
336                 // Look out for RSS autodiscovery link
337                 String   title = (String  ) atts.getAttribute(HTML.Attribute.TITLE);
338                 String   type = (String  ) atts.getAttribute(HTML.Attribute.TYPE);
339                 if (title != null && type != null
340                         && type.equals("application/rss+xml")
341                         && title.equals("RSS"))
342                 {
343                     mRssLink = (String  ) atts.getAttribute(HTML.Attribute.HREF);
344 
345                     if (mLogger.isDebugEnabled())
346                     {
347                         mLogger.debug("Found RSS link " + mRssLink);
348                     }
349 
350                     if (mRssLink.startsWith("/") && mRssLink.length() > 1)
351                     {
352                         try
353                         {
354                             URL   url = new URL  (mRefererURL);
355                             mRssLink = url.getProtocol() + "://"
356                                     + url.getHost() + ":" + url.getPort()
357                                     + mRssLink;
358                         }
359                         catch (MalformedURLException   e)
360                         {
361                             mRssLink = null;
362                             if (mLogger.isDebugEnabled())
363                             {
364                                 mLogger.debug("Determining RSS URL", e);
365                             }
366                         }
367                     }
368                     else if (!mRssLink.startsWith("http"))
369                     {
370                         int slash = mRefererURL.lastIndexOf("/");
371                         if (slash != -1)
372                         {
373                             mRssLink = mRefererURL.substring(0, slash) + "/"
374                                     + mRssLink;
375                         }
376                     }
377                     if (mLogger.isDebugEnabled())
378                     {
379                         mLogger.debug("Qualified RSS link is " + mRssLink);
380                     }
381                 }
382             }
383         }
384 
385         /**
386          * Stop at the very first divider tag after the permalink.
387          * 
388          * @param tag
389          *            End tag
390          * @param pos
391          *            Position in HTML file
392          */
393         public void handleEndTag(Tag tag, int pos)
394         {
395             if (mList.contains(tag) && mFound && mEnd == 0)
396             {
397                 mEnd = pos;
398             }
399             else if (mList.contains(tag) && !mFound)
400             {
401                 mStart = pos;
402             }
403             else
404             {
405                 mCurrentTag = null;
406             }
407         }
408 
409         /**
410          * Get the page title
411          */
412         public void handleText(char[] data, int pos)
413         {
414             if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
415             {
416                 String   newText = new String  (data);
417                 if (mTitle.length() < 50)
418                 {
419                     mTitle += newText;
420                 }
421             }
422         }
423     }
424 }
425 
426
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags