package org.roller.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.Parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;

/**
 * Extracts a title, excerpt and permalink for a "linkback": a page (the
 * referer) that contains a link to one of our own URLs (the request URL).
 *
 * <p>The referer page is downloaded and parsed as HTML. The page title is
 * collected, and the text of the block-level element surrounding the first
 * anchor whose href equals the request URL becomes the excerpt. If the page
 * also advertises an RSS feed through a
 * <code>&lt;link type="application/rss+xml" title="RSS"&gt;</code> element,
 * that feed is fetched and the first entry whose description mentions the
 * request URL overrides the title and excerpt and supplies the permalink.</p>
 *
 * <p>All work happens in the constructor; failures are logged at debug level
 * and leave the corresponding getters at their initial values.</p>
 */
public class LinkbackExtractor
{
    private static final Log mLogger = LogFactory.getFactory().getInstance(
            LinkbackExtractor.class);

    /** True once a reference to the request URL has been seen. */
    private boolean mFound = false;
    private String mTitle = "";
    private String mRssLink = null;
    private String mExcerpt = null;
    private String mPermalink = null;

    /** Offsets into the buffered page text that bracket the excerpt. */
    private int mStart = 0;
    private int mEnd = 0;

    /** Maximum excerpt length before "..." is appended. */
    private int mMaxExcerpt = 500;

    /** Request URL without and with a leading "www." on the host. */
    private String mRequestURL = null;
    private String mRequestURLWWW = null;
    private String mRefererURL;

    /**
     * Fetches the referer page (and its RSS feed, if one is advertised) and
     * extracts linkback information about <code>requestURL</code>.
     *
     * <p>Extraction is best-effort: any exception is logged at debug level
     * and swallowed, so this constructor does not actually propagate the
     * declared exceptions. They remain in the signature for compatibility
     * with existing callers.</p>
     *
     * @param refererURL URL of the page that links to us
     * @param requestURL our own URL that the referer is expected to link to
     * @throws MalformedURLException declared for compatibility
     * @throws IOException declared for compatibility
     */
    public LinkbackExtractor(String refererURL, String requestURL)
            throws MalformedURLException, IOException
    {
        try
        {
            extractByParsingHtml(refererURL, requestURL);
            if (mRssLink != null)
            {
                extractByParsingRss(mRssLink, requestURL);
            }
        }
        catch (Exception e)
        {
            // Deliberate best-effort: a linkback we cannot analyse is not an
            // error for the caller.
            if (mLogger.isDebugEnabled())
            {
                mLogger.debug("Extracting linkback", e);
            }
        }
    }

    /**
     * Downloads the referer page, runs it through the Swing HTML parser and
     * derives the title and excerpt from what {@link LinkbackCallback}
     * recorded.
     *
     * @param refererURL URL of the page to download
     * @param requestURL URL that anchors in the page are compared against
     * @throws MalformedURLException if <code>refererURL</code> is not a valid URL
     * @throws IOException if the page cannot be read
     */
    private void extractByParsingHtml(String refererURL, String requestURL)
            throws MalformedURLException, IOException
    {
        URL url = new URL(refererURL);
        mRefererURL = refererURL;
        computeRequestUrlVariants(requestURL);

        // HTMLEditorKit.getParser() is protected; the anonymous subclass
        // exists only to make it callable from here.
        Parser parser = (new HTMLEditorKit()
        {
            public Parser getParser()
            {
                return super.getParser();
            }
        }).getParser();

        // Buffer the whole page: the excerpt is cut out of this text using
        // the offsets the parser reports, so the parser must see exactly the
        // same characters (line terminators are dropped while buffering).
        StringBuffer sb = new StringBuffer();
        InputStream is = url.openStream();
        try
        {
            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = br.readLine()) != null)
            {
                sb.append(line);
            }
        }
        finally
        {
            is.close();
        }

        String page = sb.toString();
        parser.parse(new StringReader(page), new LinkbackCallback(), true);

        if (mStart != 0 && mEnd != 0 && mEnd > mStart)
        {
            mExcerpt = cleanExcerpt(page.substring(mStart, mEnd));
        }

        // Drop a single leading '>' from the collected title text.
        if (mTitle.startsWith(">") && mTitle.length() > 1)
        {
            mTitle = mTitle.substring(1);
        }
    }

    /**
     * Stores the request URL in two spellings, without and with a "www."
     * host prefix, so that anchors using either form are recognised.
     * Works for any scheme written as <code>scheme://</code>; a URL without
     * "://" is stored unchanged in both fields.
     */
    private void computeRequestUrlVariants(String requestURL)
    {
        int schemeEnd = requestURL.indexOf("://");
        if (schemeEnd == -1)
        {
            mRequestURL = requestURL;
            mRequestURLWWW = requestURL;
            return;
        }

        String scheme = requestURL.substring(0, schemeEnd + 3);
        String rest = requestURL.substring(schemeEnd + 3);
        if (rest.startsWith("www."))
        {
            mRequestURLWWW = requestURL;
            mRequestURL = scheme + rest.substring(4);
        }
        else
        {
            mRequestURL = requestURL;
            mRequestURLWWW = scheme + "www." + rest;
        }
    }

    /**
     * Strips markup from <code>raw</code> via {@link Utilities#removeHTML}
     * and truncates the result to {@link #mMaxExcerpt} characters, appending
     * "..." when something was cut off.
     */
    private String cleanExcerpt(String raw)
    {
        String text = Utilities.removeHTML(raw);
        if (text.length() > mMaxExcerpt)
        {
            text = text.substring(0, mMaxExcerpt) + "...";
        }
        return text;
    }

    /**
     * Fetches the feed at <code>rssLink</code> and, for the first entry whose
     * description contains <code>requestURL</code>, takes the permalink,
     * title ("feed title: entry title" when the feed has a title) and
     * excerpt from that entry. Entries without a description are skipped.
     *
     * @param rssLink absolute URL of the feed
     * @param requestURL text searched for in each entry description
     * @throws IllegalArgumentException declared because the feed parser may throw it
     * @throws MalformedURLException if <code>rssLink</code> is not a valid URL
     * @throws FeedException if the feed cannot be parsed
     * @throws IOException if the feed cannot be read
     */
    private void extractByParsingRss(String rssLink, String requestURL)
            throws IllegalArgumentException, MalformedURLException,
            FeedException, IOException
    {
        SyndFeed feed;
        Reader feedReader = new InputStreamReader(new URL(rssLink).openStream());
        try
        {
            feed = new SyndFeedInput().build(feedReader);
        }
        finally
        {
            feedReader.close();
        }

        String feedTitle = feed.getTitle();
        if (mLogger.isDebugEnabled())
        {
            mLogger.debug("Feed parsed, title: " + feedTitle);
        }

        int count = 0;
        Iterator itemIter = feed.getEntries().iterator();
        while (itemIter.hasNext())
        {
            count++;
            SyndEntry item = (SyndEntry) itemIter.next();

            String description = (item.getDescription() == null)
                    ? null : item.getDescription().getValue();
            if (description == null || description.indexOf(requestURL) == -1)
            {
                continue;
            }

            mFound = true;

            Object link = item.getLink();
            mPermalink = (link == null) ? null : link.toString();

            if (feedTitle != null && feedTitle.trim().length() > 0)
            {
                mTitle = feedTitle + ": " + item.getTitle();
            }
            else
            {
                mTitle = item.getTitle();
            }

            mExcerpt = cleanExcerpt(description);
            break;
        }

        if (mLogger.isDebugEnabled())
        {
            mLogger.debug("Parsed " + count + " articles, found linkback="
                    + mFound);
        }
    }

    /**
     * Returns the excerpt surrounding the linkback.
     *
     * @return the excerpt, or <code>null</code> if none was extracted
     */
    public String getExcerpt()
    {
        return mExcerpt;
    }

    /**
     * Returns the title of the linking page or feed entry.
     *
     * @return the title; the empty string if the page had no title text
     */
    public String getTitle()
    {
        return mTitle;
    }

    /**
     * Returns the permalink of the linking feed entry.
     *
     * @return the permalink, or <code>null</code> if none was found or set
     */
    public String getPermalink()
    {
        return mPermalink;
    }

    /**
     * Overrides the permalink.
     *
     * @param permalink the permalink to report from {@link #getPermalink()}
     */
    public void setPermalink(String permalink)
    {
        mPermalink = permalink;
    }

    /**
     * Parser callback that records, in the enclosing extractor, the page
     * title, the advertised RSS feed link, and the offsets of the block-level
     * tags immediately before and after the first anchor pointing at the
     * request URL.
     */
    private final class LinkbackCallback extends ParserCallback
    {
        /** Tags treated as excerpt boundaries. */
        private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
                Tag.BLOCKQUOTE, Tag.P, Tag.LI,
                Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
                Tag.H2, Tag.H3, Tag.H4, Tag.H5,
                Tag.H6 };

        private List mList = Arrays.asList(mDivTags);

        /** Most recently opened tag; used to recognise text inside TITLE. */
        private Tag mCurrentTag = null;

        /**
         * Until the link is found, every boundary tag moves the excerpt
         * start forward; the first boundary tag after the link fixes the
         * excerpt end. Anchors are compared (ignoring any "#fragment")
         * against both spellings of the request URL.
         */
        public void handleStartTag(Tag tag, MutableAttributeSet atts, int pos)
        {
            if (mList.contains(tag))
            {
                if (!mFound)
                {
                    mStart = pos;
                }
                else if (mEnd == 0)
                {
                    mEnd = pos;
                }
            }
            else if (tag.equals(Tag.A))
            {
                String href = (String) atts.getAttribute(HTML.Attribute.HREF);
                if (href != null)
                {
                    int hashPos = href.lastIndexOf('#');
                    if (hashPos != -1)
                    {
                        href = href.substring(0, hashPos);
                    }
                    if (href.equals(mRequestURL)
                            || href.equals(mRequestURLWWW))
                    {
                        mFound = true;
                    }
                }
            }
            mCurrentTag = tag;
        }

        /**
         * Handles empty-element tags: a boundary tag after the link fixes the
         * excerpt end, and a LINK element of type "application/rss+xml"
         * titled "RSS" supplies the feed URL, resolved against the referer
         * URL when it is relative.
         */
        public void handleSimpleTag(Tag tag, MutableAttributeSet atts, int pos)
        {
            if (mList.contains(tag) && mFound && mEnd == 0)
            {
                mEnd = pos;
                return;
            }
            if (!tag.equals(Tag.LINK))
            {
                return;
            }

            String title = (String) atts.getAttribute(HTML.Attribute.TITLE);
            String type = (String) atts.getAttribute(HTML.Attribute.TYPE);
            if (title == null || type == null
                    || !type.equals("application/rss+xml")
                    || !title.equals("RSS"))
            {
                return;
            }

            String href = (String) atts.getAttribute(HTML.Attribute.HREF);
            if (href == null || href.trim().length() == 0)
            {
                // A feed link without a target is useless; ignore it rather
                // than abort parsing of the whole page.
                return;
            }

            if (mLogger.isDebugEnabled())
            {
                mLogger.debug("Found RSS link " + href);
            }

            try
            {
                // URL resolution handles absolute, host-relative ("/x") and
                // path-relative ("x") links alike, and never emits a bogus
                // ":-1" port when the referer URL has no explicit port.
                mRssLink = new URL(new URL(mRefererURL), href.trim()).toString();
            }
            catch (MalformedURLException e)
            {
                mRssLink = null;
                if (mLogger.isDebugEnabled())
                {
                    mLogger.debug("Determining RSS URL", e);
                }
            }

            if (mLogger.isDebugEnabled())
            {
                mLogger.debug("Qualified RSS link is " + mRssLink);
            }
        }

        /**
         * Closing boundary tags are treated like opening ones: before the
         * link is found they advance the excerpt start, afterwards the first
         * one fixes the excerpt end. Any other end tag clears the current
         * tag so that later text is no longer attributed to it.
         */
        public void handleEndTag(Tag tag, int pos)
        {
            if (mList.contains(tag) && mFound && mEnd == 0)
            {
                mEnd = pos;
            }
            else if (mList.contains(tag) && !mFound)
            {
                mStart = pos;
            }
            else
            {
                mCurrentTag = null;
            }
        }

        /**
         * Appends text found inside TITLE to the page title, as long as the
         * title collected so far is shorter than 50 characters.
         */
        public void handleText(char[] data, int pos)
        {
            if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
            {
                if (mTitle.length() < 50)
                {
                    mTitle += new String(data);
                }
            }
        }
    }
}