scala-news-reader

RSS/Atom news reader in Scala

git clone https://9o.is/git/scala-news-reader.git

package.scala

(6180B)


      1 package com.joereader.lib
      2 
      3 import dispatch._, Defaults._
      4 import org.jsoup._, nodes._
      5 import com.sun.syndication._
      6 import io._
      7 import feed.synd._
      8 
      9 import java.io._
     10 import scala.Some
     11 import scala.collection.JavaConversions._
     12 
     13 import net.liftweb._
     14 import common._
     15 import util.Helpers._
     16 
     17 /**
     18  * RSS/Atom helpers: fetching feeds and pages, parsing feeds, and finding feed links in html.
     19  */
     20 package object rss extends Logger {
     21 
     /* Eventual lookup of one entry: Left is an error message, Right is a Box around the entry. */
     22   type FutureFeedEntry = Future[Either[String, Box[FeedEntry]]]
     /* Eventual entries of a feed: Left is an error message, Right is the list of entries. */
     23   type FutureFeedEntries = Future[Either[String, List[FeedEntry]]]
     /* Eventual page response: Left is an error message, Right is the RSSHtmlResponse. */
     24   type FutureRSSHtmlResponse = Future[Either[String, RSSHtmlResponse]]
     25 
     implicit class RssImplicitString(str: String) {

       /**
        * Looks up a single entry in the feed located at this url.
        * @param guid guid of the wanted entry.
        * @return the entries error unchanged on the left; on the right a box
        *         with the matching entry, or a failure naming the guid when
        *         no entry carries it.
        */
       def entry(guid: String): FutureFeedEntry =
         entries.right.map { loaded =>
           val hit = loaded.find(_.guid == guid)
           Box(hit) ?~ s"RSS entry could not be found -> guid: $guid"
         }

       /**
        * Requests the feed located at this url and lists its entries.
        * @return an error message on the left, the feed's entries on the right.
        */
       def entries: FutureFeedEntries = {
         val body = catchThrowable(requestLink(str))
         extractFeed(body).right.map(parsed => Feed.build(parsed, List(str)).entries)
       }

       /**
        * Requests the web page located at this url through the
        * redirect-tracking `requestLink`, inside a Future.
        * @return the page response on the right, or a failure message on the
        *         left when no response was obtained.
        */
       def response: FutureRSSHtmlResponse = {
         val initial = RSSHtmlResponse(str, "", str)
         Future {
           val fetched = requestLink(Some(initial))
           fetched.toRight(s"Failed to get html response -> $str")
         }
       }

       /**
        * Treats this string as html text (it is parsed, not fetched) and
        * collects the rss links found in it.
        * @return list of rss links.
        */
       def rssLinks: List[String] = findRssLinks(str)
     }
     66 
     implicit class RssImplicitListOfString(strs: List[String]) {
       /**
        * Requests every feed link in this list and merges the results into a
        * single feed. A link whose feed yields an error is logged as a
        * warning and contributes `Feed.empty` to the merge.
        * @return the merged feed
        */
       def feed: Feed = {
         // Stage 1: start a request for every link before any result is awaited.
         val pending = strs.map(link => (catchThrowable(requestLink(link)), link))

         // Stage 2: turn each eventual body into an eventual feed, keeping its link.
         val building = pending.map {
           case (body, link) =>
             val built = extractFeed(body).right.map(syndFeed => Feed.build(syndFeed, List(link)))
             (built, link)
         }

         // Stage 3: obtain each result via `built()` and fall back to an empty feed on error.
         Feed.merge(building.map {
           case (built, link) =>
             built() match {
               case Right(ok) =>
                 ok
               case Left(msg) =>
                 warn(s"$msg -> $link")
                 Feed.empty
             }
         })
       }
     }
     94   
     implicit class FutureFeedEntriesHelper(entries: FutureFeedEntries) {
       /**
        * Obtains the result of the future via `entries()` and unwraps it:
        * an error is logged as a warning and yields an empty list, a success
        * yields its entries.
        * NOTE(review): `entries()` is presumably dispatch's blocking apply — confirm.
        */
       def present: List[FeedEntry] =
         entries().fold[List[FeedEntry]](
           msg => { warn(msg); Nil },
           loaded => loaded)
     }
    102 
    /**
     * Client shared by all feed requests: follows redirects and has
     * compression enabled. `Http.configure` creates a new client each time it
     * is called, so it is configured once here rather than once per request,
     * which would leave one never-closed client behind per call.
     */
    private lazy val feedHttp =
      Http.configure(_.
        setFollowRedirects(true).
        setCompressionEnabled(true))

    /**
     * Returns a result of the page content (as a string) requested by a link.
     * @param link the web link to the page.
     * @return eventually either the throwable that made the request fail
     *         (left) or the page content as a string (right).
     */
    protected[rss] def requestLink(link: String): Future[Either[Throwable, String]] = {
      val request = url(URLFormatter(link).toString)

      feedHttp(request OK as.String).either
    }
    115 
    /**
     * Client shared by the redirect-tracking requests. It only enables
     * compression; redirect following is not switched on because the caller
     * wants to see each `Location` header. Configured once, since
     * `Http.configure` creates a new client on every call.
     */
    private lazy val pageHttp = Http.configure(_.setCompressionEnabled(true))

    /**
     * Returns a result of the page content (as a string) requested by a link.
     * Note: Redirects can be handled by dispatch with setFollowRedirects as true,
     * but we want to retrieve the redirected url, so each hop is requested here.
     *
     * @param optreq        the response so far; its `redirectTo` is the next url
     *                      to request, or null when there is nothing left to follow.
     * @param redirectCount number of requests already issued; once it exceeds 2
     *                      (i.e. after three requests) `optreq` is returned as is.
     * @return `optreq` itself when it is empty, has no `redirectTo`, or the limit
     *         is reached; otherwise the outcome of following `redirectTo`, which
     *         is None when a request produced no response.
     */
    protected[rss] def requestLink(optreq: Option[RSSHtmlResponse], redirectCount: Int = 0): Option[RSSHtmlResponse] =
      optreq match {
        case Some(req) if req.redirectTo != null && redirectCount <= 2 =>
          val request = url(URLFormatter(req.redirectTo).toString)

          val response =
            pageHttp(request > {
              r =>
                // Only a redirected response carries a next hop; null marks the end of the chain.
                val next = if (r.isRedirected) r.getHeader("Location") else null
                RSSHtmlResponse(req.redirectTo, r.getResponseBody, next)
            }).option

          requestLink(response(), redirectCount + 1)

        case _ =>
          // Nothing to request, nothing left to follow, or redirect limit reached.
          optreq
      }
    141 
    /**
     * Converts the failure side of a response from a throwable into a message.
     * A throwable without a message (`getMessage` returns null) is described
     * by its `toString` instead, so the left side is never null.
     * @param res eventual response whose left side is a throwable.
     * @return the same response with the left side replaced by a message.
     */
    protected[rss] def catchThrowable(res: Future[Either[Throwable, String]]): Future[Either[String, String]] =
      res.map(_.left.map(e => Option(e.getMessage).getOrElse(e.toString)))
    144 
    /**
     * Finds link(s) to RSS feeds in HTML elements.
     * Only the children of the document head are inspected. An element counts
     * as a feed link when its `type` attribute is one of the feed mime types
     * (compared case-insensitively), it does not mention "comment" in any
     * letter case, and it has a non-empty `href`.
     * @param html html in plain text.
     * @return the trimmed href of every matching element, in document order.
     */
    protected[rss] def findRssLinks(html: String): List[String] = {

      val document = parseHtml(html)

      /* All the types of rss mimes (including atom.) */
      val mimes = Set(
        "application/rss+xml",
        "application/atom+xml",
        "application/xml",
        "text/xml")

      /* Locale-independent lower-casing so comparisons ignore letter case. */
      def lower(text: String): String = text.toLowerCase(java.util.Locale.ROOT)

      /* True when the element's type attribute is one of the feed mimes. */
      def hasFeedMime(e: Element): Boolean =
        mimes.contains(lower(e.attr("type").trim))

      /* True when the element is likely an rss comment feed. */
      def isCommentFeed(e: Element): Boolean =
        lower(e.toString).contains("comment")

      headElements(document)
        .filter(hasFeedMime)
        .filterNot(isCommentFeed)
        .map(_.attr("href").trim)
        .filter(_.nonEmpty) // an element without href is not a usable link
    }
    176 
    /**
     * Optional variant of `findRssLinks`: no html means no links.
     * @param html html in plain text, if any.
     * @return the rss links found in the html, or Nil when html is None.
     */
    protected[rss] def findRssLinks(html: Option[String]): List[String] =
      html.fold(List.empty[String])((markup: String) => findRssLinks(markup))
    179 
    /**
     * Takes an xml (as string) and builds a syndicated feed.
     * @param xml xml as string.
     * @return the syndicated feed if the xml is correctly parsed, otherwise
     *         the message "Unable to extract feed".
     */
    protected[rss] def extractFeed(xml: String): Either[String, SyndFeed] =
      tryo {
        // `xml` is already decoded text, so it is handed to the parser as
        // characters. Re-encoding it to UTF-8 bytes for an XmlReader lets a
        // differing encoding declared in the xml prolog mis-decode those bytes.
        // A leading byte-order mark is dropped because it is not valid before the prolog.
        val reader = new StringReader(xml.stripPrefix("\uFEFF"))
        new SyndFeedInput().build(reader)
      }.toRight("Unable to extract feed")
    191 
    /**
     * Eventual variant of `extractFeed`: once the content is available, an
     * error on the left is passed through unchanged and a body on the right
     * is parsed into a syndicated feed (or the parse error message).
     */
    protected[rss] def extractFeed(content: Future[Either[String, String]]): Future[Either[String, SyndFeed]] =
      content.map(body => body.right.flatMap(feedStr => extractFeed(feedStr)))
    198 
    /* Parses the given html text with Jsoup.parse and returns the resulting document. */
    199   protected[rss] def parseHtml(html: String): Document = Jsoup.parse(html)
    200 
    /**
     * Lists the direct children of the document's head element.
     * @param doc a parsed html document.
     * @return the head's child elements, or Nil when the document has no head.
     */
    protected[rss] def headElements(doc: Document): List[Element] = {
      // Explicit conversion rather than the deprecated implicit JavaConversions.
      import scala.collection.JavaConverters._

      Option(doc.head).map(_.children.asScala.toList).getOrElse(Nil)
    }
    203 }