scala-news-reader
RSS/Atom news reader in Scala
git clone https://9o.is/git/scala-news-reader.git
package.scala
(6180B)
1 package com.joereader.lib
2
3 import dispatch._, Defaults._
4 import org.jsoup._, nodes._
5 import com.sun.syndication._
6 import io._
7 import feed.synd._
8
9 import java.io._
10 import scala.Some
11 import scala.collection.JavaConversions._
12
13 import net.liftweb._
14 import common._
15 import util.Helpers._
16
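/**
 * Helpers for requesting pages and RSS/Atom feeds, parsing them into feeds,
 * and discovering feed links in HTML. Importing this package object also
 * brings the implicit enrichments for `String` and `List[String]` into scope.
 */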
20 package object rss extends Logger {
21
/** Eventual single-entry lookup: an error message on the left, or the boxed entry on the right. */
type FutureFeedEntry =
  Future[Either[String, Box[FeedEntry]]]

/** Eventual list of feed entries, or an error message on the left. */
type FutureFeedEntries =
  Future[Either[String, List[FeedEntry]]]

/** Eventual HTML response for a page, or an error message on the left. */
type FutureRSSHtmlResponse =
  Future[Either[String, RSSHtmlResponse]]
25
/**
 * Feed-related operations on a plain string. For `entry`, `entries` and
 * `response` the string is used as a url; for `rssLinks` it is parsed as html.
 */
implicit class RssImplicitString(str: String) {

  /**
   * Looks up the single feed entry whose guid equals the requested one.
   * @param guid guid of the wanted entry.
   * @return the boxed entry; when no entry matches, the empty box is given
   *         the "could not be found" message via `?~`.
   */
  def entry(guid: String): FutureFeedEntry =
    entries.right.map { all =>
      val hit = all.find(_.guid == guid)
      Box(hit) ?~ s"RSS entry could not be found -> guid: $guid"
    }

  /**
   * Requests the feed behind this url and returns all of its entries.
   * @return the entries on the right, or an error message on the left.
   */
  def entries: FutureFeedEntries = {
    val fetched = catchThrowable(requestLink(str))
    extractFeed(fetched).right.map(parsed => Feed.build(parsed, List(str)).entries)
  }

  /**
   * Requests the web page behind this url, tracking redirects by hand.
   * The blocking request runs inside a `Future`.
   * @return the html response on the right, or an error message on the left.
   */
  def response: FutureRSSHtmlResponse = {
    // Seed value: no body yet, and the first "redirect" target is the url itself.
    val initial = RSSHtmlResponse(str, "", str)
    Future {
      val fetched = requestLink(Some(initial))
      fetched.toRight(s"Failed to get html response -> $str")
    }
  }

  /**
   * Treats this string as html markup and collects the rss links declared
   * in its head.
   * @return list of rss links found in the markup.
   */
  def rssLinks: List[String] = findRssLinks(str)
}
66
/** Feed operations on a list of urls, each pointing at an rss feed. */
implicit class RssImplicitListOfString(strs: List[String]) {
  /**
   * Requests every url in the list and merges the resulting feeds into one.
   * All requests are started before the calling thread blocks on any of them.
   * A url whose feed cannot be fetched or parsed is logged as a warning and
   * contributes `Feed.empty`.
   * @return the merged syndicated feed
   */
  def feed: Feed = {
    // Stage 1: start a request per link (non-blocking).
    val requested = strs.map(link => (catchThrowable(requestLink(link)), link))

    // Stage 2: attach feed parsing/building to each pending response (non-blocking).
    val building = requested.map { case (body, link) =>
      val built = extractFeed(body).right.map(syndFeed => Feed.build(syndFeed, List(link)))
      (built, link)
    }

    // Stage 3: wait for each result in turn, substituting an empty feed on failure.
    Feed.merge(building.map { case (built, link) =>
      built() match {
        case Right(ok) =>
          ok
        case Left(msg) =>
          warn(s"$msg -> $link")
          Feed.empty
      }
    })
  }
}
94
/** Convenience for consuming a [[FutureFeedEntries]] synchronously. */
implicit class FutureFeedEntriesHelper(entries: FutureFeedEntries) {
  /**
   * Blocks until the entries are available.
   * @return the entries, or `Nil` after logging the error message as a warning.
   */
  def present: List[FeedEntry] =
    entries().fold[List[FeedEntry]](
      msg => { warn(msg); Nil },
      found => found
    )
}
102
/**
 * HTTP client that follows redirects and accepts compressed responses.
 *
 * `Http.configure` builds a new underlying client every time it is called,
 * so the configured instance is created once (lazily) and shared between
 * requests instead of being rebuilt, and never released, on every call.
 */
private[this] lazy val redirectFollowingHttp =
  Http.configure(_.
    setFollowRedirects(true).
    setCompressionEnabled(true))

/**
 * Returns a result of the page content (as a string) requested by a link.
 * Redirects are followed by the http client.
 * @param link the web link to the page.
 * @return a future holding either the failure (left) or, for an OK response,
 *         the content as a string (right)
 */
protected[rss] def requestLink(link: String): Future[Either[Throwable, String]] = {
  val request = url(URLFormatter(link).toString)

  redirectFollowingHttp(request OK as.String).either
}
115
/**
 * HTTP client for the manual redirect walk: compression enabled, redirect
 * following left at the client's default so each hop can be inspected here.
 *
 * `Http.configure` builds a new underlying client every time it is called,
 * so the configured instance is created once (lazily) and reused for every
 * hop rather than rebuilt, and never released, per request.
 */
private[this] lazy val manualRedirectHttp =
  Http.configure(_.setCompressionEnabled(true))

/**
 * Returns a result of the page content (as a string) requested by a link.
 * Note: Redirects can be handled by dispatch with setFollowRedirects as true,
 * but we want to retrieve the redirected url, so each hop is requested here.
 * At most three requests are made. This call blocks on every request.
 *
 * @param optreq        the previous response; its `redirectTo` is the next url to
 *                      request, and a null `redirectTo` means there is nothing
 *                      left to follow.
 * @param redirectCount number of requests already made in this walk.
 * @return the last response obtained; `optreq` itself when it is empty, has no
 *         redirect target, or the limit is reached; `None` when a request fails.
 */
protected[rss] def requestLink(optreq: Option[RSSHtmlResponse], redirectCount: Int = 0): Option[RSSHtmlResponse] =
  optreq match {
    case Some(req) if req.redirectTo != null && redirectCount <= 2 =>
      val request = url(URLFormatter(req.redirectTo).toString)

      val response =
        manualRedirectHttp(request > { r =>
          // Only a redirected response carries a further target; null ends the walk.
          val next = if (r.isRedirected) r.getHeader("Location") else null
          RSSHtmlResponse(req.redirectTo, r.getResponseBody, next)
        }).option

      requestLink(response(), redirectCount + 1)

    case _ =>
      optreq
  }
141
/**
 * Replaces the throwable of a failed request with a printable message and
 * leaves a successful result untouched.
 *
 * A throwable may carry no message (`getMessage` returns null); in that case
 * its `toString` is used so the left side is never null.
 *
 * @param res the pending request result.
 * @return the same result with the failure reduced to a message.
 */
protected[rss] def catchThrowable(res: Future[Either[Throwable, String]]): Future[Either[String, String]] =
  res.map(_.left.map(e => Option(e.getMessage).getOrElse(e.toString)))
144
/**
 * Finds link(s) to RSS feeds in HTML elements.
 *
 * Only direct children of the document head are inspected. An element is kept
 * when its `type` attribute is one of the feed mime types (compared without
 * regard to case or surrounding whitespace) and its markup does not mention
 * "comment". Elements without an `href` are dropped instead of yielding an
 * empty link.
 *
 * @param html html in plain text.
 * @return the trimmed, non-empty `href` of every matching element.
 */
protected[rss] def findRssLinks(html: String): List[String] = {

  /* All the types of rss mimes (including atom.) */
  val mimes = List(
    "application/rss+xml",
    "application/atom+xml",
    "application/xml",
    "text/xml")

  /* True when the element's type attribute is one of the feed mimes. */
  def hasFeedMimeType(elem: Element): Boolean =
    mimes.contains(elem.attr("type").trim.toLowerCase(java.util.Locale.ROOT))

  /* Elements mentioning "comment" are likely rss comment feeds. */
  def looksLikeCommentFeed(elem: Element): Boolean =
    elem.toString.contains("comment")

  val candidates = headElements(parseHtml(html))
  val feedElements = candidates.filter(hasFeedMimeType).filterNot(looksLikeCommentFeed)

  /* attr returns "" for a missing attribute; such entries are not links. */
  feedElements.map(_.attr("href").trim).filter(_.nonEmpty)
}
176
/**
 * Optional-input variant of the rss link finder.
 * @param html html in plain text, if any.
 * @return the rss links found in the html, or `Nil` when there is no html.
 */
protected[rss] def findRssLinks(html: Option[String]): List[String] =
  html match {
    case Some(markup) => findRssLinks(markup)
    case None         => Nil
  }
179
/**
 * Takes an xml (as string) and builds a syndicated feed.
 *
 * The text is parsed through a character reader. It is already decoded, so
 * turning it back into UTF-8 bytes for an encoding-sniffing reader would let
 * an encoding named in the xml prolog be applied to bytes that are not in
 * that encoding, garbling non-ASCII text.
 *
 * @param xml xml as string.
 * @return a syndicated feed if the xml is correctly parsed, otherwise the
 *         message "Unable to extract feed".
 */
protected[rss] def extractFeed(xml: String): Either[String, SyndFeed] =
  tryo {
    // A byte-order mark left over from decoding would precede the xml prolog.
    val text = xml.stripPrefix("\uFEFF")
    new SyndFeedInput().build(new StringReader(text))
  }.toRight("Unable to extract feed")
191
/**
 * Builds a syndicated feed from a pending page request.
 * @param content the pending request: an error message or the xml as string.
 * @return the incoming error message unchanged, the parse error, or the feed.
 */
protected[rss] def extractFeed(content: Future[Either[String, String]]): Future[Either[String, SyndFeed]] =
  content.map { fetched =>
    fetched.right.flatMap(body => extractFeed(body))
  }
198
/**
 * Parses html text into a jsoup document.
 * @param html html in plain text.
 */
protected[rss] def parseHtml(html: String): Document = {
  val document = Jsoup.parse(html)
  document
}
200
/**
 * Lists the direct children of the document's head element.
 * @param doc the parsed document.
 */
protected[rss] def headElements(doc: Document): List[Element] = {
  val head = doc.head
  head.children.toList
}
203 }