scala-news-reader
rss/atom news reader in scala
git clone https://9o.is/git/scala-news-reader.git
FeedEntry.scala
(6323B)
1 package com.joereader.lib.rss
2
3 import com.sun.syndication._
4 import feed.synd._
5 import feed.module.mediarss._, types._
6
7 import org.jsoup._
8 import nodes._
9 import select._
10 import parser._
11 import safety._
12
13 import net.liftweb.util.PCDataXmlParser
14
15 import scala.xml.NodeSeq
16 import scala.collection.JavaConversions._
17 import java.util.Date
18 import java.text.SimpleDateFormat
19
20 import com.joereader.lib._
21
22 import net.liftweb.common._
23
/**
 * An RSS Feed Entry/Item.
 *
 * Carries the entry's guid, title, link, date, author, the images found for
 * the entry and the entry content as XML nodes.
 */
case class FeedEntry(
  guid: String,
  title: String,
  link: String,
  date: Date,
  author: FeedAuthor,
  images: List[FeedImage],
  content: NodeSeq) {

  /**
   * The entry date rendered with the pattern yyyy-MM-dd'T'HH:mm:ssZ.
   * A new formatter is created on every call.
   */
  def dateFormatted: String =
    new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").format(date)

  /** The entry images, minus any whose source equals the author's image URL. */
  def nonAuthorImages: List[FeedImage] =
    images.filter(image => !author.imgUrl.exists(url => url == image.src))
}
38
39 object FeedEntry extends Logger {
40
/*
 * Query parameters that are stripped from image sources whenever possible
 * (a cheat: these are the sizing/cropping parameters).
 */
val invalidImgParams: List[String] =
  "w" :: "width" :: "h" :: "height" :: "crop" :: "size" :: "s" :: Nil
47
/* An image whose source contains any of these keywords is rejected. */
val invalidImgSources: List[String] =
  "feeds.feedburner.com" :: "feedsportal.com" :: "subscribe" :: Nil
51
/**
 * Builds a FeedEntry from a single syndication entry.
 *
 * The content is built first because the image lookup reads it; the author
 * lookup in turn reads the images that were found.
 */
def build(entry: SyndEntry): FeedEntry = {
  val body = content(entry)
  val pictures = mediaImages(entry, body)

  FeedEntry(
    guid(entry),
    entry.getTitle,
    entry.getLink,
    entry.getPublishedDate,
    author(entry, pictures),
    pictures,
    body)
}
65
/** A placeholder entry: blank strings, the current date, no images, no content. */
def empty: FeedEntry = {
  val blank = ""
  FeedEntry(blank, blank, blank, new Date(), FeedAuthor(blank), Nil, NodeSeq.Empty)
}
67
/** Builds one FeedEntry per syndication entry, keeping the list order. */
def build(entries: List[SyndEntry]): List[FeedEntry] =
  for (entry <- entries) yield build(entry)
69
/** Builds the entries when a list is present; an absent list yields Nil. */
def build(entries: Option[List[SyndEntry]]): List[FeedEntry] =
  entries match {
    case Some(list) => build(list)
    case None => Nil
  }
72
/**
 * Create FeedAuthor.
 *
 * The author image is the first entry image whose source contains
 * "gravatar.com"; failing that, the thumbnail found in the GData module of
 * the entry's first author (used by blogger.com).
 *
 * @param entry  the syndication entry the author is read from
 * @param images the images already collected for this entry
 * @return the author name from the entry together with an optional image URL
 */
def author(entry: SyndEntry, images: List[FeedImage]): FeedAuthor = {
  import gdata._

  // Thumbnail from the Author GData namespace. A missing module, a null
  // thumbnail and an empty thumbnail all count as "no image" (None), so the
  // author never ends up with Some("") or Some(null) as its image URL.
  val gdataImage: Option[String] =
    entry.getAuthors.asInstanceOf[java.util.List[SyndPerson]].
      headOption.
      flatMap(person =>
        Option(person.getModule(GDataModule.URI).asInstanceOf[GDataModule])).
      flatMap(module => Option(module.getThumbnail)).
      filter(_ != "")

  val gravatarImage: Option[String] =
    images.find(_.src.contains("gravatar.com")).map(_.src)

  // A gravatar image takes precedence over the GData thumbnail.
  FeedAuthor(entry.getAuthor, gravatarImage orElse gdataImage)
}
92
/**
 * Find all images in entry.
 *
 * Sources are collected from two places, in this order:
 *  1. Media RSS contents of the entry whose medium is "image" (with the
 *     query parameters listed in invalidImgParams stripped);
 *  2. img elements inside the already-built content, skipping those whose
 *     declared width or height is 25 or less.
 *
 * Empty sources and sources containing one of invalidImgSources are dropped,
 * and duplicates are removed while keeping the first occurrence in place.
 *
 * @param entry   the syndication entry to inspect
 * @param content the entry content, as built by content(entry)
 * @return the distinct images, in order of first appearance
 */
def mediaImages(entry: SyndEntry, content: NodeSeq): List[FeedImage] = {

  // Source of a Media RSS item, if it is an image addressed by a URL.
  // Other reference kinds are skipped rather than cast blindly.
  def imageUrl(media: MediaContent): Option[String] =
    media.getReference match {
      case ref: UrlReference if media.getMedium == "image" =>
        Option(ref.getUrl).map(url =>
          URLFormatter(url.toString).
            urlWithoutQueryParams(invalidImgParams: _*))
      case _ => None
    }

  // A missing size attribute is accepted; a declared one must exceed 25.
  def bigEnough(dimension: String): Boolean =
    dimension == "" || dimension.toInt > 25

  val declaredSources: List[String] =
    Option(entry.getModule(MediaModule.URI).asInstanceOf[MediaEntryModule]).
      toList.
      flatMap(module => Option(module.getMediaContents).toList).
      flatMap(_.toList).
      flatMap(media => imageUrl(media).toList)

  val inlineSources: List[String] =
    (content \\ "img").toList.flatMap { img =>
      val width = (img \ "@width").text
      val height = (img \ "@height").text
      val src = (img \ "@src").text

      try {
        if (bigEnough(width) && bigEnough(height)) List(src) else Nil
      } catch {
        // Only a non-numeric size is expected here; the image is skipped.
        case e: NumberFormatException =>
          warn("Finding image: "+ e.getMessage)
          Nil
      }
    }

  (declaredSources ::: inlineSources).
    filterNot(_ == "").
    filterNot(src =>
      invalidImgSources.exists(keyword => src.contains(keyword))
    ). // feedburner's bottom image links
    distinct. // de-duplicate without losing document order
    map(src => FeedImage(src))
}
139
/**
 * Grabs and formats the entry's content.
 *
 * The raw HTML is the value of the entry's first content element, falling
 * back to the description value, and to an empty string when neither is
 * available. It is then run through formatImages and formatVideos, cleaned
 * with jsoup's relaxed whitelist (extended with iframe support and enforced
 * rel/target attributes on links), tagged with the "content" class and
 * parsed into XML.
 *
 * @param entry the syndication entry whose content is wanted
 * @return the sanitized content, or NodeSeq.Empty if it cannot be parsed
 */
def content(entry: SyndEntry): NodeSeq = {
  // Null values are treated as absent so they neither throw nor end up in
  // the markup as the literal text "null".
  val firstBody: Option[String] =
    entry.getContents.asInstanceOf[java.util.List[SyndContent]].
      headOption.flatMap(body => Option(body.getValue))

  def summary: Option[String] =
    Option(entry.getDescription).flatMap(description => Option(description.getValue))

  val rawHtml: String = firstBody.orElse(summary).getOrElse("")

  val whitelist = Whitelist.relaxed.
    addTags("iframe").
    addAttributes("iframe", "src", "frameborder", "allowfullscreen").
    addEnforcedAttribute("a", "rel", "nofollow").
    addEnforcedAttribute("a", "target", "_blank")

  val parsed = Jsoup.parse("<div>" + rawHtml + "</div>").body
  // Images first, then videos, both before the whitelist pass.
  val formatted = formatVideos(formatImages(parsed))
  val cleaned = Jsoup.clean(formatted.toString, whitelist)
  val sanitized = Jsoup.parse(cleaned).body.children.addClass("content")
  PCDataXmlParser(sanitized.toString) openOr NodeSeq.Empty
}
158
/**
 * Rewrites every img element under the given element, in place.
 *
 * For each image the src loses the query parameters listed in
 * invalidImgParams, and a width or height attribute holding a number above
 * 25 is set to the empty string. A failure on one image is logged as a
 * warning and the remaining images are still processed.
 *
 * @param content the element whose images are rewritten
 * @return the same element that was passed in
 */
def formatImages(content: Element): Element = {

  // Blanks the attribute when it is present and numerically above 25.
  def clearIfLarge(img: Element, attribute: String): Unit = {
    val value = img.attr(attribute)
    if (value != "" && value.toInt > 25) img.attr(attribute, "")
  }

  val pictures: Elements = content.select("img")

  pictures.foreach { picture =>
    try {
      val cleanedSrc = URLFormatter(picture.attr("src")).
        urlWithoutQueryParams(invalidImgParams: _*)

      picture.attr("src", cleanedSrc)

      clearIfLarge(picture, "width")
      clearIfLarge(picture, "height")
    } catch {
      case e: Throwable =>
        warn("Formatting image: "+ e.getMessage)
    }
  }

  content
}
180
/**
 * Filters the iframe elements under the given element, in place.
 *
 * Only embedded YouTube and Vimeo players are kept (their inner HTML is
 * emptied); every other iframe is replaced by an empty span. The embed
 * prefixes are accepted over http, https and as protocol-relative URLs.
 *
 * @param content the element whose iframes are filtered
 * @return the same element that was passed in
 */
def formatVideos(content: Element): Element = {

  def emptyElem = new Element(Tag.valueOf("span"), "")
  val iframes: Elements = content.select("iframe")

  // only allow embedded youtube, vimeo videos
  val validHosts =
    "www.youtube.com/embed/" ::
    "player.vimeo.com/video/" :: Nil

  // Embed codes use any of these forms, so all of them must be recognised.
  val validSchemes = "http://" :: "https://" :: "//" :: Nil

  val validSrc =
    for (scheme <- validSchemes; host <- validHosts) yield scheme + host

  for (iframe <- iframes) {
    try {
      val src = iframe.attr("src")
      if (validSrc.exists(prefix => src.startsWith(prefix))) iframe.html("")
      else iframe.replaceWith(emptyElem)
    } catch {
      // Best effort per iframe; JVM errors are left to propagate.
      case e: Exception =>
        warn("Formatting video: "+ e.getMessage)
    }
  }
  content
}
203
/**
 * GUID of the entry. When the entry's URI is null or empty, one is made up
 * by concatenating the title and the published date.
 */
def guid(entry: SyndEntry): String =
  entry.getUri match {
    case null | "" => entry.getTitle + entry.getPublishedDate
    case uri => uri
  }
214 }