scala-news-reader

RSS/Atom news reader written in Scala

git clone https://9o.is/git/scala-news-reader.git

FeedEntry.scala

(6323B)


      1 package com.joereader.lib.rss
      2 
      3 import com.sun.syndication._
      4 import feed.synd._
      5 import feed.module.mediarss._, types._
      6 
      7 import org.jsoup._
      8 import nodes._
      9 import select._
     10 import parser._
     11 import safety._
     12 
     13 import net.liftweb.util.PCDataXmlParser
     14 
     15 import scala.xml.NodeSeq
     16 import scala.collection.JavaConversions._
     17 import java.util.Date
     18 import java.text.SimpleDateFormat
     19 
     20 import com.joereader.lib._
     21 
     22 import net.liftweb.common._
     23 
     24 /* An RSS Feed Entry/Item. */
     25 case class FeedEntry(
     26                       guid: String,
     27                       title: String,
     28                       link: String,
     29                       date: Date,
     30                       author: FeedAuthor,
     31                       images: List[FeedImage],
     32                       content: NodeSeq) {
     33 
     34   private def df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ")
     35   def dateFormatted = df.format(date)
     36   def nonAuthorImages = images.filterNot(i => author.imgUrl.exists(_ == i.src))
     37 }
     38 
     39 object FeedEntry extends Logger {
     40 
     41   /*
     42    * We cheat when possible and remove the following query params from image
     43    * sources.
     44    */
     45   val invalidImgParams = List(
     46       "w", "width", "h", "height", "crop", "size", "s")
     47   
     48   /* Image sources containing the following keywords are not allowed. */
     49   val invalidImgSources = List(
     50       "feeds.feedburner.com", "feedsportal.com", "subscribe")
     51 
     52   def build(entry: SyndEntry) = {
     53     val builtContent = content(entry)
     54     val builtMediaImages = mediaImages(entry, builtContent)
     55 
     56     FeedEntry(
     57       guid(entry),
     58       entry.getTitle,
     59       entry.getLink,
     60       entry.getPublishedDate,
     61       author(entry, builtMediaImages),
     62       builtMediaImages,
     63       builtContent)
     64   }
     65   
     66   def empty = FeedEntry("","","", new Date(), FeedAuthor(""), Nil, NodeSeq.Empty)
     67 
     68   def build(entries: List[SyndEntry]): List[FeedEntry] = entries.map(build)
     69 
     70   def build(entries: Option[List[SyndEntry]]): List[FeedEntry] =
     71     entries.map(build).getOrElse(Nil)
     72 
     73   /* Create FeedAuthor */
     74   def author(entry: SyndEntry, images: List[FeedImage]): FeedAuthor = {
     75     import gdata._
     76 
     77     // grab image src located in Author GData namespace (used by blogger.com)
     78     val gdataImage = entry.getAuthors.asInstanceOf[java.util.List[SyndPerson]].
     79       headOption.map {
     80       person =>
     81         val mod = person.getModule(GDataModule.URI).asInstanceOf[GDataModule]
     82         if (mod != null) mod.getThumbnail else ""
     83     }
     84 
     85     val gravatarImage: Option[String] =
     86       images.find(_.src.contains("gravatar.com")).map(_.src)
     87 
     88     val img = if(gravatarImage.isDefined) gravatarImage else gdataImage
     89 
     90     FeedAuthor(entry.getAuthor, img)
     91   }
     92 
     93   /* Find All images in entry. */
     94   def mediaImages(entry: SyndEntry, content: NodeSeq): List[FeedImage] = ({
     95     val mediaMod: MediaEntryModule = entry.getModule(MediaModule.URI).
     96       asInstanceOf[MediaEntryModule]
     97 
     98     if (mediaMod != null)
     99       (for (mediaContent <- mediaMod.getMediaContents) yield {
    100         val url = mediaContent.getReference.asInstanceOf[UrlReference].getUrl
    101         val src =
    102           if (mediaContent.getMedium == "image")
    103             URLFormatter(url.toString).
    104               urlWithoutQueryParams(invalidImgParams: _*)
    105           else
    106             ""
    107         FeedImage(src)
    108       }).toList
    109     else Nil
    110   } ::: {
    111     (content \\ "img").map {
    112       img =>
    113         val width = (img \ "@width").text
    114         val height = (img \ "@height").text
    115         val src = (img \ "@src").text
    116 
    117         try {
    118           if ((width == "" || width.toInt > 25) &&
    119             (height == "" || height.toInt > 25))
    120             FeedImage(src)
    121           else
    122             FeedImage("")
    123         } catch {
    124           case e: Throwable => 
    125             warn("Finding image: "+ e.getMessage)
    126             FeedImage("")
    127         }
    128 
    129     }.toList
    130 
    131   }).
    132     filterNot(_.src == "").
    133     filterNot(i =>
    134       invalidImgSources.exists(i.src.contains)
    135     ). // feedburner's bottom image links
    136     groupBy(_.src).
    137     map(src => FeedImage(src._1)).
    138     toList
    139 
    140   /* Grabs and formats the entry's content. */
    141   def content(entry: SyndEntry): NodeSeq = {
    142     def content = entry.getContents.asInstanceOf[java.util.List[SyndContent]].
    143       headOption.map(_.getValue).getOrElse(entry.getDescription.getValue)
    144 
    145     def whitelist = Whitelist.relaxed.
    146       addTags("iframe").
    147       addAttributes("iframe", "src", "frameborder", "allowfullscreen").
    148       addEnforcedAttribute("a", "rel", "nofollow").
    149       addEnforcedAttribute("a", "target", "_blank")
    150 
    151     def unsanitized = Jsoup.parse("<div>" + content + "</div>").body
    152     def imgUnsanitized = formatImages(unsanitized)
    153     def vidUnsanitized = formatVideos(imgUnsanitized)
    154     def semisanitized = Jsoup.clean(vidUnsanitized.toString, whitelist)
    155     def sanitized = Jsoup.parse(semisanitized).body.children.addClass("content")
    156     PCDataXmlParser(sanitized.toString) openOr NodeSeq.Empty
    157   }
    158 
    159   def formatImages(content: Element): Element = {
    160     val imgs: Elements = content.select("img")
    161 
    162     for (img <- imgs) {
    163       try {
    164         val link = URLFormatter(img.attr("src")).
    165           urlWithoutQueryParams(invalidImgParams: _*)
    166 
    167         img.attr("src", link)
    168 
    169         val w = "width"
    170         val h = "height"
    171         if (img.attr(w) != "" && img.attr(w).toInt > 25) img.attr(w, "")
    172         if (img.attr(h) != "" && img.attr(h).toInt > 25) img.attr(h, "")
    173       } catch {
    174         case e: Throwable =>
    175           warn("Formatting image: "+ e.getMessage)
    176       }
    177     }
    178     content
    179   }
    180 
    181   def formatVideos(content: Element): Element = {
    182 
    183     def emptyElem = new Element(Tag.valueOf("span"), "")
    184     val iframes: Elements = content.select("iframe")
    185 
    186     // only allow embedded youtube, vimeo videos
    187     val validSrc = 
    188       "http://www.youtube.com/embed/" :: 
    189       "http://player.vimeo.com/video/" :: Nil
    190 
    191     for (iframe <- iframes) {
    192       try {
    193         val src = iframe.attr("src")
    194         if (validSrc.exists(s => src.startsWith(s))) iframe.html("")
    195         else iframe.replaceWith(emptyElem)
    196       } catch {
    197         case e: Throwable =>
    198           warn("Formatting video: "+ e.getMessage)
    199       }
    200     }
    201     content
    202   }
    203 
    204   /**
    205    * GUID. If not provided, we create our own with 
    206    * title and published date.
    207    */
    208   def guid(entry: SyndEntry): String = {
    209     if (entry.getUri == "" || entry.getUri == null)
    210       entry.getTitle + entry.getPublishedDate
    211     else
    212       entry.getUri
    213   }
    214 }