2024-08-27 20:46:11 -07:00
|
|
|
|
//
|
|
|
|
|
// RSSParser.swift
|
|
|
|
|
// RSParser
|
|
|
|
|
//
|
|
|
|
|
// Created by Brent Simmons on 6/25/17.
|
|
|
|
|
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
import Foundation
|
|
|
|
|
import SAX
|
2024-09-08 11:59:27 -07:00
|
|
|
|
import DateParser
|
2024-08-27 20:46:11 -07:00
|
|
|
|
|
|
|
|
|
/// SAX-driven parser that turns RSS feed data into an `RSSFeed`.
///
/// Use `parsedFeed(with:)` — it creates a parser, runs it once, and returns the
/// populated feed. The flags below are parse-time state mutated by the
/// `SAXParserDelegate` callbacks.
public final class RSSParser {

	// MARK: - Input

	/// The data and URL to parse. Assigned once in `init` and never mutated.
	private let parserData: ParserData

	/// URL string of the feed being parsed.
	private var feedURL: String {
		parserData.url
	}

	/// Raw bytes of the feed being parsed.
	private var data: Data {
		parserData.data
	}

	// MARK: - Output

	/// The feed that gets filled in while parsing.
	private let feed: RSSFeed

	/// Articles collected so far, in document order.
	private var articles: [RSSArticle] = []

	/// The most recently added article — the one currently being filled in.
	private var currentArticle: RSSArticle? {
		articles.last
	}

	// MARK: - Parse state

	/// Set once the closing root element is seen; later callbacks are ignored.
	private var endRSSFound = false

	/// True once an `RDF` element has been seen.
	private var isRDF = false

	/// True between the start and end of an `item` element.
	private var parsingArticle = false

	/// True between the start and end of an un-prefixed `image` element.
	private var parsingChannelImage = false

	/// True while inside an `author` element of an article.
	private var parsingAuthor = false

	/// Attributes captured at the start of the current element, when they were requested.
	private var currentAttributes: SAXParser.XMLAttributesDictionary?

	// MARK: - API

	/// Parses `parserData` and returns the resulting feed.
	static func parsedFeed(with parserData: ParserData) -> RSSFeed {

		let parser = RSSParser(parserData)
		parser.parse()
		return parser.feed
	}

	/// Creates a parser for `parserData`; the feed starts out knowing only its URL.
	init(_ parserData: ParserData) {
		self.parserData = parserData
		self.feed = RSSFeed(urlString: parserData.url)
	}
}
|
2024-09-02 12:03:24 -07:00
|
|
|
|
|
|
|
|
|
private extension RSSParser {
|
|
|
|
|
|
2024-09-09 20:54:42 -07:00
|
|
|
|
/// Runs a SAX parse over the feed data with `self` as delegate,
/// then hands the collected articles to the feed.
func parse() {

	SAXParser(delegate: self, data: data).parse()

	// Articles are gathered in `articles` during the delegate callbacks
	// and assigned to the feed only after parsing has finished.
	feed.articles = articles
}
|
|
|
|
|
|
2024-09-02 12:03:24 -07:00
|
|
|
|
/// Element and prefix names, stored as C strings for comparison via `SAXEqualTags`.
private enum XMLName {

	// Root and structural elements
	static let rss = "rss".utf8CString
	static let uppercaseRDF = "RDF".utf8CString
	static let item = "item".utf8CString
	static let image = "image".utf8CString

	// Un-prefixed feed and article elements
	static let title = "title".utf8CString
	static let link = "link".utf8CString
	static let language = "language".utf8CString
	static let description = "description".utf8CString
	static let guid = "guid".utf8CString
	static let author = "author".utf8CString
	static let pubDate = "pubDate".utf8CString
	static let enclosure = "enclosure".utf8CString

	// Namespace prefixes
	static let dc = "dc".utf8CString
	static let content = "content".utf8CString

	// Local names used together with the prefixes above
	static let encoded = "encoded".utf8CString
	static let creator = "creator".utf8CString
	static let date = "date".utf8CString
}
|
|
|
|
|
|
2024-09-09 20:54:42 -07:00
|
|
|
|
/// Stores a feed-level value (`link`, `title`, or `language`) from the element that just ended.
///
/// Prefixed elements are ignored. Only the first `link` is kept;
/// `title` and `language` take the most recent value.
func addFeedElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) {

	if prefix != nil {
		return
	}

	if SAXEqualTags(localName, XMLName.link) {
		// Keep the first link found — don't overwrite it with later ones.
		guard feed.link == nil else {
			return
		}
		feed.link = saxParser.currentString
		return
	}

	if SAXEqualTags(localName, XMLName.title) {
		feed.title = saxParser.currentString
		return
	}

	if SAXEqualTags(localName, XMLName.language) {
		feed.language = saxParser.currentString
	}
}
|
|
|
|
|
|
|
|
|
|
/// Appends a new, empty article for this feed; it becomes `currentArticle`.
func addArticle() {
	articles.append(RSSArticle(feedURL))
}
|
|
|
|
|
|
2024-09-08 11:59:27 -07:00
|
|
|
|
/// Stores a value on the current article from the element that just ended.
///
/// - `dc:` elements are forwarded to `addDCElement`.
/// - `content:encoded` sets the body when it has non-empty text.
/// - Any other prefixed element is ignored.
/// - Un-prefixed `pubDate`, `enclosure`, `guid`, `author`, `link`,
///   `description`, and `title` are handled as described inline.
func addArticleElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) {

	guard let currentArticle else {
		return
	}

	if let prefix, SAXEqualTags(prefix, XMLName.dc) {
		addDCElement(saxParser, localName, currentArticle)
		return
	}

	if let prefix, SAXEqualTags(prefix, XMLName.content) && SAXEqualTags(localName, XMLName.encoded) {
		if let currentString = saxParser.currentString, !currentString.isEmpty {
			currentArticle.body = currentString
		}
		return
	}

	guard prefix == nil else {
		return
	}

	// These two are handled before unwrapping `currentString`: the date is read
	// from `currentCharacters` and the enclosure from the element's attributes,
	// so neither must depend on whether the element's text is nil or non-nil.
	if SAXEqualTags(localName, XMLName.pubDate) {
		currentArticle.datePublished = currentDate(saxParser)
		return
	}
	if SAXEqualTags(localName, XMLName.enclosure) {
		if let currentAttributes {
			addEnclosure(currentAttributes, currentArticle)
		}
		return
	}

	// Everything below needs the element's text.
	guard let currentString = saxParser.currentString else {
		return
	}

	if SAXEqualTags(localName, XMLName.guid) {
		addGuid(currentString, currentArticle)
	}
	else if SAXEqualTags(localName, XMLName.author) {
		addAuthorWithString(currentString, currentArticle)
	}
	else if SAXEqualTags(localName, XMLName.link) {
		currentArticle.link = urlString(currentString)
	}
	else if SAXEqualTags(localName, XMLName.description) {
		// Don't replace a body that has already been set (e.g. by `content:encoded`).
		if currentArticle.body == nil {
			currentArticle.body = currentString
		}
	}
	else if !parsingAuthor && SAXEqualTags(localName, XMLName.title) {
		// A `title` nested inside `author` is not the article's title.
		currentArticle.title = currentString
	}
}
|
|
|
|
|
|
|
|
|
|
/// Handles a `dc:`-prefixed element inside an article:
/// `creator` adds an author, `date` sets the published date. Others are ignored.
func addDCElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ currentArticle: RSSArticle) {

	if SAXEqualTags(localName, XMLName.date) {
		currentArticle.datePublished = currentDate(saxParser)
		return
	}

	guard SAXEqualTags(localName, XMLName.creator), let creatorName = saxParser.currentString else {
		return
	}
	addAuthorWithString(creatorName, currentArticle)
}
|
|
|
|
|
|
|
|
|
|
/// Canonical spelling of the guid attribute that says whether the guid is a permalink.
static let isPermalinkKey = "isPermaLink"
/// Lowercased form, used to match miscapitalized variants of the attribute name.
static let isPermalinkLowercaseKey = "ispermalink"
/// Attribute value that means "this guid is not a permalink".
static let falseValue = "false"

/// Sets the article's guid and, when the guid is a permalink, its permalink too.
///
/// The guid counts as a permalink unless the element carries an
/// `isPermaLink="false"` attribute — including when the element has no
/// attributes at all — and only if the value looks like a URL or relative path.
func addGuid(_ guid: String, _ currentArticle: RSSArticle) {

	currentArticle.guid = guid

	// No attributes means no `isPermaLink` attribute, which the spec treats as
	// true — so a missing dictionary must fall through, not return early.
	let isPermaLinkValue: String? = {

		guard let currentAttributes else {
			return nil
		}
		if let value = currentAttributes[Self.isPermalinkKey] {
			return value
		}
		// Allow for `ispermalink`, `isPermalink`, etc.
		for (key, value) in currentAttributes where key.lowercased() == Self.isPermalinkLowercaseKey {
			return value
		}
		return nil
	}()

	// Spec: `isPermaLink is optional, its default value is true.`
	// https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
	// Return only if non-nil and equal to false — otherwise it’s a permalink.
	if let isPermaLinkValue, isPermaLinkValue == Self.falseValue {
		return
	}

	// Feed bug found in the wild: using a guid that’s not really a permalink
	// and not realizing that `isPermaLink` is true by default.
	if stringIsProbablyAURLOrRelativePath(guid) {
		currentArticle.permalink = urlString(guid)
	}
}
|
|
|
|
|
|
|
|
|
|
/// Heuristic: does `s` look like a URL or a relative path, rather than an opaque identifier?
///
/// The RSS guid is a permalink by default, but feeds often put a non-URL
/// identifier there without adding `isPermaLink="false"`. This detects the
/// values that are clearly not URLs. It may need to evolve over time.
///
/// - Returns: `false` when `s` has no `/` or starts with `tag:` (any case); otherwise `true`.
func stringIsProbablyAURLOrRelativePath(_ s: String) -> Bool {

	// No slash at all: about the best available signal.
	// Bad guids are often just integers, for instance.
	guard s.contains("/") else {
		return false
	}

	// A common non-URL guid form starts with `tag:`.
	return !s.lowercased().hasPrefix("tag:")
}
|
|
|
|
|
|
|
|
|
|
/// Best-effort conversion of a string to an absolute URL string.
///
/// A string starting with `http` (any case) is returned unchanged.
/// Anything else is treated as a relative URL and resolved against
/// the feed’s home page link when there is one, else against the feed URL.
/// If the base or the resolved URL can’t be constructed, `s` is returned as-is.
///
/// The result is not guaranteed to be a valid URL string —
/// this deliberately stops short of heroic measures.
func urlString(_ s: String) -> String {

	guard !s.lowercased().hasPrefix("http") else {
		return s
	}

	let base = feed.link ?? feedURL
	guard let baseURL = URL(string: base),
		  let absoluteURL = URL(string: s, relativeTo: baseURL) else {
		return s
	}

	return absoluteURL.absoluteString
}
|
|
|
|
|
|
|
|
|
|
/// Adds an author built from `authorString` to the article. Empty strings are ignored.
func addAuthorWithString(_ authorString: String, _ currentArticle: RSSArticle) {

	guard !authorString.isEmpty else {
		return
	}

	currentArticle.addAuthor(RSSAuthor(singleString: authorString))
}
|
|
|
|
|
|
|
|
|
|
/// Attribute names read from an `enclosure` element.
private enum EnclosureKey {
	static let url = "url"
	static let type = "type"
	static let length = "length"
}
|
|
|
|
|
|
|
|
|
|
/// Builds an enclosure from element attributes and adds it to the article.
///
/// Nothing is added when the `url` attribute is missing or empty.
/// `length` is set only when the attribute parses as an integer;
/// `type` is copied to `mimeType` as-is (possibly nil).
func addEnclosure(_ attributes: SAXParser.XMLAttributesDictionary, _ currentArticle: RSSArticle) {

	guard let enclosureURL = attributes[EnclosureKey.url], !enclosureURL.isEmpty else {
		return
	}

	let enclosure = RSSEnclosure(url: enclosureURL)
	enclosure.mimeType = attributes[EnclosureKey.type]

	if let byteCount = attributes[EnclosureKey.length].flatMap({ Int($0) }) {
		enclosure.length = byteCount
	}

	currentArticle.addEnclosure(enclosure)
}
|
2024-09-08 11:59:27 -07:00
|
|
|
|
|
|
|
|
|
/// Parses the parser’s currently stored characters as a date.
///
/// - Returns: The result of `DateParser.date(data:)`, or nil when no characters are stored.
func currentDate(_ saxParser: SAXParser) -> Date? {

	if let characters = saxParser.currentCharacters {
		return DateParser.date(data: characters)
	}
	return nil
}
|
2024-09-02 12:03:24 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
extension RSSParser: SAXParserDelegate {
|
|
|
|
|
|
2024-09-09 21:35:51 -07:00
|
|
|
|
/// Attribute on an RDF `item` whose value is used as the article’s guid and permalink.
static let rdfAbout = "rdf:about"

/// Start-of-element callback: updates the parse state and starts character storage.
///
/// Does nothing once the end of the feed has been seen. An `RDF` element only
/// switches the parser into RDF mode. Attributes are captured just for the
/// elements that need them later (`item` in RDF mode, `guid`, `enclosure`).
public func saxParser(_ saxParser: SAXParser, xmlStartElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {

	guard !endRSSFound else {
		return
	}

	if SAXEqualTags(localName, XMLName.uppercaseRDF) {
		isRDF = true
		return
	}

	let isItem = SAXEqualTags(localName, XMLName.item)

	// Capture attributes only where they’re used; every other element clears them.
	let needsAttributes = (isRDF && isItem) || SAXEqualTags(localName, XMLName.guid) || SAXEqualTags(localName, XMLName.enclosure)
	let elementAttributes: SAXParser.XMLAttributesDictionary? =
		needsAttributes ? saxParser.attributesDictionary(attributes, attributeCount: attributeCount) : nil
	currentAttributes = elementAttributes

	if prefix == nil {
		if isItem {
			addArticle()
			parsingArticle = true

			// RSS 1.0 guid
			if isRDF, let rdfGuid = elementAttributes?[Self.rdfAbout], let currentArticle {
				currentArticle.guid = rdfGuid
				currentArticle.permalink = rdfGuid
			}
		}
		else if SAXEqualTags(localName, XMLName.image) {
			parsingChannelImage = true
		}
		else if SAXEqualTags(localName, XMLName.author) && parsingArticle {
			parsingAuthor = true
		}
	}

	// Text inside the channel image isn’t collected.
	if !parsingChannelImage {
		saxParser.beginStoringCharacters()
	}
}
|
|
|
|
|
|
|
|
|
|
/// End-of-element callback: closes out parse state or stores the element’s value.
///
/// Checks run in priority order: end of feed (`RDF` in RDF mode, or `rss`),
/// end of `image`, end of `item`, then article-level and feed-level values.
public func saxParser(_ saxParser: SAXParser, xmlEndElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {

	guard !endRSSFound else {
		return
	}

	// Closing root element: stop handling anything after this.
	if (isRDF && SAXEqualTags(localName, XMLName.uppercaseRDF)) || SAXEqualTags(localName, XMLName.rss) {
		endRSSFound = true
		return
	}

	if SAXEqualTags(localName, XMLName.image) {
		parsingChannelImage = false
		return
	}

	if SAXEqualTags(localName, XMLName.item) {
		parsingArticle = false
		return
	}

	if parsingArticle {
		// Store the value first — `addArticleElement` consults `parsingAuthor` —
		// and only then leave author mode.
		addArticleElement(saxParser, localName, prefix)
		if SAXEqualTags(localName, XMLName.author) {
			parsingAuthor = false
		}
		return
	}

	if !parsingChannelImage {
		addFeedElement(saxParser, localName, prefix)
	}
}
|
|
|
|
|
|
|
|
|
|
/// Characters-found callback. Intentionally empty: this parser reads text
/// through `saxParser.currentString` / `currentCharacters` at element end instead.
public func saxParser(_ saxParser: SAXParser, xmlCharactersFound: XMLPointer, count: Int) {
	// Required method.
}
|
|
|
|
|
}
|
|
|
|
|
|