Handle relative <link> elements in the Atom parser — when xml:base is present, use it to resolve them. Add test case.

This commit is contained in:
Brent Simmons 2025-01-20 17:59:39 -08:00
parent ccb1b8294e
commit d22f3819cd
3 changed files with 52 additions and 8 deletions

View File

@ -25,6 +25,7 @@ final class AtomParser {
attributesStack.last
}
private var xmlBaseURL: URL?
private var parsingXHTML = false
private var xhtmlString: String?
@ -90,6 +91,7 @@ private extension AtomParser {
static let type = "type"
static let length = "length"
static let xmlLang = "xml:lang"
static let xmlBase = "xml:base"
}
func currentString(_ saxParser: SAXParser) -> String? {
@ -139,13 +141,23 @@ private extension AtomParser {
}
}
func addFeedLanguage() {
func addFeedAttributes() {
guard feed.language == nil, let currentAttributes else {
guard let currentAttributes else {
return
}
feed.language = currentAttributes[XMLString.xmlLang]
if feed.language == nil {
feed.language = currentAttributes[XMLString.xmlLang]
}
if xmlBaseURL == nil {
if let xmlBase = currentAttributes[XMLString.xmlBase] {
if let baseURL = URL(string: xmlBase) {
xmlBaseURL = baseURL
}
}
}
}
func addArticle() {
@ -225,7 +237,8 @@ private extension AtomParser {
guard let urlString = attributes[XMLString.href], !urlString.isEmpty else {
return
}
let resolvedURLString = linkResolvedAgainstXMLBase(urlString)
var rel = attributes[XMLString.rel]
if rel?.isEmpty ?? true {
rel = XMLString.alternate
@ -233,21 +246,33 @@ private extension AtomParser {
if rel == XMLString.related {
if article.link == nil {
article.link = urlString
article.link = resolvedURLString
}
}
else if rel == XMLString.alternate {
if article.permalink == nil {
article.permalink = urlString
article.permalink = resolvedURLString
}
}
else if rel == XMLString.enclosure {
if let enclosure = enclosure(urlString, attributes) {
if let enclosure = enclosure(resolvedURLString, attributes) {
article.addEnclosure(enclosure)
}
}
}
/// Resolves `urlString` against the stored `xmlBaseURL`, if there is one.
///
/// - Parameter urlString: The `href` value taken from a `<link>` element.
/// - Returns: The absolute string of the resolved URL. `urlString` is returned
///   unchanged when no base URL is stored or when `URL(string:relativeTo:)`
///   cannot produce a URL from the inputs.
func linkResolvedAgainstXMLBase(_ urlString: String) -> String {
	guard let xmlBaseURL, let absoluteURL = URL(string: urlString, relativeTo: xmlBaseURL) else {
		// No base to resolve against, or the href could not be parsed: pass it through as-is.
		return urlString
	}
	return absoluteURL.absoluteString
}
func enclosure(_ urlString: String, _ attributes: StringDictionary) -> RSSEnclosure? {
let enclosure = RSSEnclosure(url: urlString)
@ -351,7 +376,7 @@ extension AtomParser: SAXParserDelegate {
}
if SAXEqualTags(localName, XMLName.feed) {
addFeedLanguage()
addFeedAttributes()
}
saxParser.beginStoringCharacters()

View File

@ -109,4 +109,22 @@ final class AtomParserTests: XCTestCase {
XCTAssertNotEqual(article.title, "Default Title")
}
}
/// Verifies that relative `<link>` hrefs are resolved against the feed-level `xml:base`.
///
/// The fixture feed has `<link>` elements that look like this:
/// `<link href="/en/publish/2022/07/01/great-moments-in-document-history-reimagining-the-declaration-of-independence-as-pdf"/>`
/// and its feed declaration carries `xml:base="https://blog.adobe.com"`,
/// so the `<link>` values should be parsed as (for example):
/// `https://blog.adobe.com/en/publish/2022/07/01/great-moments-in-document-history-reimagining-the-declaration-of-independence-as-pdf`
///
/// Issue: https://github.com/Ranchero-Software/NetNewsWire/issues/3662
func testLinkElementsWithRelativeURLs() throws {
	let d = parserData("adobe", "atom", "https://blog.adobe.com/feed.xml")
	// XCTUnwrap reports a test failure (instead of crashing the runner) if parsing throws or returns nil.
	let parsedFeed = try XCTUnwrap(FeedParser.parse(d))

	// Guard against a vacuous pass: with no items, the loop below would assert nothing.
	XCTAssertFalse(parsedFeed.items.isEmpty, "Expected the fixture feed to contain at least one article")

	for article in parsedFeed.items {
		let url = try XCTUnwrap(article.url, "Expected every article in the fixture to have a URL")
		XCTAssertTrue(url.hasPrefix("https://blog.adobe.com/en/publish/20"), "Unexpected article URL: \(url)")
	}
}
}

File diff suppressed because one or more lines are too long