From 20b222f4555122a9872fb09a8a6b59104b9a8ec2 Mon Sep 17 00:00:00 2001
From: Brent Simmons
Date: Sat, 21 Sep 2024 21:49:57 -0700
Subject: [PATCH] Create first draft of HTMLLinkParser.

---
Reviewer note: re-flowed from a whitespace-collapsed copy of this patch and
amended during review (compile fixes in the SAXHTMLParserDelegate extension).
The `index` blob hashes below are those of the original commit.

 .../Sources/HTMLParser/HTMLLinkParser.swift   | 109 ++++++++++++++++++
 .../Parser/Sources/SAX/SAXHTMLParser.swift    |   4 +-
 2 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/Modules/Parser/Sources/HTMLParser/HTMLLinkParser.swift b/Modules/Parser/Sources/HTMLParser/HTMLLinkParser.swift
index f34cf8b7a..00ea74e97 100644
--- a/Modules/Parser/Sources/HTMLParser/HTMLLinkParser.swift
+++ b/Modules/Parser/Sources/HTMLParser/HTMLLinkParser.swift
@@ -10,7 +10,116 @@ import SAX
 
+/// Finds the `<a>` links in an HTML document by driving a `SAXHTMLParser`.
 public final class HTMLLinkParser {
 
+	/// Links collected so far, in the order their start tags were reported.
+	public private(set) var links = [HTMLLink]()
+
+	private let parserData: ParserData
+	private let baseURL: URL?
+
+	/// Parses `parserData.data` and returns the links found in it.
 	public static func htmlLinks(parserData: ParserData) -> [HTMLLink] {
 
+		let parser = HTMLLinkParser(parserData)
+		parser.parse()
+		return parser.links
+	}
+
+	init(_ parserData: ParserData) {
+
+		self.parserData = parserData
+		self.baseURL = URL(string: parserData.url)
+	}
+}
+
+private extension HTMLLinkParser {
+
+	func parse() {
+
+		let htmlParser = SAXHTMLParser(delegate: self, data: parserData.data)
+		htmlParser.parse()
+	}
+}
+
+// MARK: - SAXHTMLParserDelegate
+
+// An extension that declares a conformance can't take an access modifier, so the
+// helpers are marked `private` one by one and the delegate callbacks are `public`.
+extension HTMLLinkParser: SAXHTMLParserDelegate {
+
+	/// Index of the most recently appended link, or nil when no link has been appended yet.
+	private var currentLinkIndex: Int? {
+		links.indices.last
+	}
+
+	private struct HTMLAttributeName {
+		static let href = "href"
+		static let title = "title"
+	}
+
+	private func title(_ attributesDictionary: HTMLAttributesDictionary) -> String? {
+
+		attributesDictionary.object(forCaseInsensitiveKey: HTMLAttributeName.title)
+	}
+
+	/// Returns the `href` attribute resolved against `baseURL`, or nil when there is no
+	/// `href`, no base URL, or an absolute URL can't be built from it.
+	private func urlString(_ attributesDictionary: HTMLAttributesDictionary) -> String? {
+
+		guard let href = attributesDictionary.object(forCaseInsensitiveKey: HTMLAttributeName.href) else {
+			return nil
+		}
+
+		guard let baseURL, let absoluteURL = URL(string: href, relativeTo: baseURL) else {
+			assertionFailure("Expected to create URL")
+			return nil
+		}
+
+		return absoluteURL.absoluteString
+	}
+
+	/// Copies the URL and title attributes onto the most recently appended link.
+	private func handleLinkAttributes(_ attributesDictionary: HTMLAttributesDictionary) {
+
+		guard let currentLinkIndex else {
+			assertionFailure("currentLink must not be nil")
+			return
+		}
+
+		// Write through the array so the change lands on the stored element.
+		links[currentLinkIndex].urlString = urlString(attributesDictionary)
+		links[currentLinkIndex].title = title(attributesDictionary)
+	}
+
+	private struct HTMLName {
+		static let a = "a".utf8CString
+	}
+
+	public func saxHTMLParser(_ saxHTMLParser: SAXHTMLParser, startElement name: XMLPointer, attributes: UnsafePointer<XMLPointer?>?) {
+
+		guard SAXEqualTags(name, HTMLName.a) else {
+			return
+		}
+
+		let link = HTMLLink()
+		links.append(link)
+
+		if let attributesDictionary = saxHTMLParser.attributesDictionary(attributes) {
+			handleLinkAttributes(attributesDictionary)
+		}
+
+		saxHTMLParser.beginStoringCharacters()
+	}
+
+	public func saxHTMLParser(_ saxHTMLParser: SAXHTMLParser, endElement name: XMLPointer) {
+
+		guard SAXEqualTags(name, HTMLName.a), let currentLinkIndex else {
+			return
+		}
+		links[currentLinkIndex].text = saxHTMLParser.currentStringWithTrimmedWhitespace
+	}
+
+	public func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int) {
+		// Nothing needed.
 	}
 }
diff --git a/Modules/Parser/Sources/SAX/SAXHTMLParser.swift b/Modules/Parser/Sources/SAX/SAXHTMLParser.swift
index 5e884d58a..6c845d5f1 100644
--- a/Modules/Parser/Sources/SAX/SAXHTMLParser.swift
+++ b/Modules/Parser/Sources/SAX/SAXHTMLParser.swift
@@ -8,7 +8,7 @@
 import Foundation
 import libxml2
 
-protocol SAXHTMLParserDelegate: AnyObject {
+public protocol SAXHTMLParserDelegate: AnyObject {
 
 	func saxHTMLParser(_: SAXHTMLParser, startElement: XMLPointer, attributes: UnsafePointer<XMLPointer?>?)
 
@@ -18,7 +18,7 @@ protocol SAXHTMLParserDelegate: AnyObject {
 	func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int)
 }
 
-final class SAXHTMLParser {
+public final class SAXHTMLParser {
 
 	fileprivate let delegate: SAXHTMLParserDelegate
 