From 3e6e843dc826e52fef00b5e32736077513e7bc35 Mon Sep 17 00:00:00 2001
From: Brent Simmons
Date: Sun, 22 Sep 2024 21:40:52 -0700
Subject: [PATCH] Create first draft of HTMLMetadata.

---
 .../Sources/HTMLParser/HTMLMetadata.swift     | 381 ++++++++++++++++++
 .../Sources/{SAX => HTMLParser}/HTMLTag.swift |   6 +-
 .../Tests/ParserTests/HTMLLinkTests.swift     |   1 -
 3 files changed, 385 insertions(+), 3 deletions(-)
 create mode 100644 Modules/Parser/Sources/HTMLParser/HTMLMetadata.swift
 rename Modules/Parser/Sources/{SAX => HTMLParser}/HTMLTag.swift (63%)

diff --git a/Modules/Parser/Sources/HTMLParser/HTMLMetadata.swift b/Modules/Parser/Sources/HTMLParser/HTMLMetadata.swift
new file mode 100644
index 000000000..0e36b93d7
--- /dev/null
+++ b/Modules/Parser/Sources/HTMLParser/HTMLMetadata.swift
@@ -0,0 +1,381 @@
+//
+//  HTMLMetadata.swift
+//
+//
+//  Created by Brent Simmons on 9/22/24.
+//
+
+import Foundation
+import SAX
+
+/// Metadata gathered from the `link` and `meta` tags of an HTML page:
+/// favicons, Apple touch icons, feed links, Open Graph and Twitter properties.
+public final class HTMLMetadata {
+
+	public let baseURLString: String
+	public let tags: [HTMLTag]
+	public let favicons: [HTMLMetadataFavicon]?
+	public let appleTouchIcons: [HTMLMetadataAppleTouchIcon]?
+	public let feedLinks: [HTMLMetadataFeedLink]?
+	public let openGraphProperties: HTMLOpenGraphProperties?
+	public let twitterProperties: HTMLTwitterProperties?
+
+	/// Builds every metadata collection from `tags`.
+	///
+	/// - Parameters:
+	///   - urlString: Stored as `baseURLString` and handed to each object created here.
+	///   - tags: The tags to inspect.
+	init(_ urlString: String, _ tags: [HTMLTag]) {
+
+		self.baseURLString = urlString
+		self.tags = tags
+
+		self.favicons = Self.resolvedFaviconLinks(urlString, tags)
+		self.appleTouchIcons = Self.appleTouchIconTags(tags)?.map { HTMLMetadataAppleTouchIcon(urlString, $0) }
+		self.feedLinks = Self.feedLinkTags(tags)?.map { HTMLMetadataFeedLink(urlString, $0) }
+		self.openGraphProperties = HTMLOpenGraphProperties(urlString, tags)
+		self.twitterProperties = HTMLTwitterProperties(urlString, tags)
+	}
+
+	/// Returns one favicon per distinct URL string for the `link` tags whose `rel`
+	/// contains "icon", in the order of `tags`, or `nil` when there are none.
+	static func resolvedFaviconLinks(_ baseURLString: String, _ tags: [HTMLTag]) -> [HTMLMetadataFavicon]? {
+
+		guard let iconTags = linkTagsWithMatchingRel("icon", tags) else {
+			return nil
+		}
+
+		var seenURLStrings = Set<String>()
+		let favicons = iconTags.compactMap { htmlTag -> HTMLMetadataFavicon? in
+			let favicon = HTMLMetadataFavicon(baseURLString, htmlTag)
+			// Drop icons without a URL and repeats of a URL that was already seen.
+			guard let urlString = favicon.urlString, seenURLStrings.insert(urlString).inserted else {
+				return nil
+			}
+			return favicon
+		}
+
+		return favicons.isEmpty ? nil : favicons
+	}
+
+	/// Returns the `link` tags whose `rel` contains "apple-touch-icon" or
+	/// "apple-touch-icon-precomposed", or `nil` when there are none.
+	static func appleTouchIconTags(_ tags: [HTMLTag]) -> [HTMLTag]? {
+
+		guard let allLinkTags = linkTags(tags) else {
+			return nil
+		}
+		return tagsMatchingRelValues(["apple-touch-icon", "apple-touch-icon-precomposed"], allLinkTags)
+	}
+
+	/// Returns the `link` tags whose `rel` contains "alternate" and whose `type`
+	/// is a feed type, or `nil` when there are none.
+	static func feedLinkTags(_ tags: [HTMLTag]) -> [HTMLTag]? {
+
+		// linkTagsWithMatchingRel only returns tags that have a non-empty URL string,
+		// so the URL does not need to be checked again here.
+		guard let alternateLinkTags = linkTagsWithMatchingRel("alternate", tags) else {
+			return nil
+		}
+
+		let feedLinkTags = alternateLinkTags.filter { tag in
+			guard let type = tag.attributes?.object(forCaseInsensitiveKey: "type") else {
+				return false
+			}
+			return typeIsFeedType(type)
+		}
+
+		return feedLinkTags.isEmpty ? nil : feedLinkTags
+	}
+
+	/// Returns `true` when `type` (compared case-insensitively) ends with
+	/// "/rss+xml", "/atom+xml", "/json" or "/feed+json".
+	static func typeIsFeedType(_ type: String) -> Bool {
+
+		let lowerType = type.lowercased()
+		return ["/rss+xml", "/atom+xml", "/json", "/feed+json"].contains { lowerType.hasSuffix($0) }
+	}
+
+	/// Returns the tags whose type is `.link`, or `nil` when there are none.
+	static func linkTags(_ tags: [HTMLTag]) -> [HTMLTag]? {
+
+		let linkTags = tags.filter { $0.tagType == .link }
+		return linkTags.isEmpty ? nil : linkTags
+	}
+
+	/// Returns the `link` tags that have a non-empty URL string and whose `rel`
+	/// contains `valueToMatch`, or `nil` when there are none.
+	static func linkTagsWithMatchingRel(_ valueToMatch: String, _ tags: [HTMLTag]) -> [HTMLTag]? {
+
+		// Case-insensitive; matches a whitespace-delimited word
+
+		guard let allLinkTags = linkTags(tags) else {
+			return nil
+		}
+
+		let tagsWithURLString = allLinkTags.filter { tag in
+			guard let urlString = urlStringFromDictionary(tag.attributes) else {
+				return false
+			}
+			return !urlString.isEmpty
+		}
+		if tagsWithURLString.isEmpty {
+			return nil
+		}
+
+		return tagsMatchingRelValues([valueToMatch], tagsWithURLString)
+	}
+
+	/// Returns the tags whose `rel` value contains, as a whitespace-delimited word,
+	/// any of `valuesToMatch` (compared case-insensitively), or `nil` when none match.
+	static func tagsMatchingRelValues(_ valuesToMatch: [String], _ tags: [HTMLTag]) -> [HTMLTag]? {
+
+		let lowerValuesToMatch = Set(valuesToMatch.map { $0.lowercased() })
+
+		let matchingTags = tags.filter { tag in
+			guard let rel = relValue(tag.attributes) else {
+				return false
+			}
+			let relWords = rel.components(separatedBy: .whitespacesAndNewlines)
+			return relWords.contains { lowerValuesToMatch.contains($0.lowercased()) }
+		}
+
+		return matchingTags.isEmpty ? nil : matchingTags
+	}
+}
+
+/// An Apple touch icon described by a `link` tag.
+public final class HTMLMetadataAppleTouchIcon {
+
+	public let rel: String?
+	public let sizes: String?
+	public let size: CGSize?
+	public let urlString: String? // Absolute
+
+	/// - Parameters:
+	///   - urlString: Base URL string, passed to `absoluteURLStringWithDictionary` with the tag's attributes.
+	///   - tag: The tag to read attributes from. Every property is `nil` when it has none.
+	init(_ urlString: String, _ tag: HTMLTag) {
+
+		guard let attributes = tag.attributes else {
+			self.rel = nil
+			self.sizes = nil
+			self.size = nil
+			self.urlString = nil
+			return
+		}
+
+		self.rel = attributes.object(forCaseInsensitiveKey: "rel")
+		self.urlString = absoluteURLStringWithDictionary(attributes, urlString)
+
+		let sizes = attributes.object(forCaseInsensitiveKey: "sizes")
+		self.sizes = sizes
+		self.size = sizes.flatMap(Self.parsedSize)
+	}
+
+	/// Parses a single "WIDTHxHEIGHT" value such as "180x180".
+	/// Returns `nil` for anything else, including "any" and lists of several sizes.
+	private static func parsedSize(_ sizes: String) -> CGSize? {
+
+		let sizeComponents = sizes.components(separatedBy: CharacterSet(charactersIn: "xX"))
+		guard sizeComponents.count == 2, let width = Double(sizeComponents[0]), let height = Double(sizeComponents[1]) else {
+			return nil
+		}
+		return CGSize(width: width, height: height)
+	}
+}
+
+/// A feed advertised by a `link` tag.
+public final class HTMLMetadataFeedLink {
+
+	public let title: String?
+	public let type: String?
+	public let urlString: String? // Absolute
+
+	/// - Parameters:
+	///   - urlString: Base URL string, passed to `absoluteURLStringWithDictionary` with the tag's attributes.
+	///   - tag: The tag to read attributes from. Every property is `nil` when it has none.
+	init(_ urlString: String, _ tag: HTMLTag) {
+
+		guard let attributes = tag.attributes else {
+			self.title = nil
+			self.type = nil
+			self.urlString = nil
+			return
+		}
+
+		self.urlString = absoluteURLStringWithDictionary(attributes, urlString)
+		self.title = attributes.object(forCaseInsensitiveKey: "title")
+		self.type = attributes.object(forCaseInsensitiveKey: "type")
+	}
+}
+
+/// A favicon described by a `link` tag.
+public final class HTMLMetadataFavicon {
+
+	public let type: String?
+	public let urlString: String?
+
+	/// - Parameters:
+	///   - urlString: Base URL string, passed to `absoluteURLStringWithDictionary` with the tag's attributes.
+	///   - tag: The tag to read attributes from. Every property is `nil` when it has none.
+	init(_ urlString: String, _ tag: HTMLTag) {
+
+		guard let attributes = tag.attributes else {
+			self.type = nil
+			self.urlString = nil
+			return
+		}
+
+		self.urlString = absoluteURLStringWithDictionary(attributes, urlString)
+		self.type = attributes.object(forCaseInsensitiveKey: "type")
+	}
+}
+
+/// Open Graph properties found in a page's `meta` tags.
+public final class HTMLOpenGraphProperties {
+
+	// TODO: the rest. At this writing (Nov. 26, 2017) I just care about og:image.
+	// See http://ogp.me/
+
+	public let image: HTMLOpenGraphImage?
+
+	/// - Parameters:
+	///   - urlString: Currently unused.
+	///   - tags: The tags to search for `og:image` properties.
+	init(_ urlString: String, _ tags: [HTMLTag]) {
+
+		self.image = Self.parse(tags)
+	}
+}
+
+private extension HTMLOpenGraphProperties {
+
+	private static let ogPrefix = "og:"
+
+	struct OGKey {
+		static let property = "property"
+		static let content = "content"
+	}
+
+	struct OGValue {
+		static let ogImage = "og:image"
+		static let ogImageURL = "og:image:url"
+		static let ogImageSecureURL = "og:image:secure_url"
+		static let ogImageType = "og:image:type"
+		static let ogImageAlt = "og:image:alt"
+		static let ogImageWidth = "og:image:width"
+		static let ogImageHeight = "og:image:height"
+	}
+
+	/// Folds the `og:image` properties of the `meta` tags into a single image.
+	/// Later tags overwrite values set by earlier tags for the same property.
+	/// Returns `nil` when no image property was found.
+	static func parse(_ tags: [HTMLTag]) -> HTMLOpenGraphImage? {
+
+		// HTMLOpenGraphImage properties to fill in.
+		var url: String?
+		var secureURL: String?
+		var mimeType: String?
+		var width: CGFloat?
+		var height: CGFloat?
+		var altText: String?
+
+		for tag in tags where tag.tagType == .meta {
+
+			guard let attributes = tag.attributes else {
+				continue
+			}
+			guard let propertyName = attributes[OGKey.property], propertyName.hasPrefix(ogPrefix) else {
+				continue
+			}
+			guard let content = attributes[OGKey.content] else {
+				continue
+			}
+
+			switch propertyName {
+			case OGValue.ogImage, OGValue.ogImageURL:
+				url = content
+			case OGValue.ogImageSecureURL:
+				secureURL = content
+			case OGValue.ogImageType:
+				mimeType = content
+			case OGValue.ogImageAlt:
+				altText = content
+			case OGValue.ogImageWidth:
+				// CGFloat cannot be created from a String directly; go through Double.
+				width = Double(content).map { CGFloat($0) }
+			case OGValue.ogImageHeight:
+				height = Double(content).map { CGFloat($0) }
+			default:
+				break
+			}
+		}
+
+		if url == nil && secureURL == nil && mimeType == nil && width == nil && height == nil && altText == nil {
+			return nil
+		}
+
+		return HTMLOpenGraphImage(url: url, secureURL: secureURL, mimeType: mimeType, width: width, height: height, altText: altText)
+	}
+}
+
+/// The image described by a page's `og:image` properties.
+public final class HTMLOpenGraphImage {
+
+	public let url: String?
+	public let secureURL: String?
+	public let mimeType: String?
+	public let width: CGFloat?
+	public let height: CGFloat?
+	public let altText: String?
+
+	init(url: String?, secureURL: String?, mimeType: String?, width: CGFloat?, height: CGFloat?, altText: String?) {
+
+		self.url = url
+		self.secureURL = secureURL
+		self.mimeType = mimeType
+		self.width = width
+		self.height = height
+		self.altText = altText
+	}
+}
+
+/// Twitter card properties found in a page's `meta` tags.
+public final class HTMLTwitterProperties {
+
+	public let imageURL: String? // twitter:image:src
+
+	private struct TwitterKey {
+		static let name = "name"
+		static let content = "content"
+	}
+
+	private struct TwitterValue {
+		static let imageSrc = "twitter:image:src"
+	}
+
+	/// Sets `imageURL` to the content of the first `meta` tag named
+	/// "twitter:image:src" that has non-empty content, or to `nil` when there is none.
+	///
+	/// - Parameters:
+	///   - urlString: Currently unused.
+	///   - tags: The tags to search.
+	init(_ urlString: String, _ tags: [HTMLTag]) {
+
+		var imageURL: String?
+		for tag in tags where tag.tagType == .meta {
+			guard let attributes = tag.attributes, attributes[TwitterKey.name] == TwitterValue.imageSrc else {
+				continue
+			}
+			guard let content = attributes[TwitterKey.content], !content.isEmpty else {
+				continue
+			}
+			imageURL = content
+			break
+		}
+
+		self.imageURL = imageURL
+	}
+}
+
diff --git a/Modules/Parser/Sources/SAX/HTMLTag.swift b/Modules/Parser/Sources/HTMLParser/HTMLTag.swift
similarity index 63%
rename from Modules/Parser/Sources/SAX/HTMLTag.swift
rename to Modules/Parser/Sources/HTMLParser/HTMLTag.swift
index 1333d9cff..e0bcfad5e 100644
--- a/Modules/Parser/Sources/SAX/HTMLTag.swift
+++ b/Modules/Parser/Sources/HTMLParser/HTMLTag.swift
@@ -7,6 +7,8 @@
 
 import Foundation
 
+public typealias HTMLTagAttributes = [String: String]
+
 public struct HTMLTag: Sendable {
 
 	public enum TagType: Sendable {
@@ -15,9 +17,9 @@ public struct HTMLTag: Sendable {
 	}
 
 	public let tagType: TagType
-	public let attributes: [String: String]?
+	public let attributes: HTMLTagAttributes?
 
-	public init(tagType: TagType, attributes: [String : String]?) {
+	public init(tagType: TagType, attributes: HTMLTagAttributes?) {
 		self.tagType = tagType
 		self.attributes = attributes
 	}
diff --git a/Modules/Parser/Tests/ParserTests/HTMLLinkTests.swift b/Modules/Parser/Tests/ParserTests/HTMLLinkTests.swift
index c179b8137..ac3c6f362 100644
--- a/Modules/Parser/Tests/ParserTests/HTMLLinkTests.swift
+++ b/Modules/Parser/Tests/ParserTests/HTMLLinkTests.swift
@@ -8,7 +8,6 @@
 
 import XCTest
 import HTMLParser
-import SAX
 import libxml2
 
 class HTMLLinkTests: XCTestCase {