NetNewsWire/Modules/Parser/Sources/SAX/SAXHTMLParser.swift

202 lines
4.6 KiB
Swift
Raw Normal View History

2024-08-26 20:53:57 -07:00
//
2024-09-21 11:47:07 -07:00
// SAXHTMLParser.swift
//
//
// Created by Brent Simmons on 8/26/24.
//
import Foundation
import libxml2
2024-09-21 21:49:57 -07:00
/// Receives the element and character events that a `SAXHTMLParser` emits while parsing.
///
/// NOTE(review): the `XMLPointer` arguments come straight from libxml2 callbacks and are
/// presumably only valid for the duration of each call — copy anything you need to keep.
public protocol SAXHTMLParserDelegate: AnyObject {

    /// Called when an element starts.
    ///
    /// - Parameters:
    ///   - startElement: The element name as a C string pointer.
    ///   - attributes: The raw attribute array, or `nil`. It can be turned into a dictionary
    ///     with `SAXHTMLParser.attributesDictionary(_:)`.
    func saxHTMLParser(_: SAXHTMLParser, startElement: XMLPointer, attributes: UnsafePointer<XMLPointer?>?)

    /// Called when an element ends.
    ///
    /// Any characters stored via `beginStoringCharacters()` are still readable from the parser
    /// during this call; storage is reset right after it returns.
    ///
    /// - Parameter endElement: The element name as a C string pointer.
    func saxHTMLParser(_: SAXHTMLParser, endElement: XMLPointer)

    /// Called when character data is found.
    ///
    /// - Parameters:
    ///   - charactersFound: Pointer to the first byte of the character data.
    ///   - count: Number of bytes available. Guaranteed to be greater than 0.
    func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int)
}
2024-09-21 21:49:57 -07:00
/// A SAX-style HTML parser built on libxml2's push parser.
///
/// Create it with a delegate and the HTML bytes, then call `parse()`. Element and character
/// events are reported to the delegate synchronously from inside `parse()`.
public final class SAXHTMLParser {

    /// The receiver of parse events. Held strongly for the lifetime of the parser.
    fileprivate let delegate: SAXHTMLParserDelegate

    /// The bytes collected since `beginStoringCharacters()` was called, or `nil` when
    /// character storage is not active.
    public var currentCharacters: Data? { // UTF-8 encoded
        guard storingCharacters else {
            return nil
        }
        return characters
    }

    // Conveniences to get string version of currentCharacters

    /// `currentCharacters` decoded as UTF-8, or `nil` when nothing is stored, the stored data
    /// is empty, or the bytes are not valid UTF-8.
    public var currentString: String? {
        guard let d = currentCharacters, !d.isEmpty else {
            return nil
        }
        return String(data: d, encoding: .utf8)
    }

    /// `currentString` with leading and trailing whitespace and newlines removed.
    /// May be an empty string when the stored text is whitespace only.
    public var currentStringWithTrimmedWhitespace: String? {
        guard let s = currentString else {
            return nil
        }
        return s.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    private let data: Data
    private var storingCharacters = false
    private var characters = Data()

    /// Creates a parser.
    ///
    /// - Parameters:
    ///   - delegate: Receives the parse events.
    ///   - data: The HTML document bytes to parse.
    public init(delegate: SAXHTMLParserDelegate, data: Data) {
        self.delegate = delegate
        self.data = data
    }

    /// Parses the data passed to `init`, calling the delegate as events occur.
    ///
    /// Does nothing when the data is empty or when libxml2 cannot create a parser context.
    public func parse() {
        guard !data.isEmpty else {
            return
        }

        data.withUnsafeBytes { bufferPointer in
            guard let bytes = bufferPointer.bindMemory(to: CChar.self).baseAddress else {
                return
            }
            let byteCount = bufferPointer.count

            // libxml2 takes lengths as C ints. Encoding detection is documented as looking at
            // the first bytes of the content, so clamping the length is harmless here.
            let characterEncoding = xmlDetectCharEncoding(bytes, Int32(clamping: byteCount))

            // `self` is passed unretained as the callback user data; it stays alive because
            // the whole parse happens inside this method call.
            guard let context = htmlCreatePushParserCtxt(&saxHandlerStruct, Unmanaged.passUnretained(self).toOpaque(), nil, 0, nil, characterEncoding) else {
                return
            }
            defer {
                htmlFreeParserCtxt(context)
            }

            htmlCtxtUseOptions(context, Int32(HTML_PARSE_RECOVER.rawValue | HTML_PARSE_NONET.rawValue | HTML_PARSE_COMPACT.rawValue | HTML_PARSE_NOERROR.rawValue | HTML_PARSE_NOWARNING.rawValue))

            // Feed the buffer in pieces no larger than Int32.max so that oversized input does
            // not trap on the Int -> Int32 conversion. Typical input goes through in one call.
            let maxChunkLength = Int(Int32.max)
            var offset = 0
            while offset < byteCount {
                let chunkLength = min(byteCount - offset, maxChunkLength)
                htmlParseChunk(context, bytes + offset, Int32(chunkLength), 0)
                offset += chunkLength
            }

            // A nil chunk with terminate == 1 tells the push parser that input has ended.
            htmlParseChunk(context, nil, 0, 1)
        }
    }

    /// Delegate can call from xmlStartElement. Characters will be available in xmlEndElement as currentCharacters property. Storing characters is stopped after each xmlEndElement.
    public func beginStoringCharacters() {
        storingCharacters = true
        characters.count = 0
    }

    /// Stops storing characters and discards whatever was collected.
    public func endStoringCharacters() {
        storingCharacters = false
        characters.count = 0
    }

    public typealias HTMLAttributesDictionary = [String: String]

    /// Converts the raw attribute array handed to the start-element callback into a dictionary.
    ///
    /// The array is read as alternating name/value entries and ends at the first `nil` name.
    /// A `nil` value is stored as an empty string. When a name repeats, the last value wins.
    ///
    /// - Parameter attributes: The attribute array from the start-element callback, or `nil`.
    /// - Returns: `nil` when `attributes` is `nil`; otherwise the (possibly empty) dictionary.
    public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?) -> HTMLAttributesDictionary? {
        guard let attributes else {
            return nil
        }

        var dictionary = HTMLAttributesDictionary()
        var index = 0

        while let name = attributes[index] {
            // The slot after a name always exists; it holds the value or nil.
            let value = attributes[index + 1].map { String(cString: $0) } ?? ""
            dictionary[String(cString: name)] = value
            index += 2
        }

        return dictionary
    }
}
// Entry points used by the libxml2 callbacks; each one forwards to the delegate.
private extension SAXHTMLParser {

    /// Appends the bytes to the stored characters when storage is active, then notifies the delegate.
    ///
    /// - Parameters:
    ///   - htmlCharacters: Pointer to the first byte of the character data.
    ///   - count: Number of bytes to read from `htmlCharacters`.
    func charactersFound(_ htmlCharacters: XMLPointer, count: Int) {
        if storingCharacters {
            characters.append(htmlCharacters, count: count)
        }
        delegate.saxHTMLParser(self, charactersFound: htmlCharacters, count: count)
    }

    /// Notifies the delegate that an element started.
    func startElement(_ name: XMLPointer, attributes: UnsafePointer<XMLPointer?>?) {
        delegate.saxHTMLParser(self, startElement: name, attributes: attributes)
    }

    /// Notifies the delegate that an element ended, then stops storing characters.
    func endElement(_ name: XMLPointer) {
        delegate.saxHTMLParser(self, endElement: name)
        // Reset only after the delegate call so the delegate can still read the stored characters.
        endStoringCharacters()
    }
}
2024-09-22 11:33:37 -07:00
/// Recovers the `SAXHTMLParser` from the opaque user-data pointer given to a libxml2 callback.
///
/// The pointer was created with `Unmanaged.passUnretained`, so it is read back unretained as
/// well; the retain count is left untouched.
private func parser(from context: UnsafeMutableRawPointer) -> SAXHTMLParser {
    Unmanaged<SAXHTMLParser>.fromOpaque(context).takeUnretainedValue()
}
2024-09-22 11:33:37 -07:00
/// The libxml2 SAX handler shared by every `SAXHTMLParser`.
///
/// It installs `characters`, `startElement`, and `endElement` callbacks that look up the
/// parser from the callback's user-data pointer and forward the event to it. Calls that
/// arrive without the needed pointers are ignored. Declared `var` because it is passed by
/// address when the parser context is created.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {

    var handler = htmlSAXHandler()

    handler.characters = { (context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) in
        // Skipping empty runs is what lets the delegate rely on count > 0.
        guard let context, let ch, len > 0 else {
            return
        }
        parser(from: context).charactersFound(ch, count: Int(len))
    }

    handler.startElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?, attributes: UnsafeMutablePointer<XMLPointer?>?) in
        guard let context, let name else {
            return
        }
        parser(from: context).startElement(name, attributes: attributes)
    }

    handler.endElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?) in
        guard let context, let name else {
            return
        }
        parser(from: context).endElement(name)
    }

    return handler
}()