NetNewsWire/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift

350 lines
6.8 KiB
Swift
Raw Normal View History

2024-09-15 00:40:27 +02:00
//
// HTMLEntityDecoder.swift
//
//
// Created by Brent Simmons on 9/14/24.
//
import Foundation
public final class HTMLEntityDecoder {

	/// Returns a copy of `encodedString` with HTML entities (`&amp;`, `&#8212;`, `&#x2014;`, …) decoded.
	///
	/// Ampersands that do not start a decodable entity are kept as literal text.
	/// When nothing could be decoded, `encodedString` itself is returned.
	///
	/// - Parameter encodedString: text that may contain HTML entities.
	/// - Returns: the decoded text, or `encodedString` when it contains no decodable entities.
	public static func decodedString(_ encodedString: String) -> String {

		let scanner = EntityScanner(string: encodedString)
		var result = ""
		var didDecodeAtLeastOneEntity = false

		while !scanner.isAtEnd {

			// Copy the plain text preceding the next ampersand.
			// Note: scanUpToAmpersand() also consumes the ampersand it stops on.
			result.append(scanner.scanUpToAmpersand())

			if scanner.isAtEnd {
				// Having consumed the final character, the scan can only have stopped
				// on an ampersand if that final character is one. Keep it as a literal
				// instead of silently dropping it.
				if encodedString.last == "&" {
					result.append("&")
				}
				break
			}

			// The scanner now sits on the first character after the ampersand.
			let savedScanLocation = scanner.scanLocation

			if let decodedEntity = scanner.scanEntityValue() {
				result.append(decodedEntity)
				didDecodeAtLeastOneEntity = true
			}
			else {
				// Not an entity: emit the ampersand as-is and resume scanning right
				// after it, so the character that follows is not skipped.
				result.append("&")
				scanner.scanLocation = savedScanLocation
			}
		}

		if !didDecodeAtLeastOneEntity { // No entities decoded?
			return encodedString
		}
		return result
	}
}
2024-09-15 23:26:01 +02:00
/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
///
/// Positions are zero-based `Character` offsets into `string`.
final class EntityScanner {

	/// The text being scanned.
	let string: String

	/// Number of `Character`s in `string`.
	let count: Int

	/// Offset of the next character to be scanned.
	var scanLocation = 0

	/// Snapshot of the characters of `string`, so that reading the character at
	/// `scanLocation` is a constant-time array lookup instead of a walk from the
	/// start of the string on every access.
	private let characters: [Character]

	/// `true` once `scanLocation` has reached or passed the end of `string`.
	var isAtEnd: Bool {
		scanLocation >= count
	}

	/// The character at `scanLocation`, or `nil` when the location is out of bounds.
	var currentCharacter: Character? {
		guard scanLocation >= 0, !isAtEnd else {
			return nil
		}
		return characters[scanLocation]
	}

	init(string: String) {
		let characters = Array(string)
		self.string = string
		self.characters = characters
		self.count = characters.count
	}

	static let ampersandCharacter = Character("&")

	/// Scans forward to the next `&` and consumes it.
	///
	/// On return, `scanLocation` is just past the ampersand, or at the end of the
	/// string when no ampersand was found.
	///
	/// - Returns: the characters scanned before the ampersand (the ampersand itself
	///   is not included). May be an empty string.
	func scanUpToAmpersand() -> String {

		var scanned = ""

		while let ch = currentCharacter {
			scanLocation += 1
			if ch == Self.ampersandCharacter {
				break
			}
			scanned.append(ch)
		}

		return scanned
	}

	static let semicolonCharacter = Character(";")

	/// Scans an entity body. Expects `scanLocation` to be just past the `&` and
	/// reads up to the terminating `;`.
	///
	/// On success, `scanLocation` is left just past the `;`. On failure (no `;`
	/// before whitespace, the end of the string, or the length limit; or the text
	/// is not a decodable entity), `scanLocation` is restored to where it was
	/// when the method was called.
	///
	/// - Returns: the decoded entity, or `nil` when no entity could be decoded.
	func scanEntityValue() -> String? {

		let initialScanLocation = scanLocation
		let maxEntityLength = 20 // It's probably smaller, but this is just for sanity.

		while let ch = currentCharacter {

			// Whitespace can't appear inside an entity.
			if let firstScalar = ch.unicodeScalars.first, CharacterSet.whitespacesAndNewlines.contains(firstScalar) {
				break
			}

			if ch == Self.semicolonCharacter {
				let entity = String(characters[initialScanLocation..<scanLocation])
				guard let decoded = decodedEntity(entity) else {
					// Unknown or malformed entities are ordinary input, not a programmer error.
					break
				}
				scanLocation += 1 // Consume the semicolon.
				return decoded
			}

			scanLocation += 1
			if scanLocation - initialScanLocation > maxEntityLength {
				break
			}
		}

		scanLocation = initialScanLocation
		return nil
	}
}
extension String {

	/// Converts a zero-based `Character` offset into a `String.Index`.
	///
	/// - Parameter i: the character offset from the start of the string.
	/// - Returns: the matching index (`endIndex` when `i == count`), or `nil` when
	///   `i` is negative or lies beyond the end of the string.
	func indexForInt(_ i: Int) -> Index? {
		// `limitedBy: endIndex` does not stop a negative offset from moving before
		// `startIndex`, so negative offsets have to be rejected up front.
		guard i >= 0 else {
			return nil
		}
		return index(startIndex, offsetBy: i, limitedBy: endIndex)
	}

	/// Returns the character at the zero-based offset `i`, or `nil` when `i` is out of bounds.
	func characterAtIntIndex(_ i: Int) -> Character? {
		// `endIndex` is a valid index but not a valid subscript position.
		guard let index = indexForInt(i), index < endIndex else {
			return nil
		}
		return self[index]
	}

	/// Returns the characters in the half-open offset range `intRange`, or `nil`
	/// when either bound falls outside the string.
	func substring(intRange: Range<Int>) -> String? {
		guard let rangeLower = indexForInt(intRange.lowerBound) else {
			return nil
		}
		// Walk on from the lower bound rather than starting again from the beginning.
		guard let rangeUpper = index(rangeLower, offsetBy: intRange.count, limitedBy: endIndex) else {
			return nil
		}
		return String(self[rangeLower..<rangeUpper])
	}
}
/// Decodes a single HTML entity: a named entity (`amp`), a decimal character
/// reference (`#8212`), or a hexadecimal character reference (`#x2014`).
///
/// rawEntity may or may not have leading `&` and/or trailing `;` characters.
///
/// - Parameter rawEntity: the entity text, with or without its delimiters.
/// - Returns: the decoded string, or `nil` when `rawEntity` is not a known name or
///   not a well-formed, non-zero numeric reference.
private func decodedEntity(_ rawEntity: String) -> String? {

	var s = rawEntity

	if s.hasPrefix("&") {
		s.removeFirst()
	}
	if s.hasSuffix(";") {
		s.removeLast()
	}

	if let namedEntity = entitiesDictionary[s] {
		return namedEntity
	}

	guard s.hasPrefix("#") else {
		return nil
	}

	let isHex = s.hasPrefix("#x") || s.hasPrefix("#X")
	let digits = s.dropFirst(isHex ? 2 : 1)

	// The failable initializer returns nil for empty input, stray characters and
	// values that overflow UInt32, so oversized references such as
	// `#xFFFFFFFFFF` are rejected instead of trapping in a narrowing conversion.
	guard let value = UInt32(digits, radix: isHex ? 16 : 10), value >= 1 else {
		return nil
	}

	return stringWithValue(value)
}
/// Returns the one-character string for the numeric character reference `value`.
///
/// Values 0x80–0x9F are interpreted as Windows-1252 characters rather than as
/// C1 control codes.
///
/// - Parameter value: the code point named by a numeric character reference.
/// - Returns: the string for that code point, or `nil` when `value` is not a valid
///   Unicode scalar (a surrogate, or greater than 0x10FFFF).
private func stringWithValue(_ value: UInt32) -> String? {

	// From WebCore's HTMLEntityParser
	let windowsLatin1ExtensionArray: [UInt32] = [
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
	]

	var codePoint = value
	if (codePoint & ~0x1F) == 0x80 { // value >= 128 && value < 160
		codePoint = windowsLatin1ExtensionArray[Int(codePoint - 0x80)]
	}

	// Unicode.Scalar's failable initializer rejects surrogates and out-of-range values.
	guard let scalar = Unicode.Scalar(codePoint) else {
		return nil
	}
	return String(Character(scalar))
}
/// Named HTML entities (without the leading `&` and trailing `;`) mapped to their
/// decoded text. Names are case-sensitive.
///
/// Characters that are invisible or easily damaged by a wrong text encoding are
/// written as `\u{…}` escapes so that the table stays intact however the file is
/// saved or displayed.
private let entitiesDictionary: [String: String] =
[
	"AElig": "Æ",
	"Aacute": "Á",
	"Acirc": "Â",
	"Agrave": "À",
	"Aring": "Å",
	"Atilde": "Ã",
	"Auml": "Ä",
	"Ccedil": "Ç",
	"Dstrok": "\u{0110}", // Đ (D with stroke), which is distinct from ETH below
	"ETH": "Ð",
	"Eacute": "É",
	"Ecirc": "Ê",
	"Egrave": "È",
	"Euml": "Ë",
	"Iacute": "Í",
	"Icirc": "Î",
	"Igrave": "Ì",
	"Iuml": "Ï",
	"Ntilde": "Ñ",
	"Oacute": "Ó",
	"Ocirc": "Ô",
	"Ograve": "Ò",
	"Oslash": "Ø",
	"Otilde": "Õ",
	"Ouml": "Ö",
	"Pi": "Π",
	"THORN": "Þ",
	"Uacute": "Ú",
	"Ucirc": "Û",
	"Ugrave": "Ù",
	"Uuml": "Ü",
	"Yacute": "\u{00DD}", // Ý
	"aacute": "á",
	"acirc": "â",
	"acute": "´",
	"aelig": "æ",
	"agrave": "à",
	"amp": "&",
	"apos": "'",
	"aring": "å",
	"atilde": "ã",
	"auml": "ä",
	"brkbar": "¦",
	"brvbar": "¦",
	"ccedil": "ç",
	"cedil": "¸",
	"cent": "¢",
	"copy": "©",
	"curren": "¤",
	"deg": "°",
	"die": "¨",
	"divide": "÷",
	"eacute": "é",
	"ecirc": "ê",
	"egrave": "è",
	"eth": "ð",
	"euml": "ë",
	"euro": "\u{20AC}", // €
	"frac12": "½",
	"frac14": "¼",
	"frac34": "¾",
	"gt": ">",
	"hearts": "\u{2665}", // ♥
	"hellip": "\u{2026}", // …
	"iacute": "í",
	"icirc": "î",
	"iexcl": "¡",
	"igrave": "ì",
	"iquest": "¿",
	"iuml": "ï",
	"laquo": "«",
	"ldquo": "\u{201C}", // left double quotation mark
	"lsquo": "\u{2018}", // left single quotation mark
	"lt": "<",
	"macr": "¯",
	"mdash": "\u{2014}", // em dash
	"micro": "µ",
	"middot": "·",
	"ndash": "\u{2013}", // en dash
	"not": "¬",
	"ntilde": "ñ",
	"oacute": "ó",
	"ocirc": "ô",
	"ograve": "ò",
	"ordf": "ª",
	"ordm": "º",
	"oslash": "ø",
	"otilde": "õ",
	"ouml": "ö",
	"para": "\u{00B6}", // ¶
	"pi": "π",
	"plusmn": "±",
	"pound": "£",
	"quot": "\"",
	"raquo": "»",
	"rdquo": "\u{201D}", // right double quotation mark
	"reg": "®",
	"rsquo": "\u{2019}", // right single quotation mark
	"sect": "§",
	"shy": "\u{00AD}", // soft hyphen (code point 173)
	"sup1": "¹",
	"sup2": "²",
	"sup3": "³",
	"szlig": "ß",
	"thorn": "þ",
	"times": "×",
	"trade": "\u{2122}", // ™
	"uacute": "ú",
	"ucirc": "û",
	"ugrave": "ù",
	"uml": "¨",
	"uuml": "ü",
	"yacute": "\u{00FD}", // ý
	"yen": "¥",
	"yuml": "ÿ",
	"infin": "\u{221E}", // ∞
	"nbsp": "\u{00A0}" // no-break space (code point 160)
]