//
//  HTMLEntityDecoder.swift
//
//
//  Created by Brent Simmons on 9/14/24.
//

import Foundation
|
|
|
|
|
|
|
|
|
|
public final class HTMLEntityDecoder {

	/// Returns `encodedString` with its HTML entities replaced by the characters they stand for.
	///
	/// Text between entities is copied through unchanged. An ampersand that does not start a
	/// decodable entity is kept as a literal `&`, and scanning resumes with the character
	/// immediately after it.
	///
	/// - Parameter encodedString: text that may contain HTML entities.
	/// - Returns: the decoded text, or `encodedString` itself when no entity was decoded.
	public static func decodedString(_ encodedString: String) -> String {

		let scanner = EntityScanner(string: encodedString)
		var result = ""
		var didDecodeAtLeastOneEntity = false

		while !scanner.isAtEnd {

			// Copy everything before the next ampersand. The scanner also consumes
			// the ampersand itself, so `scanLocation` now points just past it.
			result.append(scanner.scanUpToAmpersand())

			if scanner.isAtEnd {
				// Every call above consumes at least one character, so when the input
				// ends with "&" that ampersand was just swallowed — keep it as text.
				if encodedString.last == "&" {
					result.append("&")
				}
				break
			}

			let savedScanLocation = scanner.scanLocation

			if let decodedEntity = scanner.scanEntityValue() {
				result.append(decodedEntity)
				didDecodeAtLeastOneEntity = true
			}
			else {
				// Not an entity: emit the ampersand literally and resume with the
				// character right after it (the ampersand was already consumed, so
				// adding 1 here would drop that character).
				result.append("&")
				scanner.scanLocation = savedScanLocation
			}
		}

		if !didDecodeAtLeastOneEntity { // No entities decoded?
			return encodedString
		}
		return result
	}
}
|
2024-09-15 23:26:01 +02:00
|
|
|
|
|
|
|
|
|
/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
///
/// Locations are counted in `Character`s from the start of `string`.
final class EntityScanner {

	let string: String
	let count: Int
	var scanLocation = 0

	/// The characters of `string`, cached so that reading the character at
	/// `scanLocation` does not require walking the string from its start each time.
	private let characters: [Character]

	/// True when `scanLocation` is at or past the end of the string.
	var isAtEnd: Bool {
		scanLocation >= count
	}

	/// The character at `scanLocation`, or `nil` when the location is outside the string.
	var currentCharacter: Character? {
		guard scanLocation >= 0, !isAtEnd else {
			return nil
		}
		return characters[scanLocation]
	}

	init(string: String) {
		let characters = Array(string)
		self.string = string
		self.characters = characters
		self.count = characters.count
	}

	static let ampersandCharacter = Character("&")

	/// Scans up to the next `&` and returns the characters before it.
	///
	/// The ampersand itself is consumed: afterwards `scanLocation` points just past it,
	/// or at the end of the string when no ampersand was found.
	/// - Returns: the scanned portion before the `&`. May be empty string.
	func scanUpToAmpersand() -> String {

		var scanned = ""

		while let ch = currentCharacter {
			scanLocation += 1
			if ch == Self.ampersandCharacter {
				break
			}
			scanned.append(ch)
		}

		return scanned
	}

	static let semicolonCharacter = Character(";")

	/// Scans an entity body (the part after `&`) starting at `scanLocation` and decodes it.
	///
	/// - Returns: the decoded entity, with `scanLocation` moved past the terminating `;`.
	///   Returns `nil` — with `scanLocation` restored to where it started — when no `;`
	///   is found before whitespace, another `&`, the end of the string, or the length
	///   limit, or when the entity is not one that can be decoded.
	func scanEntityValue() -> String? {

		let initialScanLocation = scanLocation
		let maxEntityLength = 20 // It’s probably smaller, but this is just for sanity.

		while let ch = currentCharacter {

			if ch == Self.semicolonCharacter {
				let entity = String(characters[initialScanLocation..<scanLocation])
				// Unknown entities are ordinary input (the lookup table is not
				// exhaustive), so this is a normal failure rather than an assertion.
				guard let decoded = decodedEntity(entity) else {
					break
				}
				scanLocation += 1 // Move past the semicolon.
				return decoded
			}

			// An entity never contains whitespace or another ampersand.
			if ch == Self.ampersandCharacter {
				break
			}
			if let firstScalar = ch.unicodeScalars.first, CharacterSet.whitespacesAndNewlines.contains(firstScalar) {
				break
			}

			scanLocation += 1
			if scanLocation - initialScanLocation > maxEntityLength {
				break
			}
		}

		scanLocation = initialScanLocation
		return nil
	}
}
|
2024-09-16 06:51:48 +02:00
|
|
|
|
|
|
|
|
|
extension String {

	/// Converts a `Character` offset from the start of the string into a `String.Index`.
	///
	/// - Returns: the index for offset `i` (which is `endIndex` when `i == count`),
	///   or `nil` when `i` is negative or greater than `count`.
	func indexForInt(_ i: Int) -> Index? {

		// A negative offset is not bounded by the `endIndex` limit and would trap.
		guard i >= 0 else {
			return nil
		}
		return index(startIndex, offsetBy: i, limitedBy: endIndex)
	}

	/// Returns the character at `Character` offset `i`, or `nil` when `i` is out of bounds.
	func characterAtIntIndex(_ i: Int) -> Character? {

		// `indexForInt` can return `endIndex`, which is not a valid subscript position.
		guard let index = indexForInt(i), index < endIndex else {
			return nil
		}
		return self[index]
	}

	/// Returns the substring covering the given range of `Character` offsets,
	/// or `nil` when any part of the range lies outside the string.
	func substring(intRange: Range<Int>) -> String? {

		guard let rangeLower = indexForInt(intRange.lowerBound) else {
			return nil
		}
		// Advance from the lower bound rather than walking from the start a second time.
		guard let rangeUpper = index(rangeLower, offsetBy: intRange.count, limitedBy: endIndex) else {
			return nil
		}
		return String(self[rangeLower..<rangeUpper])
	}
}
|
|
|
|
|
|
2024-09-17 06:56:55 +02:00
|
|
|
|
/// Decodes a single HTML entity into the text it represents.
///
/// Handles named entities (looked up in `entitiesDictionary`), hexadecimal numeric
/// references (`#x…` / `#X…`) and decimal numeric references (`#…`).
///
/// - Parameter rawEntity: the entity text. It may or may not have leading `&` and/or
///   trailing `;` characters.
/// - Returns: the decoded text, or `nil` when the entity is not recognized, its number
///   is zero or does not fit in 32 bits, or the number cannot be turned into a string.
private func decodedEntity(_ rawEntity: String) -> String? {

	var s = rawEntity

	if s.hasPrefix("&") {
		s.removeFirst()
	}
	if s.hasSuffix(";") {
		s.removeLast()
	}

	if let decodedEntity = entitiesDictionary[s] {
		return decodedEntity
	}

	if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex
		let scanner = Scanner(string: s)
		scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX")
		var hexValue: UInt64 = 0
		guard scanner.scanHexInt64(&hexValue) else {
			return nil
		}
		// `UInt32(hexValue)` would trap on values that don't fit; reject them instead.
		// Zero is rejected to match the decimal branch below.
		guard let value = UInt32(exactly: hexValue), value >= 1 else {
			return nil
		}
		return stringWithValue(value)
	}
	else if s.hasPrefix("#") { // Decimal
		s.removeFirst()
		guard let value = UInt32(s), value >= 1 else {
			return nil
		}
		return stringWithValue(value)
	}

	return nil
}
|
2024-09-17 06:56:55 +02:00
|
|
|
|
|
|
|
|
|
/// Returns the one-character string for a numeric character reference.
///
/// Values in `0x80...0x9F` are remapped through the table below before conversion.
///
/// - Parameter value: the number from the character reference.
/// - Returns: the string for that code point, or `nil` when the (remapped) value is not
///   a valid Unicode scalar — a surrogate, or anything above `0x10FFFF`.
private func stringWithValue(_ value: UInt32) -> String? {

	// From WebCore's HTMLEntityParser
	let windowsLatin1ExtensionArray: [UInt32] = [
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
	]

	var codePoint = value
	if (0x80...0x9F).contains(value) { // value >= 128 && value < 160
		codePoint = windowsLatin1ExtensionArray[Int(value - 0x80)]
	}

	// Unicode.Scalar validates the code point, so no byte-swapping or
	// UTF-32 round trip through Data is needed.
	guard let scalar = Unicode.Scalar(codePoint) else {
		return nil
	}
	return String(scalar)
}
|
|
|
|
|
|
|
|
|
|
/// Named HTML entities (without the leading `&` and trailing `;`) mapped to the text
/// they decode to. Keys are case-sensitive.
private let entitiesDictionary: [String: String] =
[
	"AElig": "Æ",
	"Aacute": "Á",
	"Acirc": "Â",
	"Agrave": "À",
	"Aring": "Å",
	"Atilde": "Ã",
	"Auml": "Ä",
	"Ccedil": "Ç",
	// NOTE(review): kept as-is; the HTML named character reference list maps
	// Dstrok to U+0110 (Đ) rather than to the same character as ETH — confirm intent.
	"Dstrok": "Ð",
	"ETH": "Ð",
	"Eacute": "É",
	"Ecirc": "Ê",
	"Egrave": "È",
	"Euml": "Ë",
	"Iacute": "Í",
	"Icirc": "Î",
	"Igrave": "Ì",
	"Iuml": "Ï",
	"Ntilde": "Ñ",
	"Oacute": "Ó",
	"Ocirc": "Ô",
	"Ograve": "Ò",
	"Oslash": "Ø",
	"Otilde": "Õ",
	"Ouml": "Ö",
	"Pi": "Π",
	"THORN": "Þ",
	"Uacute": "Ú",
	"Ucirc": "Û",
	"Ugrave": "Ù",
	"Uuml": "Ü",
	"Yacute": "Ý",
	"aacute": "á",
	"acirc": "â",
	"acute": "´",
	"aelig": "æ",
	"agrave": "à",
	"amp": "&",
	"apos": "'",
	"aring": "å",
	"atilde": "ã",
	"auml": "ä",
	"brkbar": "¦",
	"brvbar": "¦",
	"ccedil": "ç",
	"cedil": "¸",
	"cent": "¢",
	"copy": "©",
	"curren": "¤",
	"deg": "°",
	"die": "¨",
	"divide": "÷",
	"eacute": "é",
	"ecirc": "ê",
	"egrave": "è",
	"eth": "ð",
	"euml": "ë",
	"euro": "€",
	"frac12": "½",
	"frac14": "¼",
	"frac34": "¾",
	"gt": ">",
	"hearts": "♥",
	"hellip": "…",
	"iacute": "í",
	"icirc": "î",
	"iexcl": "¡",
	"igrave": "ì",
	"iquest": "¿",
	"iuml": "ï",
	"laquo": "«",
	"ldquo": "“",
	"lsquo": "‘",
	"lt": "<",
	"macr": "¯",
	"mdash": "—",
	"micro": "µ",
	"middot": "·",
	"ndash": "–",
	"not": "¬",
	"ntilde": "ñ",
	"oacute": "ó",
	"ocirc": "ô",
	"ograve": "ò",
	"ordf": "ª",
	"ordm": "º",
	"oslash": "ø",
	"otilde": "õ",
	"ouml": "ö",
	"para": "¶",
	"pi": "π",
	"plusmn": "±",
	"pound": "£",
	"quot": "\"",
	"raquo": "»",
	"rdquo": "”",
	"reg": "®",
	"rsquo": "’",
	"sect": "§",
	"shy": "\u{00AD}", // Soft hyphen (173); invisible, so written as an escape.
	"sup1": "¹",
	"sup2": "²",
	"sup3": "³",
	"szlig": "ß",
	"thorn": "þ",
	"times": "×",
	"trade": "™",
	"uacute": "ú",
	"ucirc": "û",
	"ugrave": "ù",
	"uml": "¨",
	"uuml": "ü",
	"yacute": "ý",
	"yen": "¥",
	"yuml": "ÿ",
	"infin": "∞",
	"nbsp": "\u{00A0}" // No-break space (160); written as an escape so it stays visible.
]
|