// // HTMLEntityDecoder.swift // // // Created by Brent Simmons on 9/14/24. // import Foundation public final class HTMLEntityDecoder { public static func decodedString(_ encodedString: String) -> String { let scanner = EntityScanner(string: encodedString) var result = "" var didDecodeAtLeastOneEntity = false while true { let scannedString = scanner.scanUpToAmpersand() if !scannedString.isEmpty { result.append(scannedString) } if scanner.isAtEnd { break } let savedScanLocation = scanner.scanLocation if let decodedEntity = scanner.scanEntityValue() { result.append(decodedEntity) didDecodeAtLeastOneEntity = true } else { result.append("&") scanner.scanLocation = savedScanLocation + 1 } if scanner.isAtEnd { break } } if !didDecodeAtLeastOneEntity { // No entities decoded? return encodedString } return result } } /// Purpose-built version of NSScanner, which has deprecated the parts we want to use. final class EntityScanner { let string: String let count: Int var scanLocation = 0 var isAtEnd: Bool { scanLocation >= count } var currentCharacter: Character? { guard !isAtEnd else { return nil } return string.characterAtIntIndex(scanLocation) } init(string: String) { self.string = string self.count = string.count } static let ampersandCharacter = Character("&") /// Scans up to `characterToFind` and returns the characters up to (and not including) `characterToFind`. /// - Returns: the scanned portion before `characterToFind`. May be empty string. func scanUpToAmpersand() -> String { let characterToFind = Self.ampersandCharacter var scanned = "" while true { guard let ch = currentCharacter else { break } scanLocation += 1 if ch == characterToFind { break } else { scanned.append(ch) } } return scanned } static let semicolonCharacter = Character(";") func scanEntityValue() -> String? { let initialScanLocation = scanLocation let maxEntityLength = 20 // It’s probably smaller, but this is just for sanity. while true { guard let ch = currentCharacter else { break } if CharacterSet.whitespacesAndNewlines.contains(ch.unicodeScalars.first!) { break } if ch == Self.semicolonCharacter { let entityRange = initialScanLocation.. maxEntityLength { break } if isAtEnd { break } } return nil } } extension String { func indexForInt(_ i: Int) -> Index? { index(startIndex, offsetBy: i, limitedBy: endIndex) } func characterAtIntIndex(_ i: Int) -> Character? { guard let index = indexForInt(i) else { return nil } return self[index] } func substring(intRange: Range) -> String? { guard let rangeLower = indexForInt(intRange.lowerBound) else { return nil } guard let rangeUpper = indexForInt(intRange.upperBound) else { return nil } return String(self[rangeLower.. String? { var s = rawEntity if s.hasPrefix("&") { s.removeFirst() } if s.hasSuffix(";") { s.removeLast() } if let decodedEntity = entitiesDictionary[s] { return decodedEntity } if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex let scanner = Scanner(string: s) scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX") var hexValue: UInt64 = 0 if scanner.scanHexInt64(&hexValue) { return stringWithValue(UInt32(hexValue)) } return nil } else if s.hasPrefix("#") { s.removeFirst() guard let value = UInt32(s), value >= 1 else { return nil } return stringWithValue(value) } return nil } private func stringWithValue(_ value: UInt32) -> String? { // From WebCore's HTMLEntityParser let windowsLatin1ExtensionArray: [UInt32] = [ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F ] var modifiedValue = value if (modifiedValue & ~0x1F) == 0x80 { // value >= 128 && value < 160 modifiedValue = windowsLatin1ExtensionArray[Int(modifiedValue - 0x80)] } modifiedValue = CFSwapInt32HostToLittle(modifiedValue) let data = Data(bytes: &modifiedValue, count: MemoryLayout.size(ofValue: modifiedValue)) return String(data: data, encoding: .utf32LittleEndian) } private let entitiesDictionary = [ "AElig": "Æ", "Aacute": "Á", "Acirc": "Â", "Agrave": "À", "Aring": "Å", "Atilde": "Ã", "Auml": "Ä", "Ccedil": "Ç", "Dstrok": "Ð", "ETH": "Ð", "Eacute": "É", "Ecirc": "Ê", "Egrave": "È", "Euml": "Ë", "Iacute": "Í", "Icirc": "Î", "Igrave": "Ì", "Iuml": "Ï", "Ntilde": "Ñ", "Oacute": "Ó", "Ocirc": "Ô", "Ograve": "Ò", "Oslash": "Ø", "Otilde": "Õ", "Ouml": "Ö", "Pi": "Π", "THORN": "Þ", "Uacute": "Ú", "Ucirc": "Û", "Ugrave": "Ù", "Uuml": "Ü", "Yacute": "Y", "aacute": "á", "acirc": "â", "acute": "´", "aelig": "æ", "agrave": "à", "amp": "&", "apos": "'", "aring": "å", "atilde": "ã", "auml": "ä", "brkbar": "¦", "brvbar": "¦", "ccedil": "ç", "cedil": "¸", "cent": "¢", "copy": "©", "curren": "¤", "deg": "°", "die": "¨", "divide": "÷", "eacute": "é", "ecirc": "ê", "egrave": "è", "eth": "ð", "euml": "ë", "euro": "€", "frac12": "½", "frac14": "¼", "frac34": "¾", "gt": ">", "hearts": "♥", "hellip": "…", "iacute": "í", "icirc": "î", "iexcl": "¡", "igrave": "ì", "iquest": "¿", "iuml": "ï", "laquo": "«", "ldquo": "“", "lsquo": "‘", "lt": "<", "macr": "¯", "mdash": "—", "micro": "µ", "middot": "·", "ndash": "–", "not": "¬", "ntilde": "ñ", "oacute": "ó", "ocirc": "ô", "ograve": "ò", "ordf": "ª", "ordm": "º", "oslash": "ø", "otilde": "õ", "ouml": "ö", "para": "¶", "pi": "π", "plusmn": "±", "pound": "£", "quot": "\"", "raquo": "»", "rdquo": "”", "reg": "®", "rsquo": "’", "sect": "§", "shy": stringWithValue(173), "sup1": "¹", "sup2": "²", "sup3": "³", "szlig": "ß", "thorn": "þ", "times": "×", "trade": "™", "uacute": "ú", "ucirc": "û", "ugrave": "ù", "uml": "¨", "uuml": "ü", "yacute": "y", "yen": "¥", "yuml": "ÿ", "infin": "∞", "nbsp": stringWithValue(160) ]