2024-09-15 00:40:27 +02:00
|
|
|
|
//
|
|
|
|
|
// HTMLEntityDecoder.swift
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// Created by Brent Simmons on 9/14/24.
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
import Foundation
|
|
|
|
|
|
|
|
|
|
public final class HTMLEntityDecoder {

	/// The character that introduces an HTML entity.
	static let ampersandCharacter = Character("&")

	/// Returns `encodedString` with every decodable HTML entity replaced by its value.
	///
	/// Text outside entities is copied through unchanged. An `&` that does not begin a
	/// decodable entity is kept as a literal `&`, and the text that follows it is
	/// scanned as ordinary text.
	///
	/// - Parameter encodedString: Text that may contain entities such as `&amp;`.
	/// - Returns: The decoded text, or `encodedString` itself when no entity was decoded.
	public static func decodedString(_ encodedString: String) -> String {

		let scanner = EntityScanner(string: encodedString)
		var result = ""
		var didDecodeAtLeastOneEntity = false

		while !scanner.isAtEnd {

			// Copy the plain text in front of the next ampersand. Note that
			// `scanUpTo(_:)` also consumes the ampersand it stops at.
			result.append(scanner.scanUpTo(Self.ampersandCharacter))

			if scanner.isAtEnd {
				// Reaching the end here means either no ampersand was left, or the
				// input ends with a bare "&" that the scanner just consumed. In the
				// latter case the "&" is literal text and must not be dropped.
				if encodedString.last == Self.ampersandCharacter {
					result.append(Self.ampersandCharacter)
				}
				break
			}

			// The scanner now sits on the first character after the ampersand.
			let locationAfterAmpersand = scanner.scanLocation

			if let decodedEntity = scanner.scanEntityValue() {
				result.append(decodedEntity)
				didDecodeAtLeastOneEntity = true
			} else {
				// Not an entity: emit the ampersand literally and resume right after
				// it, so that none of the following characters are skipped.
				result.append(Self.ampersandCharacter)
				scanner.scanLocation = locationAfterAmpersand
			}
		}

		if !didDecodeAtLeastOneEntity { // No entities decoded?
			return encodedString
		}
		return result
	}
}
|
2024-09-15 23:26:01 +02:00
|
|
|
|
|
|
|
|
|
/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
///
/// Every position is an offset counted in `Character`s from the start of `string`.
final class EntityScanner {

	/// The text being scanned.
	let string: String

	/// Number of `Character`s in `string`.
	let count: Int

	/// Offset of the next character to be scanned.
	var scanLocation = 0

	/// The character that terminates an entity.
	static let semicolonCharacter = Character(";")

	/// Upper bound on how many characters are examined while looking for the
	/// terminating `;`. It’s probably smaller, but this is just for sanity.
	private static let maxEntityLength = 20

	/// Characters that cannot appear inside an entity; meeting one ends the attempt.
	private static let whitespaceAndNewlines = CharacterSet.whitespacesAndNewlines

	/// `string` split into characters once, so that reading the character at
	/// `scanLocation` is constant-time. Offsetting a `String.Index` from the start on
	/// every read would make scanning quadratic in the length of the input.
	private let characters: [Character]

	/// `true` once `scanLocation` has reached or passed the end of `string`.
	var isAtEnd: Bool {
		scanLocation >= count
	}

	/// The character at `scanLocation`, or `nil` when `scanLocation` is outside the string.
	var currentCharacter: Character? {
		characters.indices.contains(scanLocation) ? characters[scanLocation] : nil
	}

	/// Creates a scanner positioned at the start of `string`.
	init(string: String) {
		let characters = Array(string)
		self.string = string
		self.characters = characters
		self.count = characters.count
	}

	/// Scans forward until `characterToFind` or the end of the string is reached.
	///
	/// When `characterToFind` is found it is consumed as well: afterwards `scanLocation`
	/// points at the character that follows it.
	///
	/// - Returns: the scanned portion before `characterToFind`. May be empty string.
	func scanUpTo(_ characterToFind: Character) -> String {

		var scanned = ""

		while let ch = currentCharacter {
			scanLocation += 1
			if ch == characterToFind {
				break
			}
			scanned.append(ch)
		}

		return scanned
	}

	/// Tries to read an entity whose name starts at `scanLocation` (that is, just after
	/// the `&`) and runs up to a terminating `;`.
	///
	/// - Returns: The decoded value, with `scanLocation` moved past the `;`. Returns
	///   `nil` — with `scanLocation` restored to where it was on entry — when
	///   whitespace, the end of the string, or the length limit is reached before a
	///   `;`, or when the text before the `;` cannot be decoded.
	func scanEntityValue() -> String? {

		let initialScanLocation = scanLocation

		while let ch = currentCharacter {

			if let firstScalar = ch.unicodeScalars.first, Self.whitespaceAndNewlines.contains(firstScalar) {
				break
			}

			if ch == Self.semicolonCharacter {
				let entity = String(characters[initialScanLocation..<scanLocation])
				// An undecodable name is ordinary input data rather than a
				// programmer error, so it is reported as a plain failure.
				guard let value = decodedEntity(entity) else {
					break
				}
				scanLocation += 1 // Continue after the semicolon.
				return value
			}

			scanLocation += 1
			if scanLocation - initialScanLocation > Self.maxEntityLength {
				break
			}
		}

		// Nothing was decoded: leave the scanner where it started.
		scanLocation = initialScanLocation
		return nil
	}
}
|
2024-09-16 06:51:48 +02:00
|
|
|
|
|
|
|
|
|
extension String {

	/// Converts an offset counted in `Character`s into a `String.Index`.
	///
	/// - Parameter i: Number of characters from the start of the string.
	/// - Returns: The index at that offset (`endIndex` when `i == count`), or `nil` when
	///   `i` is negative or greater than `count`.
	func indexForInt(_ i: Int) -> Index? {

		// A negative offset walks backwards from `startIndex`; `limitedBy: endIndex`
		// gives no protection in that direction, so reject it up front.
		guard i >= 0 else {
			return nil
		}
		return index(startIndex, offsetBy: i, limitedBy: endIndex)
	}

	/// Returns the character at the given `Character` offset.
	///
	/// - Parameter i: Number of characters from the start of the string.
	/// - Returns: The character, or `nil` when `i` is not the offset of a character
	///   (negative, or greater than or equal to `count`).
	func characterAtIntIndex(_ i: Int) -> Character? {

		// `indexForInt` yields `endIndex` for `i == count`, which is a valid position
		// but must not be subscripted.
		guard let index = indexForInt(i), index < endIndex else {
			return nil
		}
		return self[index]
	}

	/// Returns the substring covered by a range of `Character` offsets.
	///
	/// - Parameter intRange: Half-open range of offsets from the start of the string.
	/// - Returns: The substring as a new `String`, or `nil` when either bound lies
	///   outside `0...count`.
	func substring(intRange: Range<Int>) -> String? {

		guard let rangeLower = indexForInt(intRange.lowerBound) else {
			return nil
		}
		// Continue from the lower bound instead of walking from the start again.
		guard let rangeUpper = index(rangeLower, offsetBy: intRange.count, limitedBy: endIndex) else {
			return nil
		}
		return String(self[rangeLower..<rangeUpper])
	}
}
|
|
|
|
|
|
|
|
|
|
/// Returns the decoded value for the text of an entity.
///
/// The caller in this file passes the characters found between the `&` and the `;`.
/// At present no entity is recognized: the result is `nil` for every input.
/// NOTE(review): presumably a placeholder for the real lookup — confirm.
///
/// - Parameter rawEntity: The entity text to decode.
/// - Returns: Always `nil`.
private func decodedEntity(_ rawEntity: String) -> String? {
	nil
}
|