NetNewsWire/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift

173 lines
3.4 KiB
Swift
Raw Normal View History

2024-09-15 00:40:27 +02:00
//
// HTMLEntityDecoder.swift
//
//
// Created by Brent Simmons on 9/14/24.
//
import Foundation
/// Replaces HTML entities in a string with the text they stand for.
///
/// Which entities are recognized is decided by `EntityScanner.scanEntityValue()`;
/// this type only drives the scan and assembles the output.
public final class HTMLEntityDecoder {

    static let ampersandCharacter = Character("&")

    /// Returns `encodedString` with every entity the scanner can decode replaced by its decoded text.
    ///
    /// An ampersand that does not start a decodable entity is copied through literally,
    /// along with all of the text that follows it.
    ///
    /// - Parameter encodedString: Text that may contain HTML entities.
    /// - Returns: The decoded text, or `encodedString` itself when no entity was decoded.
    public static func decodedString(_ encodedString: String) -> String {

        // Fast path: with no ampersand there is nothing to decode.
        guard encodedString.contains(ampersandCharacter) else {
            return encodedString
        }

        let scanner = EntityScanner(string: encodedString)
        var result = ""
        var didDecodeAtLeastOneEntity = false

        while true {
            // `scanUpTo` returns the text before the next "&" and leaves the scanner
            // positioned just *after* that "&" (the "&" itself is consumed).
            let scannedString = scanner.scanUpTo(Self.ampersandCharacter)
            if !scannedString.isEmpty {
                result.append(scannedString)
            }

            if scanner.isAtEnd {
                // If the input ends with "&", the scanner consumed it without returning it.
                // Emit it here so it is not dropped from the output.
                if encodedString.last == Self.ampersandCharacter {
                    result.append(Self.ampersandCharacter)
                }
                break
            }

            // Position of the first character after the "&".
            let savedScanLocation = scanner.scanLocation

            if let decodedEntity = scanner.scanEntityValue() {
                result.append(decodedEntity)
                didDecodeAtLeastOneEntity = true
            } else {
                // Not an entity: keep the "&" and resume with the character right after it.
                // (Resuming at `savedScanLocation + 1` would skip that character.)
                result.append(Self.ampersandCharacter)
                scanner.scanLocation = savedScanLocation
            }

            if scanner.isAtEnd {
                break
            }
        }

        // When nothing was decoded, hand back the caller's string untouched.
        return didDecodeAtLeastOneEntity ? result : encodedString
    }
}
2024-09-15 23:26:01 +02:00
/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
///
/// Positions are `Character` offsets into `string`. `scanLocation` may be read and
/// assigned by callers to save and restore the scan position.
final class EntityScanner {

    static let semicolonCharacter = Character(";")

    /// Upper bound on how many characters are examined while looking for the closing ";".
    /// It's probably smaller in practice, but this is just for sanity.
    private static let maxEntityLength = 20

    let string: String
    let count: Int
    var scanLocation = 0

    /// The characters of `string`, materialized once so that a lookup by offset is an
    /// array subscript instead of a walk from the start of the string.
    private let characters: [Character]

    /// True when `scanLocation` is at or beyond the end of the input.
    var isAtEnd: Bool {
        scanLocation >= count
    }

    /// The character at `scanLocation`, or `nil` when the location is outside the input.
    var currentCharacter: Character? {
        guard scanLocation >= 0, !isAtEnd else {
            return nil
        }
        return characters[scanLocation]
    }

    init(string: String) {
        let characters = Array(string)
        self.string = string
        self.characters = characters
        self.count = characters.count
    }

    /// Scans forward from `scanLocation` until `characterToFind` or the end of the input.
    ///
    /// When `characterToFind` is found it is consumed: `scanLocation` ends up just past it,
    /// but the character is not part of the returned string.
    ///
    /// - Returns: The characters scanned before `characterToFind`. May be an empty string.
    func scanUpTo(_ characterToFind: Character) -> String {
        var scanned = ""
        while let ch = currentCharacter {
            scanLocation += 1
            if ch == characterToFind {
                break
            }
            scanned.append(ch)
        }
        return scanned
    }

    /// Tries to read an entity body terminated by ";" starting at `scanLocation`.
    ///
    /// The caller is expected to have already consumed the leading "&". The scan gives up
    /// at whitespace, at the end of the input, or after `maxEntityLength` characters.
    ///
    /// - Returns: The decoded entity, with `scanLocation` moved just past the ";".
    ///   Returns `nil` when no decodable entity is found; `scanLocation` is then restored
    ///   to the value it had on entry, so no input is consumed.
    func scanEntityValue() -> String? {
        let initialScanLocation = scanLocation

        while let ch = currentCharacter {
            // Whitespace cannot appear inside an entity.
            if let firstScalar = ch.unicodeScalars.first,
               CharacterSet.whitespacesAndNewlines.contains(firstScalar) {
                break
            }

            if ch == Self.semicolonCharacter {
                let entity = String(characters[initialScanLocation..<scanLocation])
                // An entity that cannot be decoded is ordinary input rather than a
                // programmer error, so it is reported as a plain failure.
                guard let decoded = decodedEntity(entity) else {
                    break
                }
                // Step past the ";" so scanning resumes after the whole entity.
                scanLocation += 1
                return decoded
            }

            scanLocation += 1
            if scanLocation - initialScanLocation > Self.maxEntityLength {
                break
            }
        }

        scanLocation = initialScanLocation
        return nil
    }
}
/// Helpers for addressing a `String` by `Character` offset instead of `String.Index`.
extension String {

    /// Converts a zero-based `Character` offset into a `String.Index`.
    ///
    /// - Parameter i: Offset from the start of the string.
    /// - Returns: The index at that offset (`endIndex` when `i == count`), or `nil` when
    ///   `i` is negative or greater than `count`.
    func indexForInt(_ i: Int) -> Index? {
        // `limitedBy: endIndex` only bounds forward movement; a negative offset would
        // move before `startIndex`, so reject it up front.
        guard i >= 0 else {
            return nil
        }
        return index(startIndex, offsetBy: i, limitedBy: endIndex)
    }

    /// Returns the character at offset `i`, or `nil` when `i` is out of bounds.
    func characterAtIntIndex(_ i: Int) -> Character? {
        // `endIndex` is a valid position but not a valid subscript.
        guard let index = indexForInt(i), index < endIndex else {
            return nil
        }
        return self[index]
    }

    /// Returns the characters in the given offset range.
    ///
    /// - Returns: The substring as a new `String`, or `nil` when either bound is negative
    ///   or greater than `count`.
    func substring(intRange: Range<Int>) -> String? {
        guard let rangeLower = indexForInt(intRange.lowerBound),
              let rangeUpper = indexForInt(intRange.upperBound) else {
            return nil
        }
        return String(self[rangeLower..<rangeUpper])
    }
}
/// Maps the body of an entity (the text between "&" and ";") to its decoded string.
///
/// Placeholder: no entity is recognized yet, so every input yields `nil`.
///
/// - Parameter rawEntity: The entity body, without the leading "&" or trailing ";".
/// - Returns: Always `nil` at present.
private func decodedEntity(_ rawEntity: String) -> String? {
    nil
}