Rewrite HTMLEntityDecoder so that it’s fast.

This commit is contained in:
Brent Simmons 2024-09-28 22:14:51 -07:00
parent be7dd28b6f
commit ed160986f5
3 changed files with 336 additions and 172 deletions

View File

@ -4,6 +4,19 @@
<dict>
<key>classNames</key>
<dict>
<key>EntityDecodingTests</key>
<dict>
<key>testPerformance()</key>
<dict>
<key>com.apple.XCTPerformanceMetric_WallClockTime</key>
<dict>
<key>baselineAverage</key>
<real>0.002095</real>
<key>baselineIntegrationDisplayName</key>
<string>Local Baseline</string>
</dict>
</dict>
</dict>
<key>RSSParserTests</key>
<dict>
<key>testEMarleyPerformance()</key>

View File

@ -2,206 +2,295 @@
// HTMLEntityDecoder.swift
//
//
// Created by Brent Simmons on 9/14/24.
// Created by Brent Simmons on 9/26/24.
//
import Foundation
public final class HTMLEntityDecoder {
public static func decodedString(_ encodedString: String) -> String {
public static func decodedString(_ encodedString: String) -> String? {
let scanner = EntityScanner(string: encodedString)
var result = ""
var didDecodeAtLeastOneEntity = false
while true {
let scannedString = scanner.scanUpToAmpersand()
if !scannedString.isEmpty {
result.append(scannedString)
}
if scanner.isAtEnd {
break
// If `withContiguousStorageIfAvailable` works, then we can avoid copying memory.
var result: String? = encodedString.utf8.withContiguousStorageIfAvailable { buffer in
return decodedEntities(buffer, &didDecodeAtLeastOneEntity)
}
let savedScanLocation = scanner.scanLocation
if let decodedEntity = scanner.scanEntityValue() {
result.append(decodedEntity)
didDecodeAtLeastOneEntity = true
}
else {
result.append("&")
scanner.scanLocation = savedScanLocation + 1
}
if scanner.isAtEnd {
break
if result == nil {
let d = Data(encodedString.utf8)
result = d.withUnsafeBytes { bytes in
let buffer = bytes.bindMemory(to: UInt8.self)
return decodedEntities(buffer, &didDecodeAtLeastOneEntity)
}
}
if !didDecodeAtLeastOneEntity { // No entities decoded?
return encodedString
}
if let result {
if didDecodeAtLeastOneEntity {
return result
}
return encodedString
}
/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
final class EntityScanner {
let string: String
let count: Int
var scanLocation = 0
var isAtEnd: Bool {
scanLocation >= count
}
var currentCharacter: Character? {
guard !isAtEnd else {
assertionFailure("Expected result but got nil.")
return nil
}
return string.characterAtIntIndex(scanLocation)
}
init(string: String) {
self.string = string
self.count = string.count
// ASCII byte values used when scanning the UTF-8 source bytes for entities.
// `UInt8(ascii:)` produces the byte directly from the literal, so no `Character`
// is built and nothing needs force-unwrapping.

// Entity punctuation: `&` starts an entity, `#` marks a numeric one,
// `x`/`X` marks hexadecimal, `;` ends it.
private let ampersandCharacter = UInt8(ascii: "&")
private let numberSignCharacter = UInt8(ascii: "#")
private let xCharacter = UInt8(ascii: "x")
private let XCharacter = UInt8(ascii: "X")
private let semicolonCharacter = UInt8(ascii: ";")

// Bounds of the decimal digit range.
private let zeroCharacter = UInt8(ascii: "0")
private let nineCharacter = UInt8(ascii: "9")

// Bounds of the lowercase hex-digit (a-f) and lowercase letter (a-z) ranges.
private let aCharacter = UInt8(ascii: "a")
private let fCharacter = UInt8(ascii: "f")
private let zCharacter = UInt8(ascii: "z")

// Bounds of the uppercase hex-digit (A-F) and uppercase letter (A-Z) ranges.
private let ACharacter = UInt8(ascii: "A")
private let FCharacter = UInt8(ascii: "F")
private let ZCharacter = UInt8(ascii: "Z")

// Highest Unicode code point; numeric entities that exceed it are rejected.
private let maxUnicodeNumber = 0x10FFFF
private func decodedEntities(_ sourceBuffer: UnsafeBufferPointer<UInt8>, _ didDecodeAtLeastOneEntity: inout Bool) -> String {
let byteCount = sourceBuffer.count
let resultBufferByteCount = byteCount + 1
// Allocate a destination buffer for the result string. It can be the same size
// as the source string buffer, since decoding HTML entities will only make it smaller.
// Same size plus 1, that is, for null-termination.
let resultBuffer = UnsafeMutableRawPointer.allocate(byteCount: resultBufferByteCount, alignment: MemoryLayout<UInt8>.alignment)
defer {
resultBuffer.deallocate()
}
static let ampersandCharacter = Character("&")
resultBuffer.initializeMemory(as: UInt8.self, repeating: 0, count: resultBufferByteCount)
let result = resultBuffer.assumingMemoryBound(to: UInt8.self)
/// Scans up to `characterToFind` and returns the characters up to (and not including) `characterToFind`.
/// - Returns: the scanned portion before `characterToFind`. May be empty string.
func scanUpToAmpersand() -> String {
var sourceLocation = 0
var resultLocation = 0
let characterToFind = Self.ampersandCharacter
var scanned = ""
while sourceLocation < byteCount {
while true {
let ch = sourceBuffer[sourceLocation]
guard let ch = currentCharacter else {
break
var decodedEntity: String? = nil
if ch == ampersandCharacter {
decodedEntity = decodedEntityValue(sourceBuffer, byteCount, &sourceLocation)
}
scanLocation += 1
if ch == characterToFind {
break
if let decodedEntity {
addDecodedEntity(decodedEntity, result, byteCount, &resultLocation)
didDecodeAtLeastOneEntity = true
sourceLocation += 1
continue
}
result[resultLocation] = ch
resultLocation += 1
sourceLocation += 1
}
let cString = resultBuffer.assumingMemoryBound(to: CChar.self)
return String(cString: cString)
}
/// Writes the UTF-8 bytes of `decodedEntity` into `result`, beginning at `resultLocation`.
///
/// - Parameters:
///   - decodedEntity: The replacement text for an entity.
///   - result: The destination byte buffer.
///   - resultByteCount: Upper bound on the write position; exceeding it traps.
///   - resultLocation: The write position. Advanced by the number of bytes written.
private func addDecodedEntity(_ decodedEntity: String, _ result: UnsafeMutablePointer<UInt8>, _ resultByteCount: Int, _ resultLocation: inout Int) {
    let encodedBytes = Array(decodedEntity.utf8)
    let endLocation = resultLocation + encodedBytes.count

    // Trap rather than write past the bound supplied by the caller.
    precondition(endLocation <= resultByteCount)

    for (offset, byte) in encodedBytes.enumerated() {
        result[resultLocation + offset] = byte
    }
    resultLocation = endLocation
}
/// Attempts to decode the entity that begins at the `&` byte at `sourceLocation`.
///
/// - Parameters:
///   - buffer: The UTF-8 source bytes being scanned.
///   - byteCount: The number of bytes in `buffer`.
///   - sourceLocation: On entry, the index of the `&`. On success it is left where
///     `rawEntityValue` stopped (the terminating `;`). On any failure it is restored
///     to its entry value.
/// - Returns: The decoded replacement text, or `nil` if the bytes after the `&`
///   are not a decodable entity.
private func decodedEntityValue(_ buffer: UnsafeBufferPointer<UInt8>, _ byteCount: Int, _ sourceLocation: inout Int) -> String? {
    let savedSourceLocation = sourceLocation

    guard let rawEntity = rawEntityValue(buffer, byteCount, &sourceLocation),
          let decodedEntity = decodedRawEntityValue(rawEntity) else {
        // The scan may have moved `sourceLocation` forward before failing (on an
        // invalid character, or on a well-formed but unknown entity). Rewind so the
        // caller emits the `&` literally and resumes at the next byte; otherwise the
        // bytes that were scanned would be dropped from the output.
        sourceLocation = savedSourceLocation
        return nil
    }

    return decodedEntity
}
/// Converts a raw entity (the bytes between `&` and `;`) to its replacement text.
///
/// Named entities are looked up in `entitiesDictionary`; anything not found there
/// is tried as a numeric entity when it starts with `#`.
///
/// - Parameter rawEntity: Null-terminated entity bytes, without the `&` and `;`.
/// - Returns: The replacement text, or `nil` if the entity is not recognized.
private func decodedRawEntityValue(_ rawEntity: ContiguousArray<UInt8>) -> String? {
    let entityName = String(cString: Array(rawEntity))

    if let namedValue = entitiesDictionary[entityName] {
        return namedValue
    }

    // Only entities starting with `#` can be numeric.
    guard rawEntity[0] == numberSignCharacter else {
        return nil
    }
    return decodedNumericEntity(rawEntity)
}
private func decodedNumericEntity(_ rawEntity: ContiguousArray<UInt8>) -> String? {
assert(rawEntity[0] == numberSignCharacter)
var decodedNumber: UInt32?
if rawEntity[1] == xCharacter || rawEntity[1] == XCharacter { // Hex?
decodedNumber = decodedHexEntity(rawEntity)
}
else {
scanned.append(ch)
decodedNumber = decodedDecimalEntity(rawEntity)
}
if let decodedNumber {
return stringWithValue(decodedNumber)
}
return nil
}
private func decodedHexEntity(_ rawEntity: ContiguousArray<UInt8>) -> UInt32? {
assert(rawEntity[0] == numberSignCharacter)
assert(rawEntity[1] == xCharacter || rawEntity[1] == XCharacter)
var number: UInt32 = 0
var i = 0
for byte in rawEntity {
if i < 2 { // Skip first two characters: #x or #X
i += 1
continue
}
if byte == 0 { // rawEntity is null-terminated
break
}
var digit: UInt32?
switch byte {
case zeroCharacter...nineCharacter: // 0-9
digit = UInt32(byte - zeroCharacter)
case aCharacter...fCharacter: // a-f
digit = UInt32((byte - aCharacter) + 10)
case ACharacter...FCharacter: // A-F
digit = UInt32((byte - ACharacter) + 10)
default:
return nil
}
guard let digit else {
return nil // Shouldn’t get here — handled by default case — but we need to bind digit
}
number = (number * 16) + digit
if number > maxUnicodeNumber {
return nil
}
}
return scanned
if number == 0 {
return nil
}
static let semicolonCharacter = Character(";")
return number
}
func scanEntityValue() -> String? {
private func decodedDecimalEntity(_ rawEntity: ContiguousArray<UInt8>) -> UInt32? {
let initialScanLocation = scanLocation
let maxEntityLength = 20 // It’s probably smaller, but this is just for sanity.
assert(rawEntity[0] == numberSignCharacter)
assert(rawEntity[1] != xCharacter && rawEntity[1] != XCharacter) // not hex
var number: UInt32 = 0
var isFirstCharacter = true
// Convert, for instance, [51, 57] to 39
for byte in rawEntity {
if isFirstCharacter { // first character is #
isFirstCharacter = false
continue
}
if byte == 0 { // rawEntity is null-terminated
break
}
// Be sure it’s a digit 0-9
if byte < zeroCharacter || byte > nineCharacter {
return nil
}
let digit = UInt32(byte - zeroCharacter)
number = (number * 10) + digit
if number > maxUnicodeNumber {
return nil
}
}
if number == 0 {
return nil
}
return number
}
private func rawEntityValue(_ buffer: UnsafeBufferPointer<UInt8>, _ byteCount: Int, _ sourceLocation: inout Int) -> ContiguousArray<UInt8>? {
// sourceLocation points to the & character.
let savedSourceLocation = sourceLocation
let maxEntityCharacters = 36 // Longest current entity is &CounterClockwiseContourIntegral;
var entityCharacters: ContiguousArray<UInt8> = [0, 0, 0, 0, 0,
0, 0, 0, 0, 0,
0, 0, 0, 0, 0,
0, 0, 0, 0, 0, // 20 characters
0, 0, 0, 0, 0,
0, 0, 0, 0, 0,
0, 0, 0, 0, 0, // 35 characters
0] // nil-terminated last character
var entityCharactersIndex = 0
while true {
guard let ch = currentCharacter else {
break
}
if CharacterSet.whitespacesAndNewlines.contains(ch.unicodeScalars.first!) {
break
}
if ch == Self.semicolonCharacter {
let entityRange = initialScanLocation..<scanLocation
guard let entity = string.substring(intRange: entityRange), let decodedEntity = decodedEntity(entity) else {
assertionFailure("Unexpected failure scanning entity in scanEntityValue.")
scanLocation = initialScanLocation + 1
return nil
}
scanLocation = scanLocation + 1
return decodedEntity
}
scanLocation += 1
if scanLocation - initialScanLocation > maxEntityLength {
break
}
if isAtEnd {
break
}
}
return nil
}
}
extension String {
func indexForInt(_ i: Int) -> Index? {
index(startIndex, offsetBy: i, limitedBy: endIndex)
}
func characterAtIntIndex(_ i: Int) -> Character? {
guard let index = indexForInt(i) else {
sourceLocation += 1
if sourceLocation >= byteCount || entityCharactersIndex >= maxEntityCharacters { // did not parse entity
sourceLocation = savedSourceLocation
return nil
}
return self[index]
let ch = buffer[sourceLocation]
if ch == semicolonCharacter { // End of entity?
return entityCharacters
}
func substring(intRange: Range<Int>) -> String? {
guard let rangeLower = indexForInt(intRange.lowerBound) else {
// Make sure character is in 0-9, A-Z, a-z, #
if ch < zeroCharacter && ch != numberSignCharacter {
return nil
}
guard let rangeUpper = indexForInt(intRange.upperBound) else {
if ch > nineCharacter && ch < ACharacter {
return nil
}
if ch > ZCharacter && ch < aCharacter {
return nil
}
if ch > zCharacter {
return nil
}
return String(self[rangeLower..<rangeUpper])
}
}
entityCharacters[entityCharactersIndex] = ch
/// rawEntity may or may not have leading `&` and/or trailing `;` characters.
private func decodedEntity(_ rawEntity: String) -> String? {
var s = rawEntity
if s.hasPrefix("&") {
s.removeFirst()
entityCharactersIndex += 1
}
if s.hasSuffix(";") {
s.removeLast()
}
if let decodedEntity = entitiesDictionary[s] {
return decodedEntity
}
if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex
let scanner = Scanner(string: s)
scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX")
var hexValue: UInt64 = 0
if scanner.scanHexInt64(&hexValue) {
return stringWithValue(UInt32(hexValue))
}
return nil
}
else if s.hasPrefix("#") {
s.removeFirst()
guard let value = UInt32(s), value >= 1 else {
return nil
}
return stringWithValue(value)
}
return nil
}
private func stringWithValue(_ value: UInt32) -> String? {
@ -216,7 +305,7 @@ private func stringWithValue(_ value: UInt32) -> String? {
var modifiedValue = value
if (modifiedValue & ~0x1F) == 0x80 { // value >= 128 && value < 160
if value >= 128 && value < 160 {
modifiedValue = windowsLatin1ExtensionArray[Int(modifiedValue - 0x80)]
}

View File

@ -16,7 +16,7 @@ final class EntityDecodingTests: XCTestCase {
// Bug found by Manton Reece — the &#39; entity was not getting decoded by NetNewsWire in JSON Feeds from micro.blog.
let s = "These are the times that try men&#39;s souls."
let decoded = HTMLEntityDecoder.decodedString(s)
let decoded = decodedString(s)
XCTAssertEqual(decoded, "These are the times that try men's souls.")
}
@ -24,7 +24,7 @@ final class EntityDecodingTests: XCTestCase {
func testEntityAtBeginning() {
let s = "&#39;leading single quote"
let decoded = HTMLEntityDecoder.decodedString(s)
let decoded = decodedString(s)
XCTAssertEqual(decoded, "'leading single quote")
}
@ -32,7 +32,7 @@ final class EntityDecodingTests: XCTestCase {
func testEntityAtEnd() {
let s = "trailing single quote&#39;"
let decoded = HTMLEntityDecoder.decodedString(s)
let decoded = decodedString(s)
XCTAssertEqual(decoded, "trailing single quote'")
}
@ -40,7 +40,7 @@ final class EntityDecodingTests: XCTestCase {
func testEntityInMiddle() {
let s = "entity &ccedil; in middle"
let decoded = HTMLEntityDecoder.decodedString(s)
let decoded = decodedString(s)
XCTAssertEqual(decoded, "entity ç in middle")
}
@ -48,43 +48,100 @@ final class EntityDecodingTests: XCTestCase {
func testMultipleEntitiesInARow() {
let s = "&ccedil;&egrave;mult&#8230;&#x2026;iple &#39;&aelig;&quot;entities&divide;&hearts;"
let decoded = HTMLEntityDecoder.decodedString(s)
let decoded = decodedString(s)
XCTAssertEqual(decoded, "çèmult……iple 'æ\"entities÷♥")
}
/// Text that only looks like entities (stray `&`, `#`, `;`, missing names or
/// terminators) must come back exactly as it went in.
func testFakeoutEntities() {
    let fakeouts = [
        "&&;&#;&#x;&#X;& ;&# \t;&\r&&&&&;",
        "#;&#x;&#X;& &#123",
        " &lsquo ",
        "&&&&&&&&&&&&&&&&&&&;;;;;;&;&;&##;#X::&;&;&;&"
    ]

    for fakeout in fakeouts {
        XCTAssertEqual(decodedString(fakeout), fakeout)
    }
}
/// Well-formed but nonexistent entities (named, decimal-style, and hex-style)
/// must be left untouched.
func testFakeSquirrelEntities() {
    let squirrels = [
        "&squirrel;",
        "&squirrel;&#squirrel;",
        "&squirrel;&#squirrel;&#xsquirrel;&#Xsquirrel;",
        "&#39squirrel;",
        "&squirrel;&#squirrel;&#xsquirrel;&#Xsquirrel;&#39squirrel;",
        "&squirrel;&#squirrel;&#xsquirrel;&#Xsquirrel;&#39squirrel;&&;;;;&;&;&#squi#;#rrelX::&;&;&;&"
    ]

    for squirrel in squirrels {
        XCTAssertEqual(decodedString(squirrel), squirrel)
    }
}
/// Over-long names and numeric values that are too large to be code points
/// must be returned unchanged.
func testLongFakeoutEntities() {
    let oversized = [
        "&thisIsALongNotRealEntityThatShouldBeHandledPerfectlyWellByTheParserBasicallyIgnored;",
        "&#89437652094387502948360194365209348650293486752093487652093486752;",
        "&#89437652094387502948360194365;",
        "&#894376520943875029483601943651;",
        "&#1114112;", // one past 0x10FFFF, decimal
        "&#x110000;" // one past 0x10FFFF, hex
    ]

    for candidate in oversized {
        XCTAssertEqual(decodedString(candidate), candidate)
    }
}
func testOnlyEntity() {
var s = "&#8230;"
var decoded = HTMLEntityDecoder.decodedString(s)
var decoded = decodedString(s)
XCTAssertEqual(decoded, "")
s = "&#x2026;"
decoded = HTMLEntityDecoder.decodedString(s)
decoded = decodedString(s)
XCTAssertEqual(decoded, "")
s = "&#039;"
decoded = HTMLEntityDecoder.decodedString(s)
decoded = decodedString(s)
XCTAssertEqual(decoded, "'")
s = "&#167;"
decoded = HTMLEntityDecoder.decodedString(s)
decoded = decodedString(s)
XCTAssertEqual(decoded, "§")
s = "&#XA3;"
decoded = HTMLEntityDecoder.decodedString(s)
decoded = decodedString(s)
XCTAssertEqual(decoded, "£")
}
func testPerformance() {
// 0.009 sec on my 2012 iMac.
// 0.003 sec on my M1 Mac Studio.
let s = stringForResource("DaringFireball", "html")
self.measure {
_ = HTMLEntityDecoder.decodedString(s)
}
}
self.measure {
_ = decodedString(s)
}
}
}
func stringForResource(_ filename: String, _ fileExtension: String) -> String {
@ -93,3 +150,8 @@ func stringForResource(_ filename: String, _ fileExtension: String) -> String {
let path = Bundle.module.path(forResource: filename, ofType: fileExtension)!
return try! String(contentsOfFile: path)
}
/// Test convenience: runs `HTMLEntityDecoder.decodedString` on `s` and
/// force-unwraps the optional result, so a `nil` result traps.
func decodedString(_ s: String) -> String {
    let decoded: String? = HTMLEntityDecoder.decodedString(s)
    return decoded!
}