Brent Simmons 1e90237e7e Fix bug decoding ' entities.
1. Fix typo in the entities dictionary — add the actual ' character.
2. Add EntityDecodingTests.swift and a test for this.
2017-12-30 10:24:44 -08:00

433 lines
9.6 KiB
Objective-C
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// NSString+RSParser.m
// RSParser
//
// Created by Brent Simmons on 9/25/15.
// Copyright © 2015 Ranchero Software, LLC. All rights reserved.
//
#import <CommonCrypto/CommonDigest.h>
#import <RSParser/NSString+RSParser.h>
@interface NSScanner (RSParser)
- (BOOL)rs_scanEntityValue:(NSString * _Nullable * _Nullable)decodedEntity;
@end
@implementation NSString (RSParser)
- (BOOL)rsparser_contains:(NSString *)s {
return [self rangeOfString:s].location != NSNotFound;
}
- (NSString *)rsparser_stringByDecodingHTMLEntities {
@autoreleasepool {
NSScanner *scanner = [[NSScanner alloc] initWithString:self];
scanner.charactersToBeSkipped = nil;
NSMutableString *result = [[NSMutableString alloc] init];
while (true) {
NSString *scannedString = nil;
if ([scanner scanUpToString:@"&" intoString:&scannedString]) {
[result appendString:scannedString];
}
if (scanner.isAtEnd) {
break;
}
NSUInteger savedScanLocation = scanner.scanLocation;
NSString *decodedEntity = nil;
if ([scanner rs_scanEntityValue:&decodedEntity]) {
[result appendString:decodedEntity];
}
else {
[result appendString:@"&"];
scanner.scanLocation = savedScanLocation + 1;
}
if (scanner.isAtEnd) {
break;
}
}
if ([self isEqualToString:result]) {
return self;
}
return [result copy];
}
}
static NSDictionary *RSEntitiesDictionary(void);
static NSString *RSParserStringWithValue(unichar value);
- (NSString * _Nullable)rs_stringByDecodingEntity {
// self may or may not have outer & and ; characters.
NSMutableString *s = [self mutableCopy];
if ([s hasPrefix:@"&"]) {
[s deleteCharactersInRange:NSMakeRange(0, 1)];
}
if ([s hasSuffix:@";"]) {
[s deleteCharactersInRange:NSMakeRange(s.length - 1, 1)];
}
NSDictionary *entitiesDictionary = RSEntitiesDictionary();
NSString *decodedEntity = entitiesDictionary[self];
if (decodedEntity) {
return decodedEntity;
}
if ([s hasPrefix:@"#x"]) { // Hex
NSScanner *scanner = [[NSScanner alloc] initWithString:s];
scanner.charactersToBeSkipped = [NSCharacterSet characterSetWithCharactersInString:@"#x"];
unsigned int hexValue = 0;
if ([scanner scanHexInt:&hexValue]) {
return RSParserStringWithValue((unichar)hexValue);
}
return nil;
}
else if ([s hasPrefix:@"#"]) {
[s deleteCharactersInRange:NSMakeRange(0, 1)];
NSInteger value = s.integerValue;
if (value < 1) {
return nil;
}
return RSParserStringWithValue((unichar)value);
}
return nil;
}
- (NSData *)_rsparser_md5HashData {
NSData *data = [self dataUsingEncoding:NSUTF8StringEncoding];
unsigned char hash[CC_MD5_DIGEST_LENGTH];
CC_MD5(data.bytes, (CC_LONG)data.length, hash);
return [NSData dataWithBytes:(const void *)hash length:CC_MD5_DIGEST_LENGTH];
}
- (NSString *)rsparser_md5Hash {
NSData *md5Data = [self _rsparser_md5HashData];
const Byte *bytes = md5Data.bytes;
return [NSString stringWithFormat:@"%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x", bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15]];
}
@end
@implementation NSScanner (RSParser)
- (BOOL)rs_scanEntityValue:(NSString * _Nullable * _Nullable)decodedEntity {
NSString *s = self.string;
NSUInteger initialScanLocation = self.scanLocation;
static NSUInteger maxEntityLength = 20; // Its probably smaller, but this is just for sanity.
while (true) {
unichar ch = [s characterAtIndex:self.scanLocation];
if ([NSCharacterSet.whitespaceAndNewlineCharacterSet characterIsMember:ch]) {
break;
}
if (ch == ';') {
if (!decodedEntity) {
return YES;
}
NSString *rawEntity = [s substringWithRange:NSMakeRange(initialScanLocation + 1, (self.scanLocation - initialScanLocation) - 1)];
*decodedEntity = [rawEntity rs_stringByDecodingEntity];
self.scanLocation = self.scanLocation + 1;
return *decodedEntity != nil;
}
self.scanLocation = self.scanLocation + 1;
if (self.scanLocation - initialScanLocation > maxEntityLength) {
break;
}
if (self.isAtEnd) {
break;
}
}
return NO;
}
@end
static NSString *RSParserStringWithValue(unichar value) {
return [[NSString alloc] initWithFormat:@"%C", value];
}
static NSDictionary *RSEntitiesDictionary(void) {
static NSDictionary *entitiesDictionary = nil;
static dispatch_once_t onceToken;
dispatch_once(&onceToken, ^{
entitiesDictionary =
@{@"#034": @"\"",
@"#038": @"&",
@"#38": @"&",
@"#039": @"'",
@"#145": @"",
@"#146": @"",
@"#147": @"",
@"#148": @"",
@"#149": @"",
@"#150": @"-",
@"#151": @"",
@"#153": @"",
@"#160": RSParserStringWithValue(160),
@"#161": @"¡",
@"#162": @"¢",
@"#163": @"£",
@"#164": @"?",
@"#165": @"¥",
@"#166": @"?",
@"#167": @"§",
@"#168": @"¨",
@"#169": @"©",
@"#170": @"©",
@"#171": @"«",
@"#172": @"¬",
@"#173": @"¬",
@"#174": @"®",
@"#175": @"¯",
@"#176": @"°",
@"#177": @"±",
@"#178": @" ",
@"#179": @" ",
@"#180": @"´",
@"#181": @"µ",
@"#182": @"µ",
@"#183": @"·",
@"#184": @"¸",
@"#185": @" ",
@"#186": @"º",
@"#187": @"»",
@"#188": @"1/4",
@"#189": @"1/2",
@"#190": @"1/2",
@"#191": @"¿",
@"#192": @"À",
@"#193": @"Á",
@"#194": @"Â",
@"#195": @"Ã",
@"#196": @"Ä",
@"#197": @"Å",
@"#198": @"Æ",
@"#199": @"Ç",
@"#200": @"È",
@"#201": @"É",
@"#202": @"Ê",
@"#203": @"Ë",
@"#204": @"Ì",
@"#205": @"Í",
@"#206": @"Î",
@"#207": @"Ï",
@"#208": @"?",
@"#209": @"Ñ",
@"#210": @"Ò",
@"#211": @"Ó",
@"#212": @"Ô",
@"#213": @"Õ",
@"#214": @"Ö",
@"#215": @"x",
@"#216": @"Ø",
@"#217": @"Ù",
@"#218": @"Ú",
@"#219": @"Û",
@"#220": @"Ü",
@"#221": @"Y",
@"#222": @"?",
@"#223": @"ß",
@"#224": @"à",
@"#225": @"á",
@"#226": @"â",
@"#227": @"ã",
@"#228": @"ä",
@"#229": @"å",
@"#230": @"æ",
@"#231": @"ç",
@"#232": @"è",
@"#233": @"é",
@"#234": @"ê",
@"#235": @"ë",
@"#236": @"ì",
@"#237": @"í",
@"#238": @"î",
@"#239": @"ï",
@"#240": @"?",
@"#241": @"ñ",
@"#242": @"ò",
@"#243": @"ó",
@"#244": @"ô",
@"#245": @"õ",
@"#246": @"ö",
@"#247": @"÷",
@"#248": @"ø",
@"#249": @"ù",
@"#250": @"ú",
@"#251": @"û",
@"#252": @"ü",
@"#253": @"y",
@"#254": @"?",
@"#255": @"ÿ",
@"#32": @" ",
@"#34": @"\"",
@"#39": @"'",
@"#8194": @" ",
@"#8195": @" ",
@"#8211": @"-",
@"#8212": @"",
@"#8216": @"",
@"#8217": @"",
@"#8220": @"",
@"#8221": @"",
@"#8230": @"",
@"#8617": RSParserStringWithValue(8617),
@"AElig": @"Æ",
@"Aacute": @"Á",
@"Acirc": @"Â",
@"Agrave": @"À",
@"Aring": @"Å",
@"Atilde": @"Ã",
@"Auml": @"Ä",
@"Ccedil": @"Ç",
@"Dstrok": @"?",
@"ETH": @"?",
@"Eacute": @"É",
@"Ecirc": @"Ê",
@"Egrave": @"È",
@"Euml": @"Ë",
@"Iacute": @"Í",
@"Icirc": @"Î",
@"Igrave": @"Ì",
@"Iuml": @"Ï",
@"Ntilde": @"Ñ",
@"Oacute": @"Ó",
@"Ocirc": @"Ô",
@"Ograve": @"Ò",
@"Oslash": @"Ø",
@"Otilde": @"Õ",
@"Ouml": @"Ö",
@"Pi": @"Π",
@"THORN": @"?",
@"Uacute": @"Ú",
@"Ucirc": @"Û",
@"Ugrave": @"Ù",
@"Uuml": @"Ü",
@"Yacute": @"Y",
@"aacute": @"á",
@"acirc": @"â",
@"acute": @"´",
@"aelig": @"æ",
@"agrave": @"à",
@"amp": @"&",
@"apos": @"'",
@"aring": @"å",
@"atilde": @"ã",
@"auml": @"ä",
@"brkbar": @"?",
@"brvbar": @"?",
@"ccedil": @"ç",
@"cedil": @"¸",
@"cent": @"¢",
@"copy": @"©",
@"curren": @"?",
@"deg": @"°",
@"die": @"?",
@"divide": @"÷",
@"eacute": @"é",
@"ecirc": @"ê",
@"egrave": @"è",
@"eth": @"?",
@"euml": @"ë",
@"euro": @"",
@"frac12": @"1/2",
@"frac14": @"1/4",
@"frac34": @"3/4",
@"gt": @">",
@"hearts": @"",
@"hellip": @"",
@"iacute": @"í",
@"icirc": @"î",
@"iexcl": @"¡",
@"igrave": @"ì",
@"iquest": @"¿",
@"iuml": @"ï",
@"laquo": @"«",
@"ldquo": @"",
@"lsquo": @"",
@"lt": @"<",
@"macr": @"¯",
@"mdash": @"",
@"micro": @"µ",
@"middot": @"·",
@"ndash": @"-",
@"not": @"¬",
@"ntilde": @"ñ",
@"oacute": @"ó",
@"ocirc": @"ô",
@"ograve": @"ò",
@"ordf": @"ª",
@"ordm": @"º",
@"oslash": @"ø",
@"otilde": @"õ",
@"ouml": @"ö",
@"para": @"",
@"pi": @"π",
@"plusmn": @"±",
@"pound": @"£",
@"quot": @"\"",
@"raquo": @"»",
@"rdquo": @"",
@"reg": @"®",
@"rsquo": @"",
@"sect": @"§",
@"shy": @" ",
@"sup1": @" ",
@"sup2": @" ",
@"sup3": @" ",
@"szlig": @"ß",
@"thorn": @"?",
@"times": @"x",
@"trade": @"",
@"uacute": @"ú",
@"ucirc": @"û",
@"ugrave": @"ù",
@"uml": @"¨",
@"uuml": @"ü",
@"yacute": @"y",
@"yen": @"¥",
@"yuml": @"ÿ",
@"infin": @"",
@"nbsp": RSParserStringWithValue(160),
@"#x21A9": RSParserStringWithValue(8617),
@"#xFE0E": RSParserStringWithValue(65038),
@"#x2019": RSParserStringWithValue(8217),
@"#x2026": RSParserStringWithValue(8230),
@"#x201C": RSParserStringWithValue(8220),
@"#x201D": RSParserStringWithValue(8221),
@"#x2014": RSParserStringWithValue(8212)};
});
return entitiesDictionary;
}