mirror of
https://github.com/Ranchero-Software/NetNewsWire.git
synced 2024-12-27 10:02:37 +01:00
3d72ba4b44
It’s missing the opening <rss> tag, but it has enough other distinct markers that we can detect it as RSS. Add two tests to make sure it’s detected and that the parser handles it.
147 lines
3.8 KiB
Objective-C
147 lines
3.8 KiB
Objective-C
//
// NSData+RSParser.m
// RSParser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
|
||
#import <RSParser/NSData+RSParser.h>
|
||
|
||
/* TODO: find real-world cases where the isProbably* methods fail when they should succeed, and add them to the tests. */
|
||
|
||
// Forward declarations of the file-private C helpers used by the NSData (RSParser) category.

// Low-level primitives: substring search and whitespace-tolerant prefix test.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes);

// Format sniffers.
static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes);
|
||
|
||
@implementation NSData (RSParser)

#pragma mark - Generic Formats

/// YES when the data contains one of the HTML markers checked by bytesAreProbablyHTML.
- (BOOL)isProbablyHTML {

	const char *rawBytes = self.bytes;
	const NSUInteger byteCount = self.length;
	return bytesAreProbablyHTML(rawBytes, byteCount);
}

/// YES when the first non-whitespace bytes are "<?xml".
- (BOOL)isProbablyXML {

	const char *rawBytes = self.bytes;
	const NSUInteger byteCount = self.length;
	return bytesAreProbablyXML(rawBytes, byteCount);
}

/// YES when the first non-whitespace byte is "{".
- (BOOL)isProbablyJSON {

	const char *rawBytes = self.bytes;
	const NSUInteger byteCount = self.length;
	return bytesStartWithStringIgnoringWhitespace("{", rawBytes, byteCount);
}

#pragma mark - JSON Feeds

/// YES when the data looks like JSON and contains the JSON Feed version URL prefix.
- (BOOL)isProbablyJSONFeed {

	return [self isProbablyJSON]
		&& didFindString("https://jsonfeed.org/version/", self.bytes, self.length);
}

/// YES when the data looks like JSON and contains "rss", "channel" and "item" somewhere in it.
- (BOOL)isProbablyRSSInJSON {

	if (![self isProbablyJSON]) {
		return NO;
	}

	const char *rawBytes = self.bytes;
	const NSUInteger byteCount = self.length;

	if (!didFindString("rss", rawBytes, byteCount)) {
		return NO;
	}
	if (!didFindString("channel", rawBytes, byteCount)) {
		return NO;
	}
	return didFindString("item", rawBytes, byteCount);
}

#pragma mark - XML Feeds

/// YES when the data looks like an RSS (or RDF) feed.
- (BOOL)isProbablyRSS {

	// Macworld’s RSS feed does not start with xml header.
	const BOOL startsWithRSSTag = bytesStartWithRSS(self.bytes, self.length);
	if (startsWithRSSTag) {
		return YES;
	}

	// Everything below requires an XML declaration at the start.
	if (![self isProbablyXML]) {
		return NO;
	}

	const BOOL hasRSSTag = didFindString("<rss", self.bytes, self.length);
	if (hasRSSTag || didFindString("<rdf:RDF", self.bytes, self.length)) {
		return YES;
	}

	// At this writing (7 Dec. 2017), https://www.natashatherobot.com/feed/ is missing an opening <rss> tag,
	// but it should be parsed anyway. It does have some other distinct RSS markers we can find.
	if (!didFindString("<channel>", self.bytes, self.length)) {
		return NO;
	}
	return didFindString("<pubDate>", self.bytes, self.length);
}

/// YES when the data starts with an XML declaration and contains "<feed".
- (BOOL)isProbablyAtom {

	return [self isProbablyXML] && didFindString("<feed", self.bytes, self.length);
}

@end
|
||
|
||
|
||
/// Reports whether `string` occurs within the first `numberOfBytes` bytes of `bytes`.
///
/// The search is case-sensitive and is delegated to strnstr, which does not look
/// past the first NUL byte in `bytes`.
///
/// @param string        NUL-terminated needle to look for.
/// @param bytes         Buffer to search.
/// @param numberOfBytes Maximum number of bytes of `bytes` to examine.
/// @return YES if strnstr found an occurrence, NO otherwise.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	return strnstr(bytes, string, numberOfBytes) != NULL;
}
|
||
|
||
/// Reports whether `bytes`, after skipping leading whitespace, begins with `string`.
///
/// Whitespace here means space, carriage return, line feed and tab only.
/// The comparison is case-sensitive.
///
/// @param string        NUL-terminated prefix to look for. An empty or NULL prefix never matches.
/// @param bytes         Buffer to examine; may be NULL (treated as no match).
/// @param numberOfBytes Number of valid bytes in `bytes`.
/// @return YES if the first non-whitespace bytes equal `string` in full, NO otherwise
///         (including when the buffer is empty, all whitespace, or too short to hold `string`).
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	if (string == NULL || bytes == NULL) {
		return NO;
	}

	const size_t prefixLength = strlen(string);
	if (prefixLength == 0) {
		return NO;
	}

	// Advance past any leading whitespace.
	NSUInteger offset = 0;
	while (offset < numberOfBytes) {
		const char ch = bytes[offset];
		if (ch != ' ' && ch != '\r' && ch != '\n' && ch != '\t') {
			break;
		}
		offset++;
	}

	// Not enough bytes left to hold the whole prefix (also covers the all-whitespace case).
	if (numberOfBytes - offset < prefixLength) {
		return NO;
	}

	// Compare in place: a prefix test only needs to inspect prefixLength bytes,
	// so there is no reason to search the rest of the buffer.
	return memcmp(bytes + offset, string, prefixLength) == 0;
}
|
||
|
||
/// Heuristically decides whether a buffer holds HTML.
///
/// Returns YES if any of the tag markers "<html", "<HTML", "<body" or "<meta" is found,
/// or if the buffer contains "<" together with one of the doctype spellings
/// "doctype html", "DOCTYPE html" or "DOCTYPE HTML". All searches are case-sensitive,
/// which is why several spellings are listed.
///
/// @param bytes         Buffer to examine.
/// @param numberOfBytes Number of bytes of `bytes` to examine.
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes) {

	// Any one of these is sufficient on its own.
	static const char * const tagMarkers[] = {"<html", "<HTML", "<body", "<meta"};
	// These only count when the buffer also contains a "<".
	static const char * const doctypeMarkers[] = {"doctype html", "DOCTYPE html", "DOCTYPE HTML"};

	const size_t tagMarkerCount = sizeof(tagMarkers) / sizeof(tagMarkers[0]);
	for (size_t tagIndex = 0; tagIndex < tagMarkerCount; tagIndex++) {
		if (didFindString(tagMarkers[tagIndex], bytes, numberOfBytes)) {
			return YES;
		}
	}

	if (!didFindString("<", bytes, numberOfBytes)) {
		return NO;
	}

	const size_t doctypeMarkerCount = sizeof(doctypeMarkers) / sizeof(doctypeMarkers[0]);
	for (size_t doctypeIndex = 0; doctypeIndex < doctypeMarkerCount; doctypeIndex++) {
		if (didFindString(doctypeMarkers[doctypeIndex], bytes, numberOfBytes)) {
			return YES;
		}
	}

	return NO;
}
|
||
|
||
/// Reports whether the buffer, ignoring leading whitespace, starts with the XML declaration opener "<?xml".
///
/// @param bytes         Buffer to examine.
/// @param numberOfBytes Number of valid bytes in `bytes`.
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const xmlDeclarationOpener = "<?xml";
	const BOOL startsWithDeclaration = bytesStartWithStringIgnoringWhitespace(xmlDeclarationOpener, bytes, numberOfBytes);
	return startsWithDeclaration;
}
|
||
|
||
/// Reports whether the buffer, ignoring leading whitespace, starts with an opening "<rss" tag.
///
/// @param bytes         Buffer to examine.
/// @param numberOfBytes Number of valid bytes in `bytes`.
static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const rssTagOpener = "<rss";
	const BOOL startsWithRSSTag = bytesStartWithStringIgnoringWhitespace(rssTagOpener, bytes, numberOfBytes);
	return startsWithRSSTag;
}
|