2017-06-25 19:23:30 +02:00
|
|
|
|
//
|
|
|
|
|
// NSData+RSParser.m
|
|
|
|
|
// RSParser
|
|
|
|
|
//
|
|
|
|
|
// Created by Brent Simmons on 6/24/17.
|
|
|
|
|
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
|
2017-10-04 22:28:48 +02:00
|
|
|
|
#import <RSParser/NSData+RSParser.h>
|
2017-06-25 19:23:30 +02:00
|
|
|
|
|
|
|
|
|
/* TODO: find real-world cases where the isProbably* cases fail when they should succeed, and add them to tests.*/
|
|
|
|
|
|
|
|
|
|
// Returns YES if the buffer contains a common HTML tag or doctype marker. Defined below the @implementation.
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes);
|
|
|
|
|
// Returns YES if the buffer begins with "<?xml", ignoring leading whitespace. Defined below the @implementation.
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes);
|
|
|
|
|
// Returns YES if `string` is the first content in the buffer, ignoring leading whitespace. Defined below the @implementation.
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes);
|
2017-06-26 01:32:07 +02:00
|
|
|
|
// Returns YES if `string` is found by strnstr within the first numberOfBytes of `bytes`. Defined below the @implementation.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes);
|
2017-11-29 06:29:09 +01:00
|
|
|
|
// Returns YES if the buffer begins with "<rss", ignoring leading whitespace. Defined below the @implementation.
static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes);
|
2017-06-25 19:23:30 +02:00
|
|
|
|
|
|
|
|
|
@implementation NSData (RSParser)

#pragma mark - Content sniffing

/// YES when the data contains one of the HTML markers checked by bytesAreProbablyHTML.
- (BOOL)isProbablyHTML {

	const char *rawBytes = self.bytes;
	const NSUInteger rawLength = self.length;
	return bytesAreProbablyHTML(rawBytes, rawLength);
}

/// YES when the data passes the bytesAreProbablyXML check.
- (BOOL)isProbablyXML {

	const char *rawBytes = self.bytes;
	const NSUInteger rawLength = self.length;
	return bytesAreProbablyXML(rawBytes, rawLength);
}

/// YES when the first content in the data, ignoring leading whitespace, is an opening brace.
- (BOOL)isProbablyJSON {

	const char *rawBytes = self.bytes;
	const NSUInteger rawLength = self.length;
	return bytesStartWithStringIgnoringWhitespace("{", rawBytes, rawLength);
}

/// YES when the data looks like JSON and mentions the JSON Feed version URL.
- (BOOL)isProbablyJSONFeed {

	// The JSON check runs first; the version-URL search only happens for JSON-looking data.
	return [self isProbablyJSON] && didFindString("://jsonfeed.org/version/", self.bytes, self.length);
}

/// YES when the data looks like JSON and contains the strings "rss", "channel" and "item".
- (BOOL)isProbablyRSSInJSON {

	if (![self isProbablyJSON]) {
		return NO;
	}

	static const char * const requiredKeys[] = {"rss", "channel", "item"};
	static const size_t requiredKeyCount = sizeof(requiredKeys) / sizeof(requiredKeys[0]);

	const char *rawBytes = self.bytes;
	const NSUInteger rawLength = self.length;

	// Every key must be present; stop at the first one that is missing.
	for (size_t keyIndex = 0; keyIndex < requiredKeyCount; keyIndex++) {
		if (!didFindString(requiredKeys[keyIndex], rawBytes, rawLength)) {
			return NO;
		}
	}
	return YES;
}

/// YES when the data looks like an RSS (or RDF) feed.
- (BOOL)isProbablyRSS {

	const char *rawBytes = self.bytes;
	const NSUInteger rawLength = self.length;

	// Macworld’s RSS feed does not start with xml header.
	if (bytesStartWithRSS(rawBytes, rawLength)) {
		return YES;
	}

	// Everything below requires the data to look like XML.
	if (![self isProbablyXML]) {
		return NO;
	}

	const BOOL hasFeedRootElement = didFindString("<rss", rawBytes, rawLength) || didFindString("<rdf:RDF", rawBytes, rawLength);
	if (hasFeedRootElement) {
		return YES;
	}

	// At this writing (7 Dec. 2017), https://www.natashatherobot.com/feed/ is missing an opening <rss> tag, but it should be parsed anyway. It does have some other distinct RSS markers we can find.
	return didFindString("<channel>", rawBytes, rawLength) && didFindString("<pubDate>", rawBytes, rawLength);
}

/// YES when the data looks like XML and contains "<feed".
- (BOOL)isProbablyAtom {

	return [self isProbablyXML] && didFindString("<feed", self.bytes, self.length);
}

@end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Reports whether `string` occurs within the first `numberOfBytes` bytes of `bytes`.
///
/// The search is delegated to strnstr.
///
/// @param string         NUL-terminated text to look for.
/// @param bytes          Buffer to search.
/// @param numberOfBytes  Maximum number of bytes of `bytes` to examine.
/// @return YES if strnstr locates `string`, otherwise NO.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	return strnstr(bytes, string, numberOfBytes) != NULL;
}
|
|
|
|
|
|
|
|
|
|
/// Reports whether `string` is the first content in `bytes` once an optional UTF-8
/// byte order mark and any leading whitespace (space, tab, CR, LF) have been skipped.
///
/// @param string         NUL-terminated prefix to look for. A NULL or empty prefix never matches.
/// @param bytes          Buffer to inspect; it does not need to be NUL-terminated.
/// @param numberOfBytes  Number of readable bytes in `bytes`.
/// @return YES if the prefix appears immediately after the skipped BOM/whitespace, otherwise NO.
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	if (string == NULL || bytes == NULL) {
		return NO;
	}

	const size_t prefixLength = strlen(string);
	if (prefixLength == 0) {
		return NO;
	}

	NSUInteger offset = 0;

	// Data may begin with a UTF-8 byte order mark (EF BB BF). It is not content, so step over it
	// rather than letting it defeat the prefix check. The casts avoid sign problems with plain char.
	if (numberOfBytes >= 3
		&& (unsigned char)bytes[0] == 0xEF
		&& (unsigned char)bytes[1] == 0xBB
		&& (unsigned char)bytes[2] == 0xBF) {
		offset = 3;
	}

	// Skip leading whitespace.
	while (offset < numberOfBytes) {
		const char ch = bytes[offset];
		if (ch != ' ' && ch != '\r' && ch != '\n' && ch != '\t') {
			break;
		}
		offset++;
	}

	// Compare only at the first significant byte. This is bounded by the prefix length, so a
	// mismatch does not trigger a search through the rest of the buffer.
	if (numberOfBytes - offset < prefixLength) {
		return NO;
	}
	return memcmp(bytes + offset, string, prefixLength) == 0;
}
|
|
|
|
|
|
|
|
|
|
/// Reports whether the buffer contains one of a fixed set of HTML markers.
///
/// The markers are matched case-sensitively, exactly as listed.
///
/// @param bytes          Buffer to search.
/// @param numberOfBytes  Maximum number of bytes of `bytes` to examine.
/// @return YES if any marker is found, otherwise NO.
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const htmlMarkers[] = {
		"<html",
		"<HTML",
		"<body",
		"<meta",
		"doctype html",
		"DOCTYPE html",
		"DOCTYPE HTML"
	};
	static const size_t htmlMarkerCount = sizeof(htmlMarkers) / sizeof(htmlMarkers[0]);

	// The tag markers all contain "<", and the doctype markers only count when a "<" is
	// present somewhere, so nothing can match in a buffer that has no "<" at all.
	if (!didFindString("<", bytes, numberOfBytes)) {
		return NO;
	}

	for (size_t markerIndex = 0; markerIndex < htmlMarkerCount; markerIndex++) {
		if (didFindString(htmlMarkers[markerIndex], bytes, numberOfBytes)) {
			return YES;
		}
	}

	return NO;
}
|
|
|
|
|
|
|
|
|
|
/// Reports whether the buffer starts with "<?xml", as judged by
/// bytesStartWithStringIgnoringWhitespace.
///
/// @param bytes          Buffer to inspect.
/// @param numberOfBytes  Number of bytes of `bytes` to consider.
/// @return YES if the prefix check succeeds, otherwise NO.
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const xmlDeclarationPrefix = "<?xml";
	return bytesStartWithStringIgnoringWhitespace(xmlDeclarationPrefix, bytes, numberOfBytes);
}
|
|
|
|
|
|
2017-11-29 06:29:09 +01:00
|
|
|
|
/// Reports whether the buffer starts with "<rss", as judged by
/// bytesStartWithStringIgnoringWhitespace.
///
/// @param bytes          Buffer to inspect.
/// @param numberOfBytes  Number of bytes of `bytes` to consider.
/// @return YES if the prefix check succeeds, otherwise NO.
static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const rssOpeningTagPrefix = "<rss";
	return bytesStartWithStringIgnoringWhitespace(rssOpeningTagPrefix, bytes, numberOfBytes);
}
|