NetNewsWire/Frameworks/RSParser/Utilities/NSData+RSParser.m

//
//  NSData+RSParser.m
//  RSParser
//
//  Created by Brent Simmons on 6/24/17.
//  Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//

#import <RSParser/NSData+RSParser.h>

/* TODO: find real-world cases where the isProbably* methods fail when they should succeed, and add them to tests. */

static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes);
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes);

// Cheap, heuristic content sniffing on the raw bytes of an NSData.
// Every method answers "probably": each one looks for marker strings in the
// bytes and does no real parsing.
@implementation NSData (RSParser)

// YES when the bytes contain one of the HTML markers checked by bytesAreProbablyHTML.
- (BOOL)isProbablyHTML {

	const char *bytes = self.bytes;
	return bytesAreProbablyHTML(bytes, self.length);
}

// YES when the first non-whitespace bytes are "<?xml".
- (BOOL)isProbablyXML {

	const char *bytes = self.bytes;
	return bytesAreProbablyXML(bytes, self.length);
}

// YES when the first non-whitespace byte is "{".
- (BOOL)isProbablyJSON {

	const char *bytes = self.bytes;
	return bytesStartWithStringIgnoringWhitespace("{", bytes, self.length);
}

// YES when the data looks like JSON and contains the JSON Feed version URL.
// The URL scheme is deliberately left out of the search string, so any scheme matches.
- (BOOL)isProbablyJSONFeed {

	return [self isProbablyJSON] && didFindString("://jsonfeed.org/version/", self.bytes, self.length);
}

// YES when the data looks like JSON and contains "rss", "channel" and "item" somewhere.
- (BOOL)isProbablyRSSInJSON {

	if (![self isProbablyJSON]) {
		return NO;
	}

	const char *bytes = self.bytes;
	NSUInteger length = self.length;

	if (!didFindString("rss", bytes, length)) {
		return NO;
	}
	if (!didFindString("channel", bytes, length)) {
		return NO;
	}
	return didFindString("item", bytes, length);
}

// YES when the data looks like an RSS (or RDF) feed.
- (BOOL)isProbablyRSS {

	const char *bytes = self.bytes;
	NSUInteger length = self.length;

	// Macworld’s RSS feed does not start with xml header, so a leading <rss is accepted on its own.
	if (bytesStartWithRSS(bytes, length)) {
		return YES;
	}

	// Everything below requires the XML header.
	if (![self isProbablyXML]) {
		return NO;
	}

	BOOL hasRootElement = didFindString("<rss", bytes, length) || didFindString("<rdf:RDF", bytes, length);
	if (hasRootElement) {
		return YES;
	}

	// At this writing (7 Dec. 2017), https://www.natashatherobot.com/feed/ is missing an opening <rss> tag,
	// but it should be parsed anyway. It does have some other distinct RSS markers we can find.
	return didFindString("<channel>", bytes, length) && didFindString("<pubDate>", bytes, length);
}

// YES when the data starts with the XML header and contains "<feed".
- (BOOL)isProbablyAtom {

	return [self isProbablyXML] && didFindString("<feed", self.bytes, self.length);
}

@end


// Returns YES when the C string `string` occurs somewhere within the first
// `numberOfBytes` bytes of `bytes`. The search is done by strnstr.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	return strnstr(bytes, string, numberOfBytes) != NULL;
}

// Returns YES when `string` appears in `bytes` immediately after any leading
// whitespace (space, CR, LF or tab), examining at most `numberOfBytes` bytes.
//
// string:        NUL-terminated text to look for. An empty string never matches.
// bytes:         buffer to inspect; it does not need to be NUL-terminated.
// numberOfBytes: number of readable bytes in `bytes`.
//
// The comparison is anchored at the first non-whitespace byte and never reads
// past `numberOfBytes`, so a mismatch costs only the length of `string` rather
// than a search through the whole buffer.
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	if (string == NULL || bytes == NULL) {
		return NO;
	}

	const size_t stringLength = strlen(string);
	if (stringLength == 0) {
		return NO;
	}

	// Skip leading whitespace.
	NSUInteger i = 0;
	while (i < numberOfBytes) {
		const char ch = bytes[i];
		if (ch != ' ' && ch != '\r' && ch != '\n' && ch != '\t') {
			break;
		}
		i++;
	}

	// Not enough bytes left to hold `string` (this also covers all-whitespace and empty input).
	if (numberOfBytes - i < stringLength) {
		return NO;
	}

	return memcmp(bytes + i, string, stringLength) == 0;
}

// Returns YES when the buffer contains one of a fixed set of HTML markers.
// Tag markers count on their own; the doctype markers are only consulted when
// the buffer contains at least one "<". All matching is case-sensitive, so
// only the exact spellings listed below are recognized.
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const tagMarkers[] = {"<html", "<HTML", "<body", "<meta"};
	static const char * const doctypeMarkers[] = {"doctype html", "DOCTYPE html", "DOCTYPE HTML"};
	const NSUInteger tagMarkerCount = sizeof(tagMarkers) / sizeof(tagMarkers[0]);
	const NSUInteger doctypeMarkerCount = sizeof(doctypeMarkers) / sizeof(doctypeMarkers[0]);

	for (NSUInteger markerIndex = 0; markerIndex < tagMarkerCount; markerIndex++) {
		if (didFindString(tagMarkers[markerIndex], bytes, numberOfBytes)) {
			return YES;
		}
	}

	// Without any "<" the doctype text cannot be part of markup.
	if (!didFindString("<", bytes, numberOfBytes)) {
		return NO;
	}

	for (NSUInteger markerIndex = 0; markerIndex < doctypeMarkerCount; markerIndex++) {
		if (didFindString(doctypeMarkers[markerIndex], bytes, numberOfBytes)) {
			return YES;
		}
	}

	return NO;
}

// Returns YES when the buffer, after any leading whitespace, begins with "<?xml".
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const xmlDeclarationStart = "<?xml";
	return bytesStartWithStringIgnoringWhitespace(xmlDeclarationStart, bytes, numberOfBytes);
}

// Returns YES when the buffer, after any leading whitespace, begins with "<rss".
static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const rssTagStart = "<rss";
	return bytesStartWithStringIgnoringWhitespace(rssTagStart, bytes, numberOfBytes);
}
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+								//
 								//  NSData+RSParser.m
 								//  RSParser
 								//
 								//  Created by Brent Simmons on 6/24/17.
 								//  Copyright © 2017 Ranchero Software, LLC. All rights reserved.
 								//
-												Fix builder errors, mostly in RSParser.

											
										
										
											2017-10-04 22:28:48 +02:00
+								#import <RSParser/NSData+RSParser.h>
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
 								/* TODO: find real-world cases where the isProbably* cases fail when they should succeed, and add them to tests.*/
 								static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes);
 								static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes);
 								static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes);
-												Make it build. Add a README.

											
										
										
											2017-06-26 01:32:07 +02:00
+								static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes);
-												Fix bug detecting Macworld’s RSS feed as an RSS feed. The feed doesn’t start with the standard XML header.

											
										
										
											2017-11-29 06:29:09 +01:00
+								static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes);
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
 								@implementation NSData (RSParser)
 								- (BOOL)isProbablyHTML {
 									return bytesAreProbablyHTML(self.bytes, self.length);
 								}
 								- (BOOL)isProbablyXML {
 									return bytesAreProbablyXML(self.bytes, self.length);
 								}
 								- (BOOL)isProbablyJSON {
 									return bytesStartWithStringIgnoringWhitespace("{", self.bytes, self.length);
 								}
 								- (BOOL)isProbablyJSONFeed {
 									if (![self isProbablyJSON]) {
 										return NO;
 									}
-												When detecting and parsing a potential JSON Feed, allow for the version URL to have the wrong scheme, as it does (at this writing) in https://pxlnv.com/feed/json/

Fix #347.

											
										
										
											2018-02-15 05:56:02 +01:00
+									return didFindString("://jsonfeed.org/version/", self.bytes, self.length);
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+								}
-												Make progress on getting RSParser.framework to build.

											
										
										
											2017-06-25 23:06:01 +02:00
+								- (BOOL)isProbablyRSSInJSON {
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
 									if (![self isProbablyJSON]) {
 										return NO;
 									}
 									const char *bytes = self.bytes;
 									NSUInteger length = self.length;
 									return didFindString("rss", bytes, length) && didFindString("channel", bytes, length) && didFindString("item", bytes, length);
 								}
 								- (BOOL)isProbablyRSS {
-												Fix bug detecting Macworld’s RSS feed as an RSS feed. The feed doesn’t start with the standard XML header.

											
										
										
											2017-11-29 06:29:09 +01:00
+									if (bytesStartWithRSS(self.bytes, self.length)) { // Macworld’s RSS feed does not start with xml header.
 										return YES;
 									}
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+									if (![self isProbablyXML]) {
 										return NO;
 									}
-												Add case for detecting <https://www.natashatherobot.com/feed/> as an RSS feed.

It’s missing the opening <rss> tag, but it has enough other distinct markers that we can detect it as RSS.

Add two tests to make sure it’s detected and that the parser handles it.

											
										
										
											2017-12-08 05:05:58 +01:00
+									if (didFindString("<rss", self.bytes, self.length) || didFindString("<rdf:RDF", self.bytes, self.length)) {
 										return YES;
 									}
 									// At this writing (7 Dec. 2017), https://www.natashatherobot.com/feed/ is missing an opening <rss> tag, but it should be parsed anyway. It does have some other distinct RSS markers we can find.
 									return (didFindString("<channel>", self.bytes, self.length) && didFindString("<pubDate>", self.bytes, self.length));
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+								}
 								- (BOOL)isProbablyAtom {
 									if (![self isProbablyXML]) {
 										return NO;
 									}
-												Make it build. Add a README.

											
										
										
											2017-06-26 01:32:07 +02:00
+									return didFindString("<feed", self.bytes, self.length);
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+								}
 								@end
 								static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes) {
 									char *foundString = strnstr(bytes, string, numberOfBytes);
 									return foundString != NULL;
 								}
 								static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes) {
 									NSUInteger i = 0;
 									for (i = 0; i < numberOfBytes; i++) {
-												Make it build. Add a README.

											
										
										
											2017-06-26 01:32:07 +02:00
+										const char ch = bytes[i];
 										if (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t') {
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+											continue;
 										}
 										if (ch == string[0]) {
 											return strnstr(bytes, string, numberOfBytes) == bytes + i;
 										}
 										break;
 									}
 									return NO;
 								}
 								static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes) {
 									if (didFindString("<html", bytes, numberOfBytes)) {
 										return YES;
 									}
 									if (didFindString("<HTML", bytes, numberOfBytes)) {
 										return YES;
 									}
 									if (didFindString("<body", bytes, numberOfBytes)) {
 										return YES;
 									}
 									if (didFindString("<meta", bytes, numberOfBytes)) {
 										return YES;
 									}
-												Make it build. Add a README.

											
										
										
											2017-06-26 01:32:07 +02:00
+									if (didFindString("<", bytes, numberOfBytes)) {
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+										if (didFindString("doctype html", bytes, numberOfBytes)) {
 											return YES;
 										}
 										if (didFindString("DOCTYPE html", bytes, numberOfBytes)) {
 											return YES;
 										}
 										if (didFindString("DOCTYPE HTML", bytes, numberOfBytes)) {
 											return YES;
 										}
 									}
 									return NO;
 								}
 								static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes) {
-												Make it build. Add a README.

											
										
										
											2017-06-26 01:32:07 +02:00
+									return bytesStartWithStringIgnoringWhitespace("<?xml", bytes, numberOfBytes);
-												Start RSS-in-JSON parser.

											
										
										
											2017-06-25 19:23:30 +02:00
+								}
-												Fix bug detecting Macworld’s RSS feed as an RSS feed. The feed doesn’t start with the standard XML header.

											
										
										
											2017-11-29 06:29:09 +01:00
+								static BOOL bytesStartWithRSS(const char *bytes, NSUInteger numberOfBytes) {
 									return bytesStartWithStringIgnoringWhitespace("<rss", bytes, numberOfBytes);
 								}