Start RSS-in-JSON parser.

This commit is contained in:
Brent Simmons 2017-06-25 10:23:30 -07:00
parent 552ab693a3
commit 8589c21091
12 changed files with 482 additions and 4 deletions

View File

@ -8,19 +8,65 @@
import Foundation
// FeedParser knows about the various syndication feed types.
// It might be a good idea to do a plugin-style architecture here instead
// but new feed formats don't appear all that often, so it's probably not necessary.
public struct FeedParser {

	/// Minimum number of bytes that must be available before `feedType(parserData:)` will guess.
	static let minNumberOfBytesRequired = 128

	/// Sniffs `parserData.data` and reports which kind of feed it appears to contain.
	///
	/// Can be called with partial data while still downloading, for instance.
	/// If there's not enough data, returns `.unknown` -- ask again when there's more data.
	/// If it's definitely not a feed, returns `.notAFeed`.
	///
	/// - Parameter parserData: The (possibly partial) downloaded data to inspect.
	/// - Returns: The detected `FeedType`.
	public static func feedType(parserData: ParserData) -> FeedType {

		if parserData.data.count < minNumberOfBytesRequired {
			return .unknown
		}

		// The isProbably* sniffers are implemented as an Objective-C category on NSData,
		// so bridge before calling them.
		let data = parserData.data as NSData

		// JSON formats are checked first, then HTML, then the XML formats.
		if data.isProbablyJSONFeed() {
			return .jsonFeed
		}
		if data.isProbablyRSSInJSONFeed() {
			return .rssInJSON
		}

		if data.isProbablyHTML() {
			return .notAFeed
		}

		if data.isProbablyRSS() {
			return .rss
		}
		if data.isProbablyAtom() {
			return .atom
		}

		return .notAFeed
	}

	/// Detects the feed type of `parserData` and hands it to the matching parser.
	///
	/// - Parameter parserData: The complete downloaded data for the feed.
	/// - Returns: The parsed feed, or `nil` when the type is `.unknown` or `.notAFeed`,
	///   or when the RSS-in-JSON parser throws.
	public static func parseFeed(parserData: ParserData) -> ParsedFeed? {

		switch feedType(parserData: parserData) {

		case .jsonFeed:
			// NOTE(review): JSONFeedParser, RSSParser and AtomParser are declared elsewhere;
			// confirm their `parse` argument labels.
			return JSONFeedParser.parse(parserData)

		case .rssInJSON:
			// RSSInJSONParser.parse(parserData:) throws; this function's signature is
			// non-throwing, so a parse failure is reported as nil.
			do {
				return try RSSInJSONParser.parse(parserData: parserData)
			}
			catch {
				return nil
			}

		case .rss:
			return RSSParser.parse(parserData)

		case .atom:
			return AtomParser.parse(parserData)

		case .unknown, .notAFeed:
			return nil
		}
	}
}

View File

@ -0,0 +1,26 @@
//
// FeedParserError.swift
// RSParser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// Error value thrown by the feed parsers; `errorType` says what went wrong.
public struct FeedParserError: Error {

	/// The specific failures a feed parser can report.
	public enum FeedParserErrorType {
		case rssChannelNotFound, rssItemsNotFound
	}

	/// The reason this error was thrown.
	public let errorType: FeedParserErrorType

	/// Creates an error of the given type.
	///
	/// - Parameter errorType: The failure being reported.
	public init(_ errorType: FeedParserErrorType) {
		self.errorType = errorType
	}
}

View File

@ -0,0 +1,12 @@
//
// JSONDictionary.swift
// RSParser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// A JSON object: string keys mapped to values of any type.
typealias JSONDictionary = [String: Any]
/// A JSON array whose elements are all JSON objects.
typealias JSONArray = [JSONDictionary]

View File

@ -0,0 +1,158 @@
//
// RSSInJSONParser.swift
// RSParser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
// See https://github.com/scripting/Scripting-News/blob/master/rss-in-json/README.md
// Also: http://cyber.harvard.edu/rss/rss.html
public struct RSSInJSONParser {

	/// Parses RSS-in-JSON data into a `ParsedFeed`.
	///
	/// - Parameter parserData: Supplies the raw JSON bytes (`data`) and the feed's URL (`url`).
	/// - Returns: A feed of type `.rssInJSON` built from the channel and its items.
	/// - Throws: Any error from `JSONSerialization` when the data isn't valid JSON;
	///   `FeedParserError(.rssChannelNotFound)` when no channel dictionary can be found;
	///   `FeedParserError(.rssItemsNotFound)` when no items array can be found.
	public static func parse(parserData: ParserData) throws -> ParsedFeed? {

		// jsonObject(with:) returns Any, so it must be cast before it can be subscripted.
		guard let parsedObject = try JSONSerialization.jsonObject(with: parserData.data) as? JSONDictionary else {
			throw FeedParserError(.rssChannelNotFound)
		}

		// The RSS-in-JSON README nests the channel inside a top-level "rss" object.
		// A channel found directly at the top level is accepted too.
		let channelObject: JSONDictionary
		if let rssObject = parsedObject["rss"] as? JSONDictionary, let nestedChannel = rssObject["channel"] as? JSONDictionary {
			channelObject = nestedChannel
		}
		else if let topLevelChannel = parsedObject["channel"] as? JSONDictionary {
			channelObject = topLevelChannel
		}
		else {
			throw FeedParserError(.rssChannelNotFound)
		}

		// I'd bet money that in practice the items array won't always appear correctly inside the channel object.
		// I'd also bet that sometimes it gets called "items" instead of "item".
		let possibleItems = (channelObject["item"] as? JSONArray)
			?? (parsedObject["item"] as? JSONArray)
			?? (channelObject["items"] as? JSONArray)
			?? (parsedObject["items"] as? JSONArray)
		guard let itemsObject = possibleItems else {
			throw FeedParserError(.rssItemsNotFound)
		}

		let title = channelObject["title"] as? String
		let homePageURL = channelObject["link"] as? String
		let feedURL = parserData.url
		let feedDescription = channelObject["description"] as? String

		let items = parseItems(itemsObject)

		return ParsedFeed(type: .rssInJSON, title: title, homePageURL: homePageURL, feedURL: feedURL, feedDescription: feedDescription, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items)
	}
}
private extension RSSInJSONParser {

	/// Converts each item dictionary into a `ParsedItem`, dropping those that yield `nil`.
	///
	/// - Parameter itemsObject: The feed's array of item dictionaries.
	/// - Returns: The successfully parsed items, in their original order.
	static func parseItems(_ itemsObject: JSONArray) -> [ParsedItem] {

		// NOTE(review): on Swift 4.1 and later this flatMap should become compactMap.
		return itemsObject.flatMap { parsedItemWithDictionary($0) }
	}

	/// Builds a `ParsedItem` from one RSS-in-JSON item dictionary.
	///
	/// - Parameter itemDictionary: One element of the feed's item array.
	/// - Returns: The parsed item, or `nil` when the dictionary has no title and no description.
	static func parsedItemWithDictionary(_ itemDictionary: JSONDictionary) -> ParsedItem? {

		let externalURL = itemDictionary["link"] as? String
		let title = itemDictionary["title"] as? String

		// A description that contains no "<" is treated as plain text rather than HTML,
		// so at most one of contentHTML / contentText is non-nil.
		var contentHTML = itemDictionary["description"] as? String
		var contentText: String? = nil
		if let description = contentHTML, !description.contains("<") {
			contentText = description
			contentHTML = nil
		}
		if contentHTML == nil && contentText == nil && title == nil {
			return nil
		}

		var datePublished: Date? = nil
		if let datePublishedString = itemDictionary["pubDate"] as? String {
			datePublished = RSDateWithString(datePublishedString as NSString)
		}

		// The "author" value is stored as the author's email address.
		let authorEmailAddress = itemDictionary["author"] as? String
		var authors: [ParsedAuthor]? = nil
		if let authorEmailAddress = authorEmailAddress {
			let parsedAuthor = ParsedAuthor(name: nil, url: nil, avatarURL: nil, emailAddress: authorEmailAddress)
			authors = [parsedAuthor]
		}

		// "category" may be a single object or an array of objects; the tag text is under "#value".
		var tags: [String]? = nil
		if let categoryObject = itemDictionary["category"] as? JSONDictionary {
			if let oneTag = categoryObject["#value"] as? String {
				tags = [oneTag]
			}
		}
		else if let categoryArray = itemDictionary["category"] as? JSONArray {
			tags = categoryArray.flatMap { $0["#value"] as? String }
		}

		var attachments: [ParsedAttachment]? = nil
		if let enclosureObject = itemDictionary["enclosure"] as? JSONDictionary {
			if let attachmentURL = enclosureObject["url"] as? String {

				// "length" may arrive as a JSON number or as a string of digits.
				var attachmentSize = enclosureObject["length"] as? Int
				if attachmentSize == nil {
					if let attachmentSizeString = enclosureObject["length"] as? String {
						attachmentSize = (attachmentSizeString as NSString).integerValue
					}
				}

				let type = enclosureObject["type"] as? String
				let oneAttachment = ParsedAttachment(url: attachmentURL, mimeType: type, title: nil, sizeInBytes: attachmentSize, durationInSeconds: nil)
				attachments = [oneAttachment]
			}
		}

		var uniqueID: String? = itemDictionary["guid"] as? String
		if uniqueID == nil {

			// Calculate a uniqueID based on a combination of non-empty elements. Then hash the result.
			// Items should have guids. When they don't, re-runs are very likely
			// because there's no other 100% reliable way to determine identity.
			// This calculated uniqueID is valid only for this particular feed. (Just like ids in JSON Feed.)

			var s = ""
			if let datePublished = datePublished {
				s += "\(datePublished.timeIntervalSince1970)"
			}
			if let title = title {
				s += title
			}
			if let externalURL = externalURL {
				s += externalURL
			}
			if let authorEmailAddress = authorEmailAddress {
				s += authorEmailAddress
			}
			if let oneAttachmentURL = attachments?.first?.url {
				s += oneAttachmentURL
			}
			if s.isEmpty {
				// Sheesh. Tough case. Fall back to whichever content string exists.
				if let contentHTML = contentHTML {
					s = contentHTML
				}
				if let contentText = contentText {
					s = contentText
				}
			}
			uniqueID = (s as NSString).rsxml_md5HashString()
		}

		return ParsedItem(uniqueID: uniqueID, url: nil, externalURL: externalURL, title: title, contentHTML: contentHTML, contentText: contentText, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: nil, authors: authors, tags: tags, attachments: attachments)
	}
}

View File

@ -15,4 +15,13 @@ public struct ParsedAttachment {
public let title: String?
public let sizeInBytes: Int?
public let durationInSeconds: Int?
/// Memberwise initializer: stores each argument in the property of the same name.
/// It has no access modifier, so it is internal to the module.
init(url: String?, mimeType: String?, title: String?, sizeInBytes: Int?, durationInSeconds: Int?) {
self.url = url
self.mimeType = mimeType
self.title = title
self.sizeInBytes = sizeInBytes
self.durationInSeconds = durationInSeconds
}
}

View File

@ -14,4 +14,12 @@ public struct ParsedAuthor {
public let url: String?
public let avatarURL: String?
public let emailAddress: String?
/// Memberwise initializer: stores each argument in the property of the same name.
/// It has no access modifier, so it is internal to the module.
init(name: String?, url: String?, avatarURL: String?, emailAddress: String?) {
self.name = name
self.url = url
self.avatarURL = avatarURL
self.emailAddress = emailAddress
}
}

View File

@ -22,4 +22,21 @@ public struct ParsedFeed {
public let expired: Bool
public let hubs: [ParsedHub]?
public let items: [ParsedItem]
/// Memberwise initializer: stores each argument in the property of the same name.
/// It has no access modifier, so it is internal to the module.
init(type: FeedType, title: String?, homePageURL: String?, feedURL: String?, feedDescription: String?, nextURL: String?, iconURL: String?, faviconURL: String?, authors: [ParsedAuthor]?, expired: Bool, hubs: [ParsedHub]?, items:[ParsedItem]) {
self.type = type
self.title = title
self.homePageURL = homePageURL
self.feedURL = feedURL
self.feedDescription = feedDescription
self.nextURL = nextURL
self.iconURL = iconURL
self.faviconURL = faviconURL
self.authors = authors
self.expired = expired
self.hubs = hubs
self.items = items
}
}

View File

@ -24,4 +24,23 @@ public struct ParsedItem {
public let authors: [ParsedAuthor]?
public let tags: [String]?
public let attachments: [ParsedAttachment]?
/// Memberwise initializer: stores each argument in the property of the same name.
/// It has no access modifier, so it is internal to the module.
init(uniqueID: String?, url: String?, externalURL: String?, title: String?, contentHTML: String?, contentText: String?, summary: String?, imageURL: String?, bannerImageURL: String?, datePublished: Date?, dateModified: Date?, authors: [ParsedAuthor]?, tags: [String]?, attachments: [ParsedAttachment]?) {
self.uniqueID = uniqueID
self.url = url
self.externalURL = externalURL
self.title = title
self.contentHTML = contentHTML
self.contentText = contentText
self.summary = summary
self.imageURL = imageURL
self.bannerImageURL = bannerImageURL
self.datePublished = datePublished
self.dateModified = dateModified
self.authors = authors
self.tags = tags
self.attachments = attachments
}
}

View File

@ -8,6 +8,8 @@
@import Foundation;
#import <RSParser/NSData+RSParser.h>
#import <RSParser/RSParser.h>
//#import <RSXML/RSSAXParser.h>
//#import <RSXML/RSXMLData.h>

View File

@ -45,6 +45,11 @@
84469D2F1EFA3134004A6B28 /* RSRSSParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 84469D251EFA3134004A6B28 /* RSRSSParser.h */; };
84469D301EFA3134004A6B28 /* RSRSSParser.m in Sources */ = {isa = PBXBuildFile; fileRef = 84469D261EFA3134004A6B28 /* RSRSSParser.m */; };
84469D321EFA31CF004A6B28 /* FeedParser.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84469D311EFA31CF004A6B28 /* FeedParser.swift */; };
84469D351EFF1190004A6B28 /* NSData+RSParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 84469D331EFF1190004A6B28 /* NSData+RSParser.h */; };
84469D361EFF1190004A6B28 /* NSData+RSParser.m in Sources */ = {isa = PBXBuildFile; fileRef = 84469D341EFF1190004A6B28 /* NSData+RSParser.m */; };
84469D381EFF2645004A6B28 /* RSSInJSONParser.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84469D371EFF2645004A6B28 /* RSSInJSONParser.swift */; };
84469D401EFF29A9004A6B28 /* FeedParserError.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84469D3F1EFF29A9004A6B28 /* FeedParserError.swift */; };
84469D421EFF2B2D004A6B28 /* JSONTypes.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84469D411EFF2B2D004A6B28 /* JSONTypes.swift */; };
84D81BDC1EFA28E700652332 /* RSParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 84D81BDA1EFA28E700652332 /* RSParser.h */; settings = {ATTRIBUTES = (Public, ); }; };
84D81BDE1EFA2B7D00652332 /* ParsedFeed.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84D81BDD1EFA2B7D00652332 /* ParsedFeed.swift */; };
84D81BE01EFA2BAE00652332 /* FeedType.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84D81BDF1EFA2BAE00652332 /* FeedType.swift */; };
@ -105,6 +110,11 @@
84469D251EFA3134004A6B28 /* RSRSSParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RSRSSParser.h; sourceTree = "<group>"; };
84469D261EFA3134004A6B28 /* RSRSSParser.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RSRSSParser.m; sourceTree = "<group>"; };
84469D311EFA31CF004A6B28 /* FeedParser.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = FeedParser.swift; path = Feeds/FeedParser.swift; sourceTree = "<group>"; };
84469D331EFF1190004A6B28 /* NSData+RSParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "NSData+RSParser.h"; sourceTree = "<group>"; };
84469D341EFF1190004A6B28 /* NSData+RSParser.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "NSData+RSParser.m"; sourceTree = "<group>"; };
84469D371EFF2645004A6B28 /* RSSInJSONParser.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = RSSInJSONParser.swift; path = Feeds/JSON/RSSInJSONParser.swift; sourceTree = "<group>"; };
84469D3F1EFF29A9004A6B28 /* FeedParserError.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = FeedParserError.swift; path = Feeds/FeedParserError.swift; sourceTree = "<group>"; };
84469D411EFF2B2D004A6B28 /* JSONTypes.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = JSONTypes.swift; path = Feeds/JSON/JSONTypes.swift; sourceTree = "<group>"; };
84D81BD91EFA28E700652332 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
84D81BDA1EFA28E700652332 /* RSParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RSParser.h; sourceTree = "<group>"; };
84D81BDD1EFA2B7D00652332 /* ParsedFeed.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = ParsedFeed.swift; path = Feeds/ParsedFeed.swift; sourceTree = "<group>"; };
@ -148,6 +158,8 @@
84D81BE31EFA2D3D00652332 /* ParsedItem.swift */,
84D81BE51EFA2DFB00652332 /* ParsedAttachment.swift */,
84D81BE71EFA2E6700652332 /* ParsedHub.swift */,
84469D3F1EFF29A9004A6B28 /* FeedParserError.swift */,
84469D391EFF2649004A6B28 /* JSON */,
84469D1C1EFA3134004A6B28 /* XML */,
);
name = Feeds;
@ -197,6 +209,8 @@
84469D0F1EFA30A2004A6B28 /* Utilities */ = {
isa = PBXGroup;
children = (
84469D331EFF1190004A6B28 /* NSData+RSParser.h */,
84469D341EFF1190004A6B28 /* NSData+RSParser.m */,
84469D101EFA30A2004A6B28 /* NSString+RSXML.h */,
84469D111EFA30A2004A6B28 /* NSString+RSXML.m */,
84469D121EFA30A2004A6B28 /* RSDateParser.h */,
@ -225,6 +239,15 @@
path = Feeds/XML;
sourceTree = "<group>";
};
84469D391EFF2649004A6B28 /* JSON */ = {
isa = PBXGroup;
children = (
84469D411EFF2B2D004A6B28 /* JSONTypes.swift */,
84469D371EFF2645004A6B28 /* RSSInJSONParser.swift */,
);
name = JSON;
sourceTree = "<group>";
};
84FF5F7A1EFA285800C15A01 = {
isa = PBXGroup;
children = (
@ -273,6 +296,7 @@
84469D2D1EFA3134004A6B28 /* RSParsedFeed.h in Headers */,
84469D181EFA30A2004A6B28 /* RSDateParser.h in Headers */,
84469D1A1EFA30A2004A6B28 /* RSXMLInternal.h in Headers */,
84469D351EFF1190004A6B28 /* NSData+RSParser.h in Headers */,
84D81BDC1EFA28E700652332 /* RSParser.h in Headers */,
84469D0B1EFA307E004A6B28 /* RSHTMLMetadataParser.h in Headers */,
84469CFC1EFA3069004A6B28 /* RSSAXParser.h in Headers */,
@ -392,6 +416,7 @@
84469D0E1EFA307E004A6B28 /* RSSAXHTMLParser.m in Sources */,
84469CF41EFA3000004A6B28 /* RSOPMLFeedSpecifier.m in Sources */,
84469CF01EFA3000004A6B28 /* RSOPMLAttributes.m in Sources */,
84469D381EFF2645004A6B28 /* RSSInJSONParser.swift in Sources */,
84469D301EFA3134004A6B28 /* RSRSSParser.m in Sources */,
84469D191EFA30A2004A6B28 /* RSDateParser.m in Sources */,
84469CFD1EFA3069004A6B28 /* RSSAXParser.m in Sources */,
@ -400,14 +425,17 @@
84469CF61EFA3000004A6B28 /* RSOPMLItem.m in Sources */,
84469D2A1EFA3134004A6B28 /* RSFeedParser.m in Sources */,
84D81BE41EFA2D3D00652332 /* ParsedItem.swift in Sources */,
84469D421EFF2B2D004A6B28 /* JSONTypes.swift in Sources */,
84469D0C1EFA307E004A6B28 /* RSHTMLMetadataParser.m in Sources */,
84469D0A1EFA307E004A6B28 /* RSHTMLMetadata.m in Sources */,
84469D171EFA30A2004A6B28 /* NSString+RSXML.m in Sources */,
84469D2C1EFA3134004A6B28 /* RSParsedArticle.m in Sources */,
84469D2E1EFA3134004A6B28 /* RSParsedFeed.m in Sources */,
84469CF81EFA3000004A6B28 /* RSOPMLParser.m in Sources */,
84469D401EFF29A9004A6B28 /* FeedParserError.swift in Sources */,
84469D321EFA31CF004A6B28 /* FeedParser.swift in Sources */,
84469D281EFA3134004A6B28 /* RSAtomParser.m in Sources */,
84469D361EFF1190004A6B28 /* NSData+RSParser.m in Sources */,
84D81BE61EFA2DFB00652332 /* ParsedAttachment.swift in Sources */,
84D81BDE1EFA2B7D00652332 /* ParsedFeed.swift in Sources */,
84D81BE81EFA2E6700652332 /* ParsedHub.swift in Sources */,

View File

@ -0,0 +1,21 @@
//
// NSData+RSParser.h
// RSParser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
@import Foundation;
// Quick, heuristic checks of what kind of content the data holds.
// They look for marker strings in the bytes; nothing is actually parsed.
@interface NSData (RSParser)

// YES if an HTML marker such as "<html", "<body", "<meta" or an HTML doctype is found.
- (BOOL)isProbablyHTML;

// YES if the first non-whitespace bytes are "<?xml".
- (BOOL)isProbablyXML;

// YES if the first non-whitespace byte is "{".
- (BOOL)isProbablyJSON;

// YES if the data is probably JSON and contains the JSON Feed version URL prefix.
- (BOOL)isProbablyJSONFeed;

// YES if the data is probably JSON and contains "rss", "channel" and "item".
- (BOOL)isProbablyRSSInJSONFeed;

// YES if the data is probably XML and contains "<rss".
- (BOOL)isProbablyRSS;

// YES if the data is probably XML and contains "<feed".
- (BOOL)isProbablyAtom;

@end

View File

@ -0,0 +1,132 @@
//
// NSData+RSParser.m
// RSParser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
#import "NSData+RSParser.h"
/* TODO: find real-world cases where the isProbably* cases fail when they should succeed, and add them to tests.*/
// Forward declarations of the C helpers defined below the category implementation.
// didFindString is included because the category methods call it before its definition.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes);
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes);
// Heuristic content sniffing. Each method hands the receiver's bytes and length
// to the C helpers in this file.
@implementation NSData (RSParser)

- (BOOL)isProbablyHTML {

	return bytesAreProbablyHTML(self.bytes, self.length);
}

- (BOOL)isProbablyXML {

	return bytesAreProbablyXML(self.bytes, self.length);
}

- (BOOL)isProbablyJSON {

	// Only checks that the first non-whitespace byte opens a JSON object.
	return bytesStartWithStringIgnoringWhitespace("{", self.bytes, self.length);
}

- (BOOL)isProbablyJSONFeed {

	if (![self isProbablyJSON]) {
		return NO;
	}
	return didFindString("https://jsonfeed.org/version/", self.bytes, self.length);
}

- (BOOL)isProbablyRSSInJSONFeed {

	if (![self isProbablyJSON]) {
		return NO;
	}
	const char *bytes = self.bytes;
	NSUInteger length = self.length;
	// All three markers must be present somewhere in the data.
	return didFindString("rss", bytes, length) && didFindString("channel", bytes, length) && didFindString("item", bytes, length);
}

- (BOOL)isProbablyRSS {

	if (![self isProbablyXML]) {
		return NO;
	}
	return didFindString("<rss", self.bytes, self.length);
}

- (BOOL)isProbablyAtom {

	if (![self isProbablyXML]) {
		return NO;
	}
	return didFindString("<feed", self.bytes, self.length);
}

@end
// Returns YES when `string` occurs within the first `numberOfBytes` bytes of `bytes`.
// The search uses strnstr, so it also ends at the first NUL byte in `bytes`.
static BOOL didFindString(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	return strnstr(bytes, string, numberOfBytes) != NULL;
}
// Returns YES when `bytes`, after skipping any leading spaces, tabs, CRs and LFs,
// begins with the C string `string`. Only the first `numberOfBytes` bytes are examined.
// Returns NO for NULL arguments or when the data is all whitespace.
static BOOL bytesStartWithStringIgnoringWhitespace(const char *string, const char *bytes, NSUInteger numberOfBytes) {

	if (string == NULL || bytes == NULL) {
		return NO;
	}

	const size_t stringLength = strlen(string);

	for (NSUInteger i = 0; i < numberOfBytes; i++) {

		const char ch = bytes[i];
		if (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t') {
			continue;
		}

		// First non-whitespace byte: the prefix must fit in what remains and match exactly.
		if (numberOfBytes - i < stringLength) {
			return NO;
		}
		return memcmp(bytes + i, string, stringLength) == 0;
	}

	return NO;
}
// Returns YES when the bytes contain one of a few markers that suggest an HTML document:
// an "<html", "<HTML", "<body" or "<meta" tag, or a "<" together with an HTML doctype.
// The matching is case-sensitive and limited to the spellings listed here.
static BOOL bytesAreProbablyHTML(const char *bytes, NSUInteger numberOfBytes) {

	static const char * const tagMarkers[] = {"<html", "<HTML", "<body", "<meta"};
	static const char * const doctypeMarkers[] = {"doctype html", "DOCTYPE html", "DOCTYPE HTML"};

	for (size_t i = 0; i < sizeof(tagMarkers) / sizeof(tagMarkers[0]); i++) {
		if (didFindString(tagMarkers[i], bytes, numberOfBytes)) {
			return YES;
		}
	}

	// A doctype only counts when the data contains at least one "<".
	if (didFindString("<", bytes, numberOfBytes)) {
		for (size_t i = 0; i < sizeof(doctypeMarkers) / sizeof(doctypeMarkers[0]); i++) {
			if (didFindString(doctypeMarkers[i], bytes, numberOfBytes)) {
				return YES;
			}
		}
	}

	return NO;
}
// Returns YES when the first non-whitespace bytes are an XML declaration opener ("<?xml").
static BOOL bytesAreProbablyXML(const char *bytes, NSUInteger numberOfBytes) {

	return bytesStartWithStringIgnoringWhitespace("<?xml", bytes, numberOfBytes);
}