diff --git a/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift index 153cf861b..6e467f7b4 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift @@ -36,7 +36,7 @@ public struct FeedParser { return try JSONFeedParser.parse(parserData) case .rssInJSON: - return nil // TODO: try RSSInJSONParser.parse(parserData) + return try RSSInJSONParser.parse(parserData) case .rss: let feed = RSSParser.parsedFeed(with: parserData) diff --git a/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift index 4bf2ad624..2dcd92313 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift @@ -1,182 +1,183 @@ -//// -//// RSSInJSONParser.swift -//// RSParser -//// -//// Created by Brent Simmons on 6/24/17. -//// Copyright © 2017 Ranchero Software, LLC. All rights reserved. -//// // -//import Foundation -//import SAX +// RSSInJSONParser.swift +// RSParser // -//// See https://github.com/scripting/Scripting-News/blob/master/rss-in-json/README.md -//// Also: http://cyber.harvard.edu/rss/rss.html +// Created by Brent Simmons on 6/24/17. +// Copyright © 2017 Ranchero Software, LLC. All rights reserved. // -//public struct RSSInJSONParser { -// -// public static func parse(_ parserData: ParserData) throws -> ParsedFeed? { -// -// do { -// guard let parsedObject = try JSONSerialization.jsonObject(with: parserData.data) as? JSONDictionary else { -// throw FeedParserError(.invalidJSON) -// } -// guard let rssObject = parsedObject["rss"] as? JSONDictionary else { -// throw FeedParserError(.rssChannelNotFound) -// } -// guard let channelObject = rssObject["channel"] as? JSONDictionary else { -// throw FeedParserError(.rssChannelNotFound) -// } -// -// // I’d bet money that in practice the items array won’t always appear correctly inside the channel object. -// // I’d also bet that sometimes it gets called "items" instead of "item". -// var itemsObject = channelObject["item"] as? JSONArray -// if itemsObject == nil { -// itemsObject = parsedObject["item"] as? JSONArray -// } -// if itemsObject == nil { -// itemsObject = channelObject["items"] as? JSONArray -// } -// if itemsObject == nil { -// itemsObject = parsedObject["items"] as? JSONArray -// } -// if itemsObject == nil { -// throw FeedParserError(.rssItemsNotFound) -// } -// -// let title = channelObject["title"] as? String -// let homePageURL = channelObject["link"] as? String -// let feedURL = parserData.url -// let feedDescription = channelObject["description"] as? String -// let feedLanguage = channelObject["language"] as? String -// -// let items = parseItems(itemsObject!, parserData.url) -// -// return ParsedFeed(type: .rssInJSON, title: title, homePageURL: homePageURL, feedURL: feedURL, language: feedLanguage, feedDescription: feedDescription, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items) -// -// } -// catch { throw error } -// } -//} -// -//private extension RSSInJSONParser { -// -// static func parseItems(_ itemsObject: JSONArray, _ feedURL: String) -> Set { -// -// return Set(itemsObject.compactMap{ (oneItemDictionary) -> ParsedItem? in -// -// return parsedItemWithDictionary(oneItemDictionary, feedURL) -// }) -// } -// -// static func parsedItemWithDictionary(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? { -// -// let externalURL = itemDictionary["link"] as? String -// let title = itemDictionary["title"] as? String -// -// var contentHTML = itemDictionary["description"] as? String -// var contentText: String? = nil -// if contentHTML != nil && !(contentHTML!.contains("<")) { -// contentText = contentHTML -// contentHTML = nil -// } -// if contentHTML == nil && contentText == nil && title == nil { -// return nil -// } -// -// var datePublished: Date? = nil -// if let datePublishedString = itemDictionary["pubDate"] as? String { -// datePublished = RSDateWithString(datePublishedString) -// } -// -// let authors = parseAuthors(itemDictionary) -// let tags = parseTags(itemDictionary) -// let attachments = parseAttachments(itemDictionary) -// -// var uniqueID: String? = itemDictionary["guid"] as? String -// if uniqueID == nil { -// -// // Calculate a uniqueID based on a combination of non-empty elements. Then hash the result. -// // Items should have guids. When they don't, re-runs are very likely -// // because there's no other 100% reliable way to determine identity. -// // This calculated uniqueID is valid only for this particular feed. (Just like ids in JSON Feed.) -// -// var s = "" -// if let datePublished = datePublished { -// s += "\(datePublished.timeIntervalSince1970)" -// } -// if let title = title { -// s += title -// } -// if let externalURL = externalURL { -// s += externalURL -// } -// if let authorEmailAddress = authors?.first?.emailAddress { -// s += authorEmailAddress -// } -// if let oneAttachmentURL = attachments?.first?.url { -// s += oneAttachmentURL -// } -// if s.isEmpty { -// // Sheesh. Tough case. -// if let _ = contentHTML { -// s = contentHTML! -// } -// if let _ = contentText { -// s = contentText! -// } -// } -// uniqueID = (s as NSString).rsparser_md5Hash() -// } -// -// if let uniqueID = uniqueID { -// return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: feedURL, url: nil, externalURL: externalURL, title: title, language: nil, contentHTML: contentHTML, contentText: contentText, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: nil, authors: authors, tags: tags, attachments: attachments) -// } -// return nil -// } -// -// static func parseAuthors(_ itemDictionary: JSONDictionary) -> Set? { -// -// guard let authorEmailAddress = itemDictionary["author"] as? String else { -// return nil -// } -// let parsedAuthor = ParsedAuthor(name: nil, url: nil, avatarURL: nil, emailAddress: authorEmailAddress) -// return Set([parsedAuthor]) -// } -// -// static func parseTags(_ itemDictionary: JSONDictionary) -> Set? { -// -// if let categoryObject = itemDictionary["category"] as? JSONDictionary { -// if let oneTag = categoryObject["#value"] as? String { -// return Set([oneTag]) -// } -// return nil -// } -// else if let categoryArray = itemDictionary["category"] as? JSONArray { -// return Set(categoryArray.compactMap{ $0["#value"] as? String }) -// } -// return nil -// } -// -// static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set? { -// -// guard let enclosureObject = itemDictionary["enclosure"] as? JSONDictionary else { -// return nil -// } -// guard let attachmentURL = enclosureObject["url"] as? String else { -// return nil -// } -// -// var attachmentSize = enclosureObject["length"] as? Int -// if attachmentSize == nil { -// if let attachmentSizeString = enclosureObject["length"] as? String { -// attachmentSize = (attachmentSizeString as NSString).integerValue -// } -// } -// -// let type = enclosureObject["type"] as? String -// if let attachment = ParsedAttachment(url: attachmentURL, mimeType: type, title: nil, sizeInBytes: attachmentSize, durationInSeconds: nil) { -// return Set([attachment]) -// } -// return nil -// } -//} + +import Foundation +import SAX +import DateParser + +// See https://github.com/scripting/Scripting-News/blob/master/rss-in-json/README.md +// Also: http://cyber.harvard.edu/rss/rss.html + +public struct RSSInJSONParser { + + public static func parse(_ parserData: ParserData) throws -> ParsedFeed? { + + do { + guard let parsedObject = try JSONSerialization.jsonObject(with: parserData.data) as? JSONDictionary else { + throw FeedParserError(.invalidJSON) + } + guard let rssObject = parsedObject["rss"] as? JSONDictionary else { + throw FeedParserError(.rssChannelNotFound) + } + guard let channelObject = rssObject["channel"] as? JSONDictionary else { + throw FeedParserError(.rssChannelNotFound) + } + + // I’d bet money that in practice the items array won’t always appear correctly inside the channel object. + // I’d also bet that sometimes it gets called "items" instead of "item". + var itemsObject = channelObject["item"] as? JSONArray + if itemsObject == nil { + itemsObject = parsedObject["item"] as? JSONArray + } + if itemsObject == nil { + itemsObject = channelObject["items"] as? JSONArray + } + if itemsObject == nil { + itemsObject = parsedObject["items"] as? JSONArray + } + if itemsObject == nil { + throw FeedParserError(.rssItemsNotFound) + } + + let title = channelObject["title"] as? String + let homePageURL = channelObject["link"] as? String + let feedURL = parserData.url + let feedDescription = channelObject["description"] as? String + let feedLanguage = channelObject["language"] as? String + + let items = parseItems(itemsObject!, parserData.url) + + return ParsedFeed(type: .rssInJSON, title: title, homePageURL: homePageURL, feedURL: feedURL, language: feedLanguage, feedDescription: feedDescription, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items) + + } + catch { throw error } + } +} + +private extension RSSInJSONParser { + + static func parseItems(_ itemsObject: JSONArray, _ feedURL: String) -> Set { + + return Set(itemsObject.compactMap{ (oneItemDictionary) -> ParsedItem? in + + return parsedItemWithDictionary(oneItemDictionary, feedURL) + }) + } + + static func parsedItemWithDictionary(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? { + + let externalURL = itemDictionary["link"] as? String + let title = itemDictionary["title"] as? String + + var contentHTML = itemDictionary["description"] as? String + var contentText: String? = nil + if contentHTML != nil && !(contentHTML!.contains("<")) { + contentText = contentHTML + contentHTML = nil + } + if contentHTML == nil && contentText == nil && title == nil { + return nil + } + + var datePublished: Date? = nil + if let datePublishedString = itemDictionary["pubDate"] as? String { + datePublished = DateParser.date(string: datePublishedString) + } + + let authors = parseAuthors(itemDictionary) + let tags = parseTags(itemDictionary) + let attachments = parseAttachments(itemDictionary) + + var uniqueID: String? = itemDictionary["guid"] as? String + if uniqueID == nil { + + // Calculate a uniqueID based on a combination of non-empty elements. Then hash the result. + // Items should have guids. When they don't, re-runs are very likely + // because there's no other 100% reliable way to determine identity. + // This calculated uniqueID is valid only for this particular feed. (Just like ids in JSON Feed.) + + var s = "" + if let datePublished = datePublished { + s += "\(datePublished.timeIntervalSince1970)" + } + if let title = title { + s += title + } + if let externalURL = externalURL { + s += externalURL + } + if let authorEmailAddress = authors?.first?.emailAddress { + s += authorEmailAddress + } + if let oneAttachmentURL = attachments?.first?.url { + s += oneAttachmentURL + } + if s.isEmpty { + // Sheesh. Tough case. + if let _ = contentHTML { + s = contentHTML! + } + if let _ = contentText { + s = contentText! + } + } + uniqueID = s.md5String + } + + if let uniqueID = uniqueID { + return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: feedURL, url: nil, externalURL: externalURL, title: title, language: nil, contentHTML: contentHTML, contentText: contentText, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: nil, authors: authors, tags: tags, attachments: attachments) + } + return nil + } + + static func parseAuthors(_ itemDictionary: JSONDictionary) -> Set? { + + guard let authorEmailAddress = itemDictionary["author"] as? String else { + return nil + } + let parsedAuthor = ParsedAuthor(name: nil, url: nil, avatarURL: nil, emailAddress: authorEmailAddress) + return Set([parsedAuthor]) + } + + static func parseTags(_ itemDictionary: JSONDictionary) -> Set? { + + if let categoryObject = itemDictionary["category"] as? JSONDictionary { + if let oneTag = categoryObject["#value"] as? String { + return Set([oneTag]) + } + return nil + } + else if let categoryArray = itemDictionary["category"] as? JSONArray { + return Set(categoryArray.compactMap{ $0["#value"] as? String }) + } + return nil + } + + static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set? { + + guard let enclosureObject = itemDictionary["enclosure"] as? JSONDictionary else { + return nil + } + guard let attachmentURL = enclosureObject["url"] as? String else { + return nil + } + + var attachmentSize = enclosureObject["length"] as? Int + if attachmentSize == nil { + if let attachmentSizeString = enclosureObject["length"] as? String { + attachmentSize = (attachmentSizeString as NSString).integerValue + } + } + + let type = enclosureObject["type"] as? String + if let attachment = ParsedAttachment(url: attachmentURL, mimeType: type, title: nil, sizeInBytes: attachmentSize, durationInSeconds: nil) { + return Set([attachment]) + } + return nil + } +} diff --git a/Modules/Parser/Tests/FeedParserTests/RSSInJSONParserTests.swift b/Modules/Parser/Tests/FeedParserTests/RSSInJSONParserTests.swift index df6bef373..4412dd062 100644 --- a/Modules/Parser/Tests/FeedParserTests/RSSInJSONParserTests.swift +++ b/Modules/Parser/Tests/FeedParserTests/RSSInJSONParserTests.swift @@ -9,20 +9,20 @@ import XCTest import FeedParser -//class RSSInJSONParserTests: XCTestCase { -// -// func testScriptingNewsPerformance() { -// -// // 0.003 sec on my 2012 iMac. -// let d = parserData("ScriptingNews", "json", "http://scripting.com/") -// self.measure { -// let _ = try! FeedParser.parseSync(d) -// } -// } -// -// func testFeedLanguage() { -// let d = parserData("ScriptingNews", "json", "http://scripting.com/") -// let parsedFeed = try! FeedParser.parseSync(d)! -// XCTAssertEqual(parsedFeed.language, "en-us") -// } -//} +class RSSInJSONParserTests: XCTestCase { + + func testScriptingNewsPerformance() { + + // 0.003 sec on my 2012 iMac. + let d = parserData("ScriptingNews", "json", "http://scripting.com/") + self.measure { + let _ = try! FeedParser.parse(d) + } + } + + func testFeedLanguage() { + let d = parserData("ScriptingNews", "json", "http://scripting.com/") + let parsedFeed = try! FeedParser.parse(d)! + XCTAssertEqual(parsedFeed.language, "en-us") + } +}