2017-05-22 22:27:54 +02:00
|
|
|
|
//
|
|
|
|
|
// HTMLFeedFinder.swift
|
2018-06-21 22:18:28 +02:00
|
|
|
|
// FeedFinder
|
2017-05-22 22:27:54 +02:00
|
|
|
|
//
|
|
|
|
|
// Created by Brent Simmons on 8/7/16.
|
2017-05-29 22:17:58 +02:00
|
|
|
|
// Copyright © 2016 Ranchero Software, LLC. All rights reserved.
|
2017-05-22 22:27:54 +02:00
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
import Foundation
|
2017-07-02 02:22:19 +02:00
|
|
|
|
import RSParser
|
2017-05-22 22:27:54 +02:00
|
|
|
|
|
2017-11-26 01:34:20 +01:00
|
|
|
|
// Substrings that suggest a URL points at a feed. `urlStringMightBeFeed` searches for each of these case-insensitively.
private let feedURLWordsToMatch = ["feed", "xml", "rss", "atom", "json"]
|
2017-05-22 22:27:54 +02:00
|
|
|
|
|
|
|
|
|
/// Collects candidate feeds from an HTML page.
///
/// Candidates come from two places: the feed links reported by
/// `RSHTMLMetadataParser` (tagged `.HTMLHead`) and the links reported by
/// `RSHTMLLinkParser` whose URL looks feed-like (tagged `.HTMLLink`).
/// Candidates that share a URL string are merged into one `FeedSpecifier`.
class HTMLFeedFinder {

    /// Every feed specifier found in the page, one per distinct URL string.
    var feedSpecifiers: Set<FeedSpecifier> {
        return Set(feedSpecifiersDictionary.values)
    }

    /// Feed specifiers keyed by URL string, so duplicates can be merged as they are found.
    private var feedSpecifiersDictionary = [String: FeedSpecifier]()

    /// Scans `parserData` and records every feed specifier it can find.
    ///
    /// - Parameter parserData: The page data handed to the HTML metadata and link parsers.
    init(parserData: ParserData) {

        let metadata = RSHTMLMetadataParser.htmlMetadata(with: parserData)

        for oneFeedLink in metadata.feedLinks {
            guard let oneURLString = oneFeedLink.urlString else {
                continue
            }
            // Apply the same normalization used for body links below. Both kinds of
            // link are keyed by URL string in the same dictionary, so a feed found in
            // both places only merges when the two keys are produced the same way.
            let normalizedURL = oneURLString.rs_normalizedURL()
            let oneFeedSpecifier = FeedSpecifier(title: oneFeedLink.title, urlString: normalizedURL, source: .HTMLHead)
            addFeedSpecifier(oneFeedSpecifier)
        }

        if let bodyLinks = RSHTMLLinkParser.htmlLinks(with: parserData) {
            for oneBodyLink in bodyLinks {
                // Bind the URL string explicitly instead of relying on an implicit unwrap.
                guard linkMightBeFeed(oneBodyLink), let oneURLString = oneBodyLink.urlString else {
                    continue
                }
                let normalizedURL = oneURLString.rs_normalizedURL()
                let oneFeedSpecifier = FeedSpecifier(title: oneBodyLink.text, urlString: normalizedURL, source: .HTMLLink)
                addFeedSpecifier(oneFeedSpecifier)
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
private extension HTMLFeedFinder {

    /// Records `feedSpecifier`, merging it with any specifier already stored for the same URL string.
    ///
    /// - Parameter feedSpecifier: The newly found candidate feed.
    func addFeedSpecifier(_ feedSpecifier: FeedSpecifier) {

        // If there’s an existing feed specifier, merge the two so that we have the best data.
        // If one has a title and one doesn’t, use that non-nil title. Use the better source.
        let key = feedSpecifier.urlString

        if let existingFeedSpecifier = feedSpecifiersDictionary[key] {
            feedSpecifiersDictionary[key] = existingFeedSpecifier.feedSpecifierByMerging(feedSpecifier)
        } else {
            feedSpecifiersDictionary[key] = feedSpecifier
        }
    }

    /// Reports whether `urlString` contains any of the words in `feedURLWordsToMatch`.
    ///
    /// Matching is case-insensitive. Occurrences of "buzzfeed" are ignored so that
    /// the name alone does not count as a hit for "feed".
    ///
    /// - Parameter urlString: The URL string to inspect.
    /// - Returns: `true` when at least one feed-like word is present.
    func urlStringMightBeFeed(_ urlString: String) -> Bool {

        // Strip "buzzfeed" with the same case-insensitivity as the search below;
        // otherwise a spelling such as "BuzzFeed" would still match "feed".
        let massagedURLString = urlString.replacingOccurrences(of: "buzzfeed", with: "_", options: .caseInsensitive)

        return feedURLWordsToMatch.contains { oneMatch in
            massagedURLString.range(of: oneMatch, options: .caseInsensitive) != nil
        }
    }

    /// Reports whether `link` has a URL string that looks like it could point to a feed.
    ///
    /// - Parameter link: A link found by the HTML link parser.
    /// - Returns: `false` when the link has no URL string; otherwise the result of `urlStringMightBeFeed`.
    func linkMightBeFeed(_ link: RSHTMLLink) -> Bool {

        guard let linkURLString = link.urlString else {
            return false
        }
        return urlStringMightBeFeed(linkURLString)
    }
}
|