fix: do not attempt to parse non-existent FB pages

When a user enters something invalid (like the number 123), the request
would still hit the FB webpage and return HTML. I added a simple
detection that looks at the page title when the DOM is parsed directly:
when the title matches "Content Not Found", the page is skipped.

Non-existent pages cannot be caught in the LDJSON parser by looking at
null data, because that route returns null for a missing page and a
failed parse alike.
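
For context, a rough sketch of why the LDJSON route cannot flag a
missing page (hypothetical code; parseUsingLDJSON here is an assumed
shape, not necessarily this repo's real implementation). An FB error
page carries no ld+json payload at all, so this route returns the same
null for a non-existent page as for a page whose markup merely changed:

    const cheerio = require('cheerio')

    // Hypothetical sketch of an LDJSON-based parser.
    const parseUsingLDJSON = (html) => {
      const $ = cheerio.load(html)
      const raw = $('script[type="application/ld+json"]').html()

      if (!raw) {
        // null for a non-existent page AND for a changed layout,
        // so the two cases are indistinguishable here.
        return null
      }

      return JSON.parse(raw)
    }
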
Ondřej Synáček 2020-12-26 21:03:03 +01:00
parent 0683ef9666
commit ae700330b9
2 changed files with 20 additions and 1 deletion


@@ -2,6 +2,10 @@ const cheerio = require('cheerio')
 const dayjs = require('dayjs')
 const { parseDates } = require('../parser-utils')
 
+const TITLE_BLACKLIST = [
+  'Content Not Found',
+]
+
 const parseDate = (timeText = '') => {
   const parts = timeText.split('at')
   const datePart = parts[0] || null
@@ -48,7 +52,8 @@ const parseUsingDOM = (html, { logger }) => {
   }
 
   const $ = cheerio.load(html)
 
-  const title = $('title').text()
+  const titleText = $('title').text()
+  const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText
   const $eventSummary = $('#event_summary')
   const $eventNode = $eventSummary ? $eventSummary.children()[1] : null


@@ -258,6 +258,20 @@ describe(parseUsingDOM, () => {
   })
 
+  it('should return null if title was blacklisted', () => {
+    const html = `
+      <html>
+        <head>
+          <title>Content Not Found</title>
+        </head>
+      </html>
+    `
+
+    const eventData = parseUsingDOM(html, { logger })
+
+    expect(eventData).to.be.null
+  })
+
   it('should NOT return start time without title', () => {
     const html = `
       <html>