fix: do not attempt to parse non-existent FB pages
When a user enters something invalid (like the number 123), it would still hit the FB webpage and return HTML. I do simple detection by looking at the title when it attempts to parse the DOM directly. When it contains "Content Not Found", it is skipped. Non-existent web pages cannot be parsed by using the LDJSON parser and looking at null data.
This commit is contained in:
parent
0683ef9666
commit
ae700330b9
|
@ -2,6 +2,10 @@ const cheerio = require('cheerio')
|
||||||
const dayjs = require('dayjs')
|
const dayjs = require('dayjs')
|
||||||
const { parseDates } = require('../parser-utils')
|
const { parseDates } = require('../parser-utils')
|
||||||
|
|
||||||
|
const TITLE_BLACKLIST = [
|
||||||
|
'Content Not Found',
|
||||||
|
]
|
||||||
|
|
||||||
const parseDate = (timeText = '') => {
|
const parseDate = (timeText = '') => {
|
||||||
const parts = timeText.split('at')
|
const parts = timeText.split('at')
|
||||||
const datePart = parts[0] || null
|
const datePart = parts[0] || null
|
||||||
|
@ -48,7 +52,8 @@ const parseUsingDOM = (html, { logger }) => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const $ = cheerio.load(html)
|
const $ = cheerio.load(html)
|
||||||
const title = $('title').text()
|
const titleText = $('title').text()
|
||||||
|
const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText
|
||||||
|
|
||||||
const $eventSummary = $('#event_summary')
|
const $eventSummary = $('#event_summary')
|
||||||
const $eventNode = $eventSummary ? $eventSummary.children()[1] : null
|
const $eventNode = $eventSummary ? $eventSummary.children()[1] : null
|
||||||
|
|
|
@ -258,6 +258,20 @@ describe(parseUsingDOM, () => {
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
it('should return null if title was blacklisted', () => {
|
||||||
|
const html = `
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Content Not Found</title>
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
`
|
||||||
|
const eventData = parseUsingDOM(html, { logger })
|
||||||
|
|
||||||
|
expect(eventData).to.be.null
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
it('should NOT return start time without title', () => {
|
it('should NOT return start time without title', () => {
|
||||||
const html = `
|
const html = `
|
||||||
<html>
|
<html>
|
||||||
|
|
Loading…
Reference in New Issue