fix: do not attempt to parse non-existent FB pages
When the user enters something invalid (like the number 123), it would still hit the FB webpage and return HTML. I do simple detection by looking at the title when it attempts to parse the DOM directly. When it contains "Content Not Found", the page is skipped. Non-existent web pages cannot be parsed by using the LDJSON parser and looking for null data.
This commit is contained in:
parent
0683ef9666
commit
ae700330b9
|
@ -2,6 +2,10 @@ const cheerio = require('cheerio')
|
|||
const dayjs = require('dayjs')
|
||||
const { parseDates } = require('../parser-utils')
|
||||
|
||||
// Page titles that indicate FB served an error/placeholder page
// (e.g. for a non-existent event) instead of real event markup;
// pages with these titles are not parsed.
const TITLE_BLACKLIST = ['Content Not Found']
|
||||
|
||||
const parseDate = (timeText = '') => {
|
||||
const parts = timeText.split('at')
|
||||
const datePart = parts[0] || null
|
||||
|
@ -48,7 +52,8 @@ const parseUsingDOM = (html, { logger }) => {
|
|||
}
|
||||
|
||||
const $ = cheerio.load(html)
|
||||
const title = $('title').text()
|
||||
const titleText = $('title').text()
|
||||
const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText
|
||||
|
||||
const $eventSummary = $('#event_summary')
|
||||
const $eventNode = $eventSummary ? $eventSummary.children()[1] : null
|
||||
|
|
|
@ -258,6 +258,20 @@ describe(parseUsingDOM, () => {
|
|||
})
|
||||
|
||||
|
||||
// Verifies the blacklist path: an FB error page titled
// "Content Not Found" must yield no event data at all.
it('should return null if title was blacklisted', () => {
  const html = `
    <html>
      <head>
        <title>Content Not Found</title>
      </head>
    </html>
  `

  const result = parseUsingDOM(html, { logger })

  expect(result).to.be.null
})
|
||||
|
||||
|
||||
it('should NOT return start time without title', () => {
|
||||
const html = `
|
||||
<html>
|
||||
|
|
Loading…
Reference in New Issue