From ae700330b92da0c3dff91bb1685896cf0fc8b898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Syn=C3=A1=C4=8Dek?= Date: Sat, 26 Dec 2020 21:03:03 +0100 Subject: [PATCH] fix: do not attempt to parse non-existent FB pages When a user enters something invalid (like the number 123), it would still hit the FB webpage and return HTML. I do simple detection by looking at the title when it attempts to parse the DOM directly. When it contains "Content Not Found", it is skipped. Non-existent web pages cannot be parsed by using the LDJSON parser and looking at null data. --- lib/services/dom-parser.js | 7 ++++++- test/services/dom-parser.spec.js | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/services/dom-parser.js b/lib/services/dom-parser.js index 666210f..808ac1f 100644 --- a/lib/services/dom-parser.js +++ b/lib/services/dom-parser.js @@ -2,6 +2,10 @@ const cheerio = require('cheerio') const dayjs = require('dayjs') const { parseDates } = require('../parser-utils') +const TITLE_BLACKLIST = [ + 'Content Not Found', +] + const parseDate = (timeText = '') => { const parts = timeText.split('at') const datePart = parts[0] || null @@ -48,7 +52,8 @@ const parseUsingDOM = (html, { logger }) => { } const $ = cheerio.load(html) - const title = $('title').text() + const titleText = $('title').text() + const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText const $eventSummary = $('#event_summary') const $eventNode = $eventSummary ? 
$eventSummary.children()[1] : null diff --git a/test/services/dom-parser.spec.js b/test/services/dom-parser.spec.js index d110c71..05c9f48 100644 --- a/test/services/dom-parser.spec.js +++ b/test/services/dom-parser.spec.js @@ -258,6 +258,20 @@ describe(parseUsingDOM, () => { }) + it('should return null if title was blacklisted', () => { + const html = ` + + + Content Not Found + + + ` + const eventData = parseUsingDOM(html, { logger }) + + expect(eventData).to.be.null + }) + + it('should NOT return start time without title', () => { const html = `