fix: do not attempt to parse non-existent FB pages

When a user enters something invalid (like the number 123), the request
would still hit the FB webpage and return HTML. I added a simple
detection that looks at the page title when the DOM is parsed directly:
when the title matches "Content Not Found", the page is skipped.

Non-existent pages cannot be caught in the LDJSON parser by looking at
null data, because that route returns null for a missing page and a
failed parse alike.
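
For context, a rough sketch of why the LDJSON route cannot flag a
missing page (hypothetical code; parseUsingLDJSON here is an assumed
shape, not necessarily this repo's real implementation). An FB error
page carries no ld+json payload at all, so this route returns the same
null for a non-existent page as for a page whose markup merely changed:

    const cheerio = require('cheerio')

    // Hypothetical sketch of an LDJSON-based parser.
    const parseUsingLDJSON = (html) => {
      const $ = cheerio.load(html)
      const raw = $('script[type="application/ld+json"]').html()

      if (!raw) {
        // null for a non-existent page AND for a changed layout,
        // so the two cases are indistinguishable here.
        return null
      }

      return JSON.parse(raw)
    }
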
Ondřej Synáček 2020-12-26 21:03:03 +01:00
parent 0683ef9666
commit ae700330b9
2 changed files with 20 additions and 1 deletion


@@ -2,6 +2,10 @@ const cheerio = require('cheerio')
 const dayjs = require('dayjs')
 const { parseDates } = require('../parser-utils')
 
+const TITLE_BLACKLIST = [
+  'Content Not Found',
+]
+
 const parseDate = (timeText = '') => {
   const parts = timeText.split('at')
   const datePart = parts[0] || null
@@ -48,7 +52,8 @@ const parseUsingDOM = (html, { logger }) => {
   }
 
   const $ = cheerio.load(html)
 
-  const title = $('title').text()
+  const titleText = $('title').text()
+  const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText
   const $eventSummary = $('#event_summary')
   const $eventNode = $eventSummary ? $eventSummary.children()[1] : null


@@ -258,6 +258,20 @@ describe(parseUsingDOM, () => {
   })
 
+  it('should return null if title was blacklisted', () => {
+    const html = `
+      <html>
+        <head>
+          <title>Content Not Found</title>
+        </head>
+      </html>
+    `
+
+    const eventData = parseUsingDOM(html, { logger })
+
+    expect(eventData).to.be.null
+  })
+
   it('should NOT return start time without title', () => {
     const html = `
       <html>