From ae700330b92da0c3dff91bb1685896cf0fc8b898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Syn=C3=A1=C4=8Dek?= Date: Sat, 26 Dec 2020 21:03:03 +0100 Subject: [PATCH] fix: do not attempt to parse non-existent FB pages When a user enters something invalid (like the number 123), it would still hit the FB webpage and return HTML. I do simple detection by looking at the title when it attempts to parse the DOM directly. When it contains "Content Not Found", it is skipped. Non-existent web pages cannot be parsed by using the LDJSON parser and looking at null data. --- lib/services/dom-parser.js | 7 ++++++- test/services/dom-parser.spec.js | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/services/dom-parser.js b/lib/services/dom-parser.js index 666210f..808ac1f 100644 --- a/lib/services/dom-parser.js +++ b/lib/services/dom-parser.js @@ -2,6 +2,10 @@ const cheerio = require('cheerio') const dayjs = require('dayjs') const { parseDates } = require('../parser-utils') +const TITLE_BLACKLIST = [ + 'Content Not Found', +] + const parseDate = (timeText = '') => { const parts = timeText.split('at') const datePart = parts[0] || null @@ -48,7 +52,8 @@ const parseUsingDOM = (html, { logger }) => { } const $ = cheerio.load(html) - const title = $('title').text() + const titleText = $('title').text() + const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText const $eventSummary = $('#event_summary') const $eventNode = $eventSummary ? 
$eventSummary.children()[1] : null diff --git a/test/services/dom-parser.spec.js b/test/services/dom-parser.spec.js index d110c71..05c9f48 100644 --- a/test/services/dom-parser.spec.js +++ b/test/services/dom-parser.spec.js @@ -258,6 +258,20 @@ describe(parseUsingDOM, () => { }) + it('should return null if title was blacklisted', () => { + const html = ` + + + Content Not Found + + + ` + const eventData = parseUsingDOM(html, { logger }) + + expect(eventData).to.be.null + }) + + it('should NOT return start time without title', () => { const html = `