diff --git a/lib/parser-utils.js b/lib/parser-utils.js
new file mode 100644
index 0000000..27a7f34
--- /dev/null
+++ b/lib/parser-utils.js
@@ -0,0 +1,43 @@
+const dayjs = require('dayjs')
+
+/**
+ * Converts dayjs dates into the `start` / `duration` values handed to the
+ * `ics` library.
+ *
+ * NOTE: Specific formatting for `ics` library
+ *
+ * @param {?Object} startDate dayjs instance; when falsy today's
+ *   `[year, month, day]` is used as `start`
+ * @param {?Object} endDate dayjs instance; when falsy the duration
+ *   defaults to 120 minutes
+ * @returns {{ start: number[], duration: { minutes: number } }}
+ */
+const parseDates = (startDate, endDate) => {
+  const start = startDate ? [
+    startDate.year(),
+    startDate.month() + 1,
+    startDate.date(),
+    startDate.hour(),
+    startDate.minute(),
+  ] : (() => {
+    const now = dayjs()
+
+    return [
+      now.year(),
+      now.month() + 1,
+      now.date()
+    ]
+  })()
+  const diffInMinutes = endDate ?
+    endDate.diff(startDate, 'minutes') :
+    120
+
+  const duration = { minutes: diffInMinutes }
+
+  return {
+    start,
+    duration,
+  }
+}
+
+module.exports = { parseDates }
diff --git a/lib/services/dom-parser.js b/lib/services/dom-parser.js
new file mode 100644
index 0000000..b30d84a
--- /dev/null
+++ b/lib/services/dom-parser.js
@@ -0,0 +1,107 @@
+const cheerio = require('cheerio')
+const dayjs = require('dayjs')
+const { parseDates } = require('../parser-utils')
+
+/**
+ * Parses the event time text into `start` / `duration` values for the
+ * `ics` library.
+ *
+ * @param {string} timeText e.g. `DATE at TIME`; the time part is optional
+ * @returns {{ start: ?number[], duration: ?Object }} both values are
+ *   `null` when the text is empty or is not a date dayjs can parse
+ */
+const parseDate = (timeText) => {
+  // NOTE: Split on the standalone word only, a bare 'at' would also
+  // match inside words such as "Saturday"
+  const [ datePart, timePart ] = (timeText || '').trim().split(/\s+at\s+/i)
+
+  if (!datePart) {
+    return {
+      start: null,
+      duration: null,
+    }
+  }
+
+  const normalizedTimeString = [ datePart, timePart ]
+    .filter(i => i)
+    .join(' ')
+  const startTime = dayjs(normalizedTimeString)
+
+  // NOTE: An invalid date would otherwise end up as NaN values in `start`
+  if (!startTime.isValid()) {
+    return {
+      start: null,
+      duration: null,
+    }
+  }
+
+  return parseDates(startTime, null)
+}
+
+/**
+ * Builds a single line location string out of the street and area texts.
+ *
+ * @param {string} streetText
+ * @param {string} areaText
+ * @returns {string} parts joined with `, `; '' when both are blank
+ */
+const createLocationData = (streetText, areaText) => {
+  // NOTE: Line breaks and indentation from the markup are collapsed to
+  // single spaces, blank parts are dropped
+  return [ streetText, areaText ]
+    .map(text => (text || '').replace(/\s+/g, ' ').trim())
+    .filter(i => i)
+    .join(', ')
+}
+
+/**
+ * NOTE: Fallback parser
+ * Attempt reading event data directly from DOM
+ *
+ * @param {string} html page markup
+ * @param {string} url event URL, copied into the result as is
+ * @returns {?Object} `{ location, start, duration, title, url }`, or
+ *   `null` when no title or no start date could be read
+ */
+const parseUsingDOM = (html, url) => {
+  console.info('Fallback parser used')
+  const $ = cheerio.load(html)
+  const title = $('title').text()
+
+  // NOTE: A cheerio selection is always truthy, so missing nodes are
+  // detected on the indexed element instead
+  const $eventNode = $('#event_summary').children()[1] || null
+  const eventChildNodes = ($eventNode && $eventNode.childNodes) || []
+
+  const $timeNode = eventChildNodes[0] || null
+  const $locationNode = eventChildNodes[1] || null
+
+  // NOTE: Only element nodes carry `attribs`, and `title` may be absent
+  const timeAttribs = ($timeNode && $timeNode.attribs) || {}
+  const timeText = timeAttribs.title || ''
+
+  const $locationBlock = $locationNode ? $($locationNode).find('td') : null
+  const $locationBlockTDs = $locationBlock ? $locationBlock.children() : []
+  const $streetBlock = $locationBlockTDs[1] || null
+  const $areaBlock = $locationBlockTDs[2] || null
+
+  const streetText = $streetBlock ? $($streetBlock).text() : ''
+  const areaText = $areaBlock ? $($areaBlock).text() : ''
+
+  const location = createLocationData(streetText, areaText)
+  const { start, duration } = parseDate(timeText)
+
+  if (!title || !start) {
+    return null
+  }
+
+  return {
+    location,
+    start,
+    duration,
+    title,
+    url,
+  }
+}
+
+module.exports = parseUsingDOM
diff --git a/lib/services/ics-retriever.js b/lib/services/ics-retriever.js
index 97daab2..ddabd34 100644
--- a/lib/services/ics-retriever.js
+++ b/lib/services/ics-retriever.js
@@ -1,13 +1,21 @@
 const crawl = require('./crawler')
-const parseHTML = require('./parser')
+const parseUsingLDJSONData = require('./ldjson-parser')
+const parseUsingDOM = require('./dom-parser')
 const generateICS = require('./ics-generator')
-const { getNormalizedUrl } = require('../utils')
+const { createParserError, getNormalizedUrl } = require('../utils')
 
 const retrieveICS = async (URLparameter) => {
   try {
     const url = getNormalizedUrl(URLparameter)
     const html = await crawl(url)
-    const eventData = parseHTML(html)
+    // NOTE: Prefer LD+JSON data, read the DOM only as a fallback
+    const LDJSONEventData = parseUsingLDJSONData(html)
+    const eventData = LDJSONEventData || parseUsingDOM(html, url)
+
+    if (!eventData) {
+      throw createParserError()
+    }
+
     const icsFile = await generateICS(eventData)
     return icsFile
   } catch (err) {
diff --git a/lib/services/parser.js b/lib/services/ldjson-parser.js
similarity index 69%
rename from lib/services/parser.js
rename to lib/services/ldjson-parser.js
index 70eed7d..a948492 100644
--- a/lib/services/parser.js
+++ b/lib/services/ldjson-parser.js
@@ -1,35 +1,6 @@
-const dayjs = require('dayjs')
 const cheerio = require('cheerio')
-const { createParserError } = require('../utils')
-
-// NOTE: Specific formatting for `ics` library
-const parseDates = (startDate, endDate) => {
-  const start = startDate ? [
-    startDate.year(),
-    startDate.month() + 1,
-    startDate.date(),
-    startDate.hour(),
-    startDate.minute(),
-  ] : (() => {
-    const now = dayjs()
-
-    return [
-      now.year(),
-      now.month() + 1,
-      now.date()
-    ]
-  })()
-  const diffInMinutes = endDate ?
-    endDate.diff(startDate, 'minutes') :
-    120
-
-  const duration = { minutes: diffInMinutes }
-
-  return {
-    start,
-    duration,
-  }
-}
+const dayjs = require('dayjs')
+const { parseDates } = require('../parser-utils')
 
 const parseEventData = (eventData) => {
   const startDate = eventData.startDate && dayjs(eventData.startDate)
@@ -60,7 +31,7 @@ const parseEventData = (eventData) => {
   }
 }
 
-const parseHTML = (html) => {
+const parseUsingLDJSONData = (html) => {
   try {
     // NOTE: Mobile web should have serialized
     // event info in one of the script tags
@@ -81,7 +52,7 @@ const parseHTML = (html) => {
     }, null)
 
     if (!rawData) {
-      throw createParserError()
+      return null
     }
 
     const eventData = JSON.parse(rawData.slice(12, -5))
@@ -93,4 +64,4 @@ const parseHTML = (html) => {
   }
 }
 
-module.exports = parseHTML
+module.exports = parseUsingLDJSONData
diff --git a/lib/utils.js b/lib/utils.js
index 7ceeed3..1bf69e4 100644
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -1,3 +1,5 @@
+const dayjs = require('dayjs')
+
 const checkValidURL = (url) => {
   return checkURLFormat(url) || checkNumberURLParameter(url)