Add a fallback DOM parser, used when no LD+JSON event data is found

This commit is contained in:
Ondrej Synacek 2019-10-23 22:03:20 +02:00
parent a41992d53b
commit 889de553f9
5 changed files with 127 additions and 37 deletions

32
lib/parser-utils.js Normal file
View File

@ -0,0 +1,32 @@
const dayjs = require('dayjs')
// NOTE: Specific formatting for `ics` library
/**
 * Build the `start` and `duration` values from optional dayjs dates.
 *
 * @param {object|null|undefined} startDate - dayjs instance for the event
 *   start. When falsy, today's date is used as a date-only start.
 * @param {object|null|undefined} endDate - dayjs instance for the event end.
 * @returns {{start: number[], duration: {minutes: number}}}
 *   `start` is `[year, month, day, hour, minute]` (or `[year, month, day]`
 *   when no start date was given); `duration.minutes` is the measured length,
 *   or 120 when it cannot be measured.
 */
const parseDates = (startDate, endDate) => {
  // Used whenever a real duration cannot be measured
  const DEFAULT_DURATION_MINUTES = 120

  const start = startDate ? [
    startDate.year(),
    startDate.month() + 1, // dayjs months are zero-based
    startDate.date(),
    startDate.hour(),
    startDate.minute(),
  ] : (() => {
    const now = dayjs()
    return [
      now.year(),
      now.month() + 1,
      now.date(),
    ]
  })()

  // Only measure when both ends are known: diffing against a missing start
  // would silently compare the end date with the current time.
  const measuredMinutes = startDate && endDate ?
    endDate.diff(startDate, 'minutes') :
    NaN

  // Invalid dates produce NaN and an end before the start produces a
  // negative value; neither is a usable event duration.
  const isUsable = Number.isFinite(measuredMinutes) && measuredMinutes >= 0
  const duration = {
    minutes: isUsable ? measuredMinutes : DEFAULT_DURATION_MINUTES,
  }

  return {
    start,
    duration,
  }
}
module.exports = { parseDates }

View File

@ -0,0 +1,77 @@
const cheerio = require('cheerio')
const dayjs = require('dayjs')
const { parseDates } = require('../parser-utils')
/**
 * Parse a human-readable "<date> at <time>" string into the `start` and
 * `duration` values produced by `parseDates`.
 *
 * @param {string} timeText - text such as "Saturday, October 26, 2019 at 8:00 PM".
 *   The " at <time>" part is optional.
 * @returns {{start: number[]|null, duration: {minutes: number}|null}}
 *   Both fields are null when the text is missing, empty or not a parsable date.
 */
const parseDate = (timeText) => {
  const emptyResult = {
    start: null,
    duration: null,
  }

  // The text comes from an optional DOM attribute, so it may be absent
  if (typeof timeText !== 'string') {
    return emptyResult
  }

  // Split on the standalone word "at". Splitting on the bare substring would
  // also cut inside words such as "Saturday" and lose the time part.
  const parts = timeText.trim().split(/\s+at\s+/i)
  const datePart = parts[0] || null
  const timePart = parts[1] || null

  if (!datePart) {
    return emptyResult
  }

  const normalizedTimeString = timePart ? `${datePart} ${timePart}` : datePart
  const startTime = dayjs(normalizedTimeString)

  // An invalid date would otherwise become a truthy array of NaN values
  if (!startTime.isValid()) {
    return emptyResult
  }

  const { start, duration } = parseDates(startTime, null)

  return {
    start,
    duration,
  }
}
/**
 * Combine the street and area texts into a single-line location string.
 *
 * @param {string} streetText - street part; skipped when falsy.
 * @param {string} areaText - area part; skipped when falsy.
 * @returns {string} the present parts joined by ", " with every line break
 *   replaced by a space; an empty string when both parts are missing.
 */
const createLocationData = (streetText, areaText) => {
  const presentParts = []
  for (const part of [streetText, areaText]) {
    if (part) {
      presentParts.push(part)
    }
  }

  const joined = presentParts.join(', ')

  // Collapse the value onto one line
  return joined.replace(/\r?\n|\r/g, ' ')
}
// NOTE: Fallback parser
// Attempt reading event data directly from DOM
/**
 * Read event data straight from the page markup.
 *
 * Looks at the second child of `#event_summary`: its first child node is
 * expected to carry the time text in a `title` attribute, its second child
 * node is expected to hold the location cells.
 *
 * @param {string} html - page markup to inspect.
 * @param {string} url - passed through unchanged into the result.
 * @returns {{location: string, start: number[], duration: object, title: string, url: string}|null}
 *   the event data, or null when no title or no start could be read.
 */
const parseUsingDOM = (html, url) => {
  console.info('Fallback parser used')

  const $ = cheerio.load(html)
  const title = $('title').text()

  const eventNode = $('#event_summary').children()[1]
  const eventChildNodes = (eventNode && eventNode.childNodes) || []
  const timeNode = eventChildNodes[0]
  const locationNode = eventChildNodes[1]

  // Text and comment nodes carry no `attribs`, and the `title` attribute
  // itself may be absent; treat both as "no time found" instead of throwing.
  const timeText = (timeNode && timeNode.attribs && timeNode.attribs.title) || ''

  const locationCells = locationNode ?
    $(locationNode).find('td').children() :
    []
  const streetBlock = locationCells[1] || null
  const areaBlock = locationCells[2] || null
  const streetText = streetBlock ? $(streetBlock).text() : ''
  const areaText = areaBlock ? $(areaBlock).text() : ''

  const location = createLocationData(streetText, areaText)
  const { start, duration } = parseDate(timeText)

  // Without a title or a start there is nothing useful to export
  if (!title || !start) {
    return null
  }

  return {
    location,
    start,
    duration,
    title,
    url,
  }
}
module.exports = parseUsingDOM

View File

@ -1,13 +1,21 @@
const crawl = require('./crawler')
const parseHTML = require('./parser')
const parseUsingLDJSONData = require('./ldjson-parser')
const parseUsingDOM = require('./dom-parser')
const generateICS = require('./ics-generator')
const { getNormalizedUrl } = require('../utils')
const { createParserError, getNormalizedUrl } = require('../utils')
const retrieveICS = async (URLparameter) => {
try {
const url = getNormalizedUrl(URLparameter)
const html = await crawl(url)
const eventData = parseHTML(html)
const LDJSONEventData = parseUsingLDJSONData(html)
const eventData = LDJSONEventData || parseUsingDOM(html, url)
if (!eventData) {
throw createParserError()
return
}
const icsFile = await generateICS(eventData)
return icsFile
} catch (err) {

View File

@ -1,35 +1,6 @@
const dayjs = require('dayjs')
const cheerio = require('cheerio')
const { createParserError } = require('../utils')
// NOTE: Specific formatting for `ics` library
/**
 * Build the `start` array and `duration` object from optional dayjs dates.
 *
 * @param startDate - dayjs instance for the event start, or a falsy value.
 * @param endDate - dayjs instance for the event end, or a falsy value.
 * @returns an object with `start` (array of date parts) and
 *   `duration` (`{ minutes }`).
 */
const parseDates = (startDate, endDate) => {
// With a start date: [year, month, day, hour, minute].
// month() is shifted by one because dayjs counts months from zero.
const start = startDate ? [
startDate.year(),
startDate.month() + 1,
startDate.date(),
startDate.hour(),
startDate.minute(),
] : (() => {
// Without a start date: today's [year, month, day], with no time part
const now = dayjs()
return [
now.year(),
now.month() + 1,
now.date()
]
})()
// Length in minutes between start and end; 120 when no end date is given.
// NOTE(review): when endDate is set but startDate is not, diff() receives a
// falsy start — presumably dayjs then measures against the current time; verify.
const diffInMinutes = endDate ?
endDate.diff(startDate, 'minutes') :
120
const duration = { minutes: diffInMinutes }
return {
start,
duration,
}
}
const dayjs = require('dayjs')
const { parseDates } = require('../parser-utils')
const parseEventData = (eventData) => {
const startDate = eventData.startDate && dayjs(eventData.startDate)
@ -60,7 +31,7 @@ const parseEventData = (eventData) => {
}
}
const parseHTML = (html) => {
const parseUsingLDJSONData = (html) => {
try {
// NOTE: Mobile web should have serialized
// event info in one of the script tags
@ -81,7 +52,7 @@ const parseHTML = (html) => {
}, null)
if (!rawData) {
throw createParserError()
return null
}
const eventData = JSON.parse(rawData.slice(12, -5))
@ -93,4 +64,4 @@ const parseHTML = (html) => {
}
}
module.exports = parseHTML
module.exports = parseUsingLDJSONData

View File

@ -1,3 +1,5 @@
const dayjs = require('dayjs')
const checkValidURL = (url) => {
return checkURLFormat(url) ||
checkNumberURLParameter(url)