1
0
mirror of https://github.com/comatory/fb2iCal synced 2025-06-05 22:09:25 +02:00
Files
Facebook-Events-iCal-Converter/lib/services/dom-parser.js
Ondřej Synáček ae700330b9 fix: do not attempt to parse non-existent FB pages
When the user enters something invalid (like the number 123), it would still hit
the FB webpage and return HTML. I do simple detection by looking at the title
when it attempts to parse the DOM directly. When it contains "Content
Not Found", it is skipped.

Non-existent web pages cannot be parsed by using LDJSON parser and
looking at null data.
2020-12-26 21:03:03 +01:00

92 lines
2.5 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const cheerio = require('cheerio')
const dayjs = require('dayjs')
const { parseDates } = require('../parser-utils')
// Page titles that must not be accepted as an event title.
const TITLE_BLACKLIST = [ 'Content Not Found' ]
/**
 * Turns the human-readable time label of an event into a start value and a duration.
 *
 * The label is expected to look like "<date> at <start time> – <end time>", where the
 * end time is optional. Whatever cannot be parsed falls back to the current time.
 *
 * @param {string} [timeText=''] - Raw time label.
 * @returns {{ start: *, duration: Object }} `start` and `duration` as returned by
 *   `parseDates`, with `duration.minutes` replaced by 120 when it is falsy.
 */
const parseDate = (timeText = '') => {
  // Split on the standalone word "at"; a plain substring split would also cut
  // words such as "Saturday" in two.
  const [ datePart = '', timePart = '' ] = timeText.split(/\s+at\s+/)

  // The time range is separated by an en/em dash, or by a hyphen surrounded by
  // whitespace (so that offsets such as "UTC-05" are left intact).
  const [ startClock = '', endClock = '' ] = timePart.split(/\s*[\u2013\u2014]\s*|\s+-\s+/)

  // Glue the non-empty pieces together with exactly one space.
  const joinParts = (...parts) => parts
    .map((part) => part.trim())
    .filter(Boolean)
    .join(' ')

  const startText = joinParts(datePart, startClock)
  const endText = joinParts(datePart, endClock)

  // NOTE(review): these are non-ISO strings; how dayjs parses them presumably
  // depends on the runtime's Date implementation — confirm.
  const startTime = startText ? dayjs(startText) : dayjs(new Date())
  const endTime = dayjs(endText)

  // Unparseable values are replaced by "now" so parseDates always gets valid input.
  const normalizedStartTime = startTime.isValid() ? startTime : dayjs(new Date())
  const normalizedEndTime = endTime.isValid() ? endTime : dayjs(new Date())

  const { start, duration } = parseDates(normalizedStartTime, normalizedEndTime)

  // Fall back to 120 minutes when parseDates reports no minutes.
  const minimumDuration = { ...duration, minutes: duration.minutes || 120 }

  return {
    start,
    duration: minimumDuration,
  }
}
/**
 * Builds a single-line location string from the street and area texts.
 *
 * Falsy parts are skipped, the remaining ones are joined with ", " and every
 * line break is replaced by a space.
 *
 * @param {string} streetText - Street portion of the location.
 * @param {string} areaText - Area portion of the location.
 * @returns {string} The combined location, or an empty string when both parts are falsy.
 */
const createLocationData = (streetText, areaText) => {
  const pieces = []

  for (const piece of [ streetText, areaText ]) {
    if (piece) {
      pieces.push(piece)
    }
  }

  const joined = pieces.join(', ')

  return joined.replace(/\r?\n|\r/g, ' ')
}
// NOTE: Fallback parser
// Attempt reading event data directly from DOM
/**
 * Extracts event data from raw HTML by walking the `#event_summary` markup.
 *
 * @param {string} html - HTML handed to `cheerio.load`.
 * @param {Object} [options] - Optional settings; may be omitted entirely.
 * @param {Object} [options.logger] - When given, its `log` method receives one info entry.
 * @returns {?Object} `{ location, start, duration, title }`, or `null` when the
 *   title is blacklisted/empty or no start value is available.
 */
const parseUsingDOM = (html, { logger } = {}) => {
  if (logger) {
    logger.log({
      message: 'Using fallback DOM parser',
      level: 'info',
      service: 'parser',
    })
  }

  const $ = cheerio.load(html)
  const titleText = $('title').text()
  const title = TITLE_BLACKLIST.includes(titleText) ? null : titleText

  // The second child of #event_summary is read as the event node; its first
  // child node supplies the time and its second the location.
  const $eventNode = $('#event_summary').children()[1] || null
  const eventChildNodes = $eventNode && $eventNode.childNodes ? $eventNode.childNodes : []
  const $timeNode = eventChildNodes[0] || null
  const $locationNode = eventChildNodes[1] || null

  // A child node without `attribs` (e.g. a text node) must not crash the parser.
  const timeText = $timeNode && $timeNode.attribs ? $timeNode.attribs.title : ''

  const $locationBlockTDs = $locationNode ? $($locationNode).find('td').children() : []
  const $streetBlock = $locationBlockTDs[1] || null
  const $areaBlock = $locationBlockTDs[2] || null
  const streetText = $streetBlock ? $($streetBlock).text() : ''
  const areaText = $areaBlock ? $($areaBlock).text() : ''

  const location = createLocationData(streetText, areaText)
  const { start, duration } = parseDate(timeText)

  // Without a usable title or start there is nothing worth returning.
  if (!title || !start) {
    return null
  }

  return {
    location,
    start,
    duration,
    title,
  }
}
// The fallback DOM parser is the value exported by this module.
module.exports = parseUsingDOM