diff --git a/package-lock.json b/package-lock.json index 04b122d5f..93c8ed8ee 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,6 +27,7 @@ "form-data": "^4.0.0", "google-translate-api-browser": "^3.0.1", "gpt3-tokenizer": "^1.1.5", + "he": "^1.2.0", "helmet": "^7.1.0", "ip-matching": "^2.1.2", "ipaddr.js": "^2.0.1", @@ -2800,6 +2801,14 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "bin": { + "he": "bin/he" + } + }, "node_modules/helmet": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/helmet/-/helmet-7.1.0.tgz", diff --git a/package.json b/package.json index 8f84c95b9..c40d267e4 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ "form-data": "^4.0.0", "google-translate-api-browser": "^3.0.1", "gpt3-tokenizer": "^1.1.5", + "he": "^1.2.0", "helmet": "^7.1.0", "ip-matching": "^2.1.2", "ipaddr.js": "^2.0.1", diff --git a/public/scripts/extensions/attachments/youtube-scrape.html b/public/scripts/extensions/attachments/youtube-scrape.html new file mode 100644 index 000000000..298b08159 --- /dev/null +++ b/public/scripts/extensions/attachments/youtube-scrape.html @@ -0,0 +1,20 @@ +
+ + Enter a video URL or ID to download its transcript. + +
+ Examples: +
+ + + + +
diff --git a/public/scripts/scrapers.js b/public/scripts/scrapers.js index 5ac3c5dc5..7d6457ed2 100644 --- a/public/scripts/scrapers.js +++ b/public/scripts/scrapers.js @@ -93,8 +93,8 @@ class WebScraper { * Check if the scraper is available. * @returns {Promise} */ - isAvailable() { - return Promise.resolve(true); + async isAvailable() { + return true; } /** @@ -167,8 +167,8 @@ class FileScraper { * Check if the scraper is available. * @returns {Promise} */ - isAvailable() { - return Promise.resolve(true); + async isAvailable() { + return true; } /** @@ -199,6 +199,10 @@ class FandomScraper { this.iconClass = 'fa-solid fa-fire'; } + /** + * Check if the scraper is available. + * @returns {Promise} + */ async isAvailable() { try { const result = await fetch('/api/plugins/fandom/probe', { @@ -289,6 +293,77 @@ class FandomScraper { } } +/** + * Scrape transcript from a YouTube video. + * @implements {Scraper} + */ +class YouTubeScraper { + constructor() { + this.id = 'youtube'; + this.name = 'YouTube'; + this.description = 'Download a transcript from a YouTube video.'; + this.iconClass = 'fa-solid fa-closed-captioning'; + } + + /** + * Check if the scraper is available. + * @returns {Promise} + */ + async isAvailable() { + return true; + } + + /** + * Parse the ID of a YouTube video from a URL. + * @param {string} url URL of the YouTube video + * @returns {string} ID of the YouTube video + */ + parseId(url){ + const regex = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|shorts\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/; + const match = url.match(regex); + return (match?.length && match[1] ? match[1] : url); + } + + /** + * Scrape transcript from a YouTube video. + * @returns {Promise} File attachments scraped from the YouTube video + */ + async scrape() { + let lang = ''; + const template = $(await renderExtensionTemplateAsync('attachments', 'youtube-scrape', {})); + const videoUrl = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 2 }); + + template.find('input[name="youtubeLanguageCode"]').on('input', function () { + lang = String($(this).val()).trim(); + }); + + if (!videoUrl) { + return; + } + + const id = this.parseId(String(videoUrl).trim()); + const toast = toastr.info('Working, please wait...'); + + const result = await fetch('/api/serpapi/transcript', { + method: 'POST', + headers: getRequestHeaders(), + body: JSON.stringify({ id, lang }), + }); + + if (!result.ok) { + const error = await result.text(); + throw new Error(error); + } + + const transcript = await result.text(); + toastr.clear(toast); + + const file = new File([transcript], `YouTube - ${id} - ${Date.now()}.txt`, { type: 'text/plain' }); + return [file]; + } +} + ScraperManager.registerDataBankScraper(new FileScraper()); ScraperManager.registerDataBankScraper(new WebScraper()); ScraperManager.registerDataBankScraper(new FandomScraper()); +ScraperManager.registerDataBankScraper(new YouTubeScraper()); diff --git a/src/endpoints/serpapi.js b/src/endpoints/serpapi.js index faae11750..15d7d0e3c 100644 --- a/src/endpoints/serpapi.js +++ b/src/endpoints/serpapi.js @@ -48,6 +48,92 @@ router.post('/search', jsonParser, async (request, response) => { } }); +/** + * Get the transcript of a YouTube video + * @copyright https://github.com/Kakulukian/youtube-transcript (MIT License) + */ +router.post('/transcript', jsonParser, async (request, response) => { + try { + const he = require('he'); + const RE_XML_TRANSCRIPT = /([^<]*)<\/text>/g; + const id = request.body.id; + const lang = request.body.lang; + + if (!id) { + console.log('Id is required for /transcript'); + return response.sendStatus(400); + } + + const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${id}`, { + headers: { + ...(lang && { 'Accept-Language': lang }), + 'User-Agent': visitHeaders['User-Agent'], + }, + }); + + const videoPageBody = await videoPageResponse.text(); + const splittedHTML = videoPageBody.split('"captions":'); + + if (splittedHTML.length <= 1) { + if (videoPageBody.includes('class="g-recaptcha"')) { + throw new Error('Too many requests'); + } + if (!videoPageBody.includes('"playabilityStatus":')) { + throw new Error('Video is not available'); + } + throw new Error('Transcript not available'); + } + + const captions = (() => { + try { + return JSON.parse(splittedHTML[1].split(',"videoDetails')[0].replace('\n', '')); + } catch (e) { + return undefined; + } + })()?.['playerCaptionsTracklistRenderer']; + + if (!captions) { + throw new Error('Transcript disabled'); + } + + if (!('captionTracks' in captions)) { + throw new Error('Transcript not available'); + } + + if (lang && !captions.captionTracks.some(track => track.languageCode === lang)) { + throw new Error('Transcript not available in this language'); + } + + const transcriptURL = (lang ? captions.captionTracks.find(track => track.languageCode === lang) : captions.captionTracks[0]).baseUrl; + const transcriptResponse = await fetch(transcriptURL, { + headers: { + ...(lang && { 'Accept-Language': lang }), + 'User-Agent': visitHeaders['User-Agent'], + }, + }); + + if (!transcriptResponse.ok) { + throw new Error('Transcript request failed'); + } + + const transcriptBody = await transcriptResponse.text(); + const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)]; + const transcript = results.map((result) => ({ + text: result[3], + duration: parseFloat(result[2]), + offset: parseFloat(result[1]), + lang: lang ?? captions.captionTracks[0].languageCode, + })); + // The text is double-encoded + const transcriptText = transcript.map((line) => he.decode(he.decode(line.text))).join(' '); + + return response.send(transcriptText); + } catch (error) { + console.log(error); + return response.sendStatus(500); + } +}); + router.post('/visit', jsonParser, async (request, response) => { try { const url = request.body.url;