Add YouTube transcript loader for data bank

2025-06-05 21:59:27 +02:00 · 2024-04-20 19:58:29 +03:00
parent b3bbec83b6
commit db78346bef
5 changed files with 195 additions and 4 deletions
--- a/package-lock.json
+++ b/package-lock.json
@ -27,6 +27,7 @@
                "form-data": "^4.0.0",
                "google-translate-api-browser": "^3.0.1",
                "gpt3-tokenizer": "^1.1.5",
+                "he": "^1.2.0",
                "helmet": "^7.1.0",
                "ip-matching": "^2.1.2",
                "ipaddr.js": "^2.0.1",
@ -2800,6 +2801,14 @@
                "url": "https://github.com/sponsors/ljharb"
            }
        },
+        "node_modules/he": {
+            "version": "1.2.0",
+            "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz",
+            "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
+            "bin": {
+                "he": "bin/he"
+            }
+        },
        "node_modules/helmet": {
            "version": "7.1.0",
            "resolved": "https://registry.npmjs.org/helmet/-/helmet-7.1.0.tgz",
--- a/package.json
+++ b/package.json
@ -17,6 +17,7 @@
        "form-data": "^4.0.0",
        "google-translate-api-browser": "^3.0.1",
        "gpt3-tokenizer": "^1.1.5",
+        "he": "^1.2.0",
        "helmet": "^7.1.0",
        "ip-matching": "^2.1.2",
        "ipaddr.js": "^2.0.1",
--- a/public/scripts/extensions/attachments/youtube-scrape.html
+++ b/public/scripts/extensions/attachments/youtube-scrape.html
@ -0,0 +1,20 @@
+<div>
+    <!--
+        Body of the YouTube transcript scraper popup: usage hint, example inputs and the optional
+        caption language field (read by name: youtubeLanguageCode).
+        NOTE(review): the trailing label has no field of its own; it presumably captions the text
+        input that the host popup renders after this template. Confirm against the popup code.
+    -->
+    <strong data-i18n="Enter a video URL or ID to download its transcript.">
+        Enter a video URL or ID to download its transcript.
+    </strong>
+    <div data-i18n="Examples:" class="m-t-1">
+        Examples:
+    </div>
+    <ul class="justifyLeft">
+        <li>https://www.youtube.com/watch?v=jV1vkHv4zq8</li>
+        <li>https://youtu.be/nlLhw1mtCFA</li>
+        <li>TDpxx5UqrVU</li>
+    </ul>
+    <label>
+        Language code (optional 2-letter ISO code):
+    </label>
+    <input type="text" class="text_pole" name="youtubeLanguageCode" placeholder="e.g. en">
+    <label>
+        Video URL or ID:
+    </label>
+</div>
--- a/public/scripts/scrapers.js
+++ b/public/scripts/scrapers.js
@ -93,8 +93,8 @@ class WebScraper {
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
-    isAvailable() {
-        return Promise.resolve(true);
+    async isAvailable() {
+        return true;
    }

    /**
@ -167,8 +167,8 @@ class FileScraper {
     * Check if the scraper is available.
     * @returns {Promise<boolean>}
     */
-    isAvailable() {
-        return Promise.resolve(true);
+    async isAvailable() {
+        return true;
    }

    /**
@ -199,6 +199,10 @@ class FandomScraper {
        this.iconClass = 'fa-solid fa-fire';
    }

+    /**
+     * Check if the scraper is available.
+     * @returns {Promise<boolean>}
+     */
    async isAvailable() {
        try {
            const result = await fetch('/api/plugins/fandom/probe', {
@ -289,6 +293,77 @@ class FandomScraper {
    }
 }

+/**
+ * Scrape transcript from a YouTube video.
+ *
+ * Prompts for a video URL or ID plus an optional caption language, requests the transcript
+ * from the `/api/serpapi/transcript` endpoint and wraps the response text in a file attachment.
+ * @implements {Scraper}
+ */
+class YouTubeScraper {
+    constructor() {
+        this.id = 'youtube';
+        this.name = 'YouTube';
+        this.description = 'Download a transcript from a YouTube video.';
+        this.iconClass = 'fa-solid fa-closed-captioning';
+    }
+
+    /**
+     * Check if the scraper is available.
+     * @returns {Promise<boolean>} Always resolves to true
+     */
+    async isAvailable() {
+        return true;
+    }
+
+    /**
+     * Parse the ID of a YouTube video from a URL.
+     * @param {string} url URL of the YouTube video
+     * @returns {string} ID of the YouTube video, or the input unchanged when no ID pattern matches
+     * (so a bare video ID passes straight through)
+     */
+    parseId(url){
+        const regex = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|shorts\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/;
+        const match = url.match(regex);
+        // An empty capture group is treated like "no match": fall back to the raw input.
+        return match?.[1] || url;
+    }
+
+    /**
+     * Scrape transcript from a YouTube video.
+     * @returns {Promise<File[]>} File attachments scraped from the YouTube video.
+     * Resolves to undefined when the popup returns no value (e.g. it was cancelled).
+     * @throws {Error} When the transcript endpoint responds with a non-OK status
+     */
+    async scrape() {
+        let lang = '';
+        const template = $(await renderExtensionTemplateAsync('attachments', 'youtube-scrape', {}));
+
+        // The listener must be attached BEFORE the popup is awaited: the await only resolves once
+        // the popup is closed, so a listener registered afterwards would never see any typing.
+        template.find('input[name="youtubeLanguageCode"]').on('input', function () {
+            lang = String($(this).val()).trim();
+        });
+
+        const videoUrl = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 2 });
+
+        if (!videoUrl) {
+            return;
+        }
+
+        const id = this.parseId(String(videoUrl).trim());
+        const toast = toastr.info('Working, please wait...');
+
+        try {
+            const result = await fetch('/api/serpapi/transcript', {
+                method: 'POST',
+                headers: getRequestHeaders(),
+                body: JSON.stringify({ id, lang }),
+            });
+
+            if (!result.ok) {
+                const error = await result.text();
+                throw new Error(error);
+            }
+
+            const transcript = await result.text();
+            const file = new File([transcript], `YouTube - ${id} - ${Date.now()}.txt`, { type: 'text/plain' });
+            return [file];
+        } finally {
+            // Dismiss the progress toast on failure as well as on success.
+            toastr.clear(toast);
+        }
+    }
+}
+
 ScraperManager.registerDataBankScraper(new FileScraper());
 ScraperManager.registerDataBankScraper(new WebScraper());
 ScraperManager.registerDataBankScraper(new FandomScraper());
+// Register the YouTube transcript scraper with ScraperManager, after the existing data bank scrapers.
+ScraperManager.registerDataBankScraper(new YouTubeScraper());
--- a/src/endpoints/serpapi.js
+++ b/src/endpoints/serpapi.js
@ -48,6 +48,92 @@ router.post('/search', jsonParser, async (request, response) => {
    }
 });

+/**
+ * Get the transcript of a YouTube video.
+ *
+ * Reads `id` (required video ID) and `lang` (optional caption language code) from the JSON body.
+ * Responds with the transcript text on success, 400 when `id` is missing, and 500 when the
+ * transcript cannot be retrieved (the specific reason is only logged on the server).
+ * @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
+ */
+router.post('/transcript', jsonParser, async (request, response) => {
+    try {
+        const he = require('he');
+        const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
+        const id = request.body.id;
+        const lang = request.body.lang;
+
+        if (!id) {
+            console.log('Id is required for /transcript');
+            return response.sendStatus(400);
+        }
+
+        // The ID is client-supplied: encode it so it cannot add query parameters or a fragment to the URL.
+        const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${encodeURIComponent(id)}`, {
+            headers: {
+                ...(lang && { 'Accept-Language': lang }),
+                'User-Agent': visitHeaders['User-Agent'],
+            },
+        });
+
+        const videoPageBody = await videoPageResponse.text();
+        // The caption metadata is looked up in the page source right after the "captions" key.
+        const splittedHTML = videoPageBody.split('"captions":');
+
+        if (splittedHTML.length <= 1) {
+            if (videoPageBody.includes('class="g-recaptcha"')) {
+                throw new Error('Too many requests');
+            }
+            if (!videoPageBody.includes('"playabilityStatus":')) {
+                throw new Error('Video is not available');
+            }
+            throw new Error('Transcript not available');
+        }
+
+        const captions = (() => {
+            try {
+                // Keep only the part before `,"videoDetails` and strip every line break (not just the first) before parsing.
+                return JSON.parse(splittedHTML[1].split(',"videoDetails')[0].replace(/\n/g, ''));
+            } catch (e) {
+                // Unparseable caption data is reported as "Transcript disabled" below.
+                return undefined;
+            }
+        })()?.['playerCaptionsTracklistRenderer'];
+
+        if (!captions) {
+            throw new Error('Transcript disabled');
+        }
+
+        if (!('captionTracks' in captions)) {
+            throw new Error('Transcript not available');
+        }
+
+        // Use the track matching the requested language, or the first listed track when none was requested.
+        const track = lang
+            ? captions.captionTracks.find(candidate => candidate.languageCode === lang)
+            : captions.captionTracks[0];
+
+        if (!track) {
+            throw new Error(lang ? 'Transcript not available in this language' : 'Transcript not available');
+        }
+
+        const transcriptResponse = await fetch(track.baseUrl, {
+            headers: {
+                ...(lang && { 'Accept-Language': lang }),
+                'User-Agent': visitHeaders['User-Agent'],
+            },
+        });
+
+        if (!transcriptResponse.ok) {
+            throw new Error('Transcript request failed');
+        }
+
+        const transcriptBody = await transcriptResponse.text();
+        const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)];
+        const transcript = results.map((result) => ({
+            text: result[3],
+            duration: parseFloat(result[2]),
+            offset: parseFloat(result[1]),
+            // An omitted language may arrive as an empty string, which `??` would keep; record the language of the track actually used.
+            lang: track.languageCode,
+        }));
+        // The text is double-encoded
+        const transcriptText = transcript.map((line) => he.decode(he.decode(line.text))).join(' ');
+
+        return response.send(transcriptText);
+    } catch (error) {
+        console.log(error);
+        return response.sendStatus(500);
+    }
+});
+
 router.post('/visit', jsonParser, async (request, response) => {
    try {
        const url = request.body.url;