Allow returning page if transcript extraction failed

2024-12-12 09:26:33 +01:00 · 2024-10-04 01:12:12 +03:00 · 2024-10-04 01:12:12 +03:00 · 777b2518bd
commit 777b2518bd
parent 6cb82fc21e
1 changed file with 75 additions and 57 deletions
--- a/src/endpoints/search.js
+++ b/src/endpoints/search.js
@@ -22,59 +22,15 @@ const visitHeaders = {
    'Sec-Fetch-User': '?1',
 };

-router.post('/serpapi', jsonParser, async (request, response) => {
-    try {
-        const key = readSecret(request.user.directories, SECRET_KEYS.SERPAPI);
-
-        if (!key) {
-            console.log('No SerpApi key found');
-            return response.sendStatus(400);
-        }
-
-        const { query } = request.body;
-        const result = await fetch(`https://serpapi.com/search.json?q=${encodeURIComponent(query)}&api_key=${key}`);
-
-        console.log('SerpApi query', query);
-
-        if (!result.ok) {
-            const text = await result.text();
-            console.log('SerpApi request failed', result.statusText, text);
-            return response.status(500).send(text);
-        }
-
-        const data = await result.json();
-        return response.json(data);
-    } catch (error) {
-        console.log(error);
-        return response.sendStatus(500);
-    }
-});
-
 /**
- * Get the transcript of a YouTube video
- * @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
+ * Extract the transcript of a YouTube video
+ * @param {string} videoPageBody HTML of the video page
+ * @param {string} lang Language code
+ * @returns {Promise<string>} Transcript text
 */
-router.post('/transcript', jsonParser, async (request, response) => {
-    try {
+async function extractTranscript(videoPageBody, lang) {
    const he = require('he');
    const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
-        const id = request.body.id;
-        const lang = request.body.lang;
-        const json = request.body.json;
-
-        if (!id) {
-            console.log('Id is required for /transcript');
-            return response.sendStatus(400);
-        }
-
-        const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${id}`, {
-            headers: {
-                ...(lang && { 'Accept-Language': lang }),
-                'User-Agent': visitHeaders['User-Agent'],
-            },
-        });
-
-        const videoPageBody = await videoPageResponse.text();
    const splittedHTML = videoPageBody.split('"captions":');

    if (splittedHTML.length <= 1) {
@@ -129,10 +85,72 @@ router.post('/transcript', jsonParser, async (request, response) => {
    }));
    // The text is double-encoded
    const transcriptText = transcript.map((line) => he.decode(he.decode(line.text))).join(' ');
+    return transcriptText;
+}

+router.post('/serpapi', jsonParser, async (request, response) => {
+    try {
+        const key = readSecret(request.user.directories, SECRET_KEYS.SERPAPI);
+
+        if (!key) {
+            console.log('No SerpApi key found');
+            return response.sendStatus(400);
+        }
+
+        const { query } = request.body;
+        const result = await fetch(`https://serpapi.com/search.json?q=${encodeURIComponent(query)}&api_key=${key}`);
+
+        console.log('SerpApi query', query);
+
+        if (!result.ok) {
+            const text = await result.text();
+            console.log('SerpApi request failed', result.statusText, text);
+            return response.status(500).send(text);
+        }
+
+        const data = await result.json();
+        return response.json(data);
+    } catch (error) {
+        console.log(error);
+        return response.sendStatus(500);
+    }
+});
+
+/**
+ * Get the transcript of a YouTube video
+ * @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
+ */
+router.post('/transcript', jsonParser, async (request, response) => {
+    try {
+        const id = request.body.id;
+        const lang = request.body.lang;
+        const json = request.body.json;
+
+        if (!id) {
+            console.log('Id is required for /transcript');
+            return response.sendStatus(400);
+        }
+
+        const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${id}`, {
+            headers: {
+                ...(lang && { 'Accept-Language': lang }),
+                'User-Agent': visitHeaders['User-Agent'],
+            },
+        });
+
+        const videoPageBody = await videoPageResponse.text();
+
+        try {
+            const transcriptText = await extractTranscript(videoPageBody, lang);
            return json
                ? response.json({ transcript: transcriptText, html: videoPageBody })
                : response.send(transcriptText);
+        } catch (error) {
+            if (json) {
+                return response.json({ html: videoPageBody, transcript: '' });
+            }
+            throw error;
+        }
    } catch (error) {
        console.log(error);
        return response.sendStatus(500);