Allow returning page if transcript extraction failed

This commit is contained in:
Cohee 2024-10-04 01:12:12 +03:00
parent 6cb82fc21e
commit 777b2518bd

View File

@ -22,59 +22,15 @@ const visitHeaders = {
'Sec-Fetch-User': '?1',
};
/**
 * POST /serpapi — forwards a search query to the SerpApi JSON endpoint.
 *
 * Reads `query` from the request body and the API key via `readSecret`.
 * Responses:
 *  - 400 when no SerpApi key is found;
 *  - 500 with the upstream response text when SerpApi answers with a non-OK status;
 *  - 500 (status only) when anything throws;
 *  - otherwise the parsed SerpApi JSON payload.
 */
router.post('/serpapi', jsonParser, async (request, response) => {
    try {
        const apiKey = readSecret(request.user.directories, SECRET_KEYS.SERPAPI);

        // Without a key there is nothing to forward.
        if (!apiKey) {
            console.log('No SerpApi key found');
            return response.sendStatus(400);
        }

        const { query: searchQuery } = request.body;
        const searchUrl = `https://serpapi.com/search.json?q=${encodeURIComponent(searchQuery)}&api_key=${apiKey}`;
        const upstream = await fetch(searchUrl);

        // Logged only once the request has gone through without throwing.
        console.log('SerpApi query', searchQuery);

        if (upstream.ok) {
            const payload = await upstream.json();
            return response.json(payload);
        }

        // Relay the upstream error body to the caller as-is.
        const errorBody = await upstream.text();
        console.log('SerpApi request failed', upstream.statusText, errorBody);
        return response.status(500).send(errorBody);
    } catch (failure) {
        console.log(failure);
        return response.sendStatus(500);
    }
});
/**
* Get the transcript of a YouTube video
* @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
* Extract the transcript of a YouTube video
* @param {string} videoPageBody HTML of the video page
* @param {string} lang Language code
* @returns {Promise<string>} Transcript text
*/
router.post('/transcript', jsonParser, async (request, response) => {
try {
async function extractTranscript(videoPageBody, lang) {
const he = require('he');
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
const id = request.body.id;
const lang = request.body.lang;
const json = request.body.json;
if (!id) {
console.log('Id is required for /transcript');
return response.sendStatus(400);
}
const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${id}`, {
headers: {
...(lang && { 'Accept-Language': lang }),
'User-Agent': visitHeaders['User-Agent'],
},
});
const videoPageBody = await videoPageResponse.text();
const splittedHTML = videoPageBody.split('"captions":');
if (splittedHTML.length <= 1) {
@ -129,10 +85,72 @@ router.post('/transcript', jsonParser, async (request, response) => {
}));
// The text is double-encoded
const transcriptText = transcript.map((line) => he.decode(he.decode(line.text))).join(' ');
return transcriptText;
}
/**
 * POST /serpapi — forwards a search query to the SerpApi JSON endpoint.
 *
 * Reads `query` from the request body and the API key via `readSecret`.
 * Responses:
 *  - 400 when no SerpApi key is found;
 *  - 500 with the upstream response text when SerpApi answers with a non-OK status;
 *  - 500 (status only) when anything throws;
 *  - otherwise the parsed SerpApi JSON payload.
 */
router.post('/serpapi', jsonParser, async (request, response) => {
    try {
        const apiKey = readSecret(request.user.directories, SECRET_KEYS.SERPAPI);

        // Without a key there is nothing to forward.
        if (!apiKey) {
            console.log('No SerpApi key found');
            return response.sendStatus(400);
        }

        const { query: searchQuery } = request.body;
        const searchUrl = `https://serpapi.com/search.json?q=${encodeURIComponent(searchQuery)}&api_key=${apiKey}`;
        const upstream = await fetch(searchUrl);

        // Logged only once the request has gone through without throwing.
        console.log('SerpApi query', searchQuery);

        if (upstream.ok) {
            const payload = await upstream.json();
            return response.json(payload);
        }

        // Relay the upstream error body to the caller as-is.
        const errorBody = await upstream.text();
        console.log('SerpApi request failed', upstream.statusText, errorBody);
        return response.status(500).send(errorBody);
    } catch (failure) {
        console.log(failure);
        return response.sendStatus(500);
    }
});
/**
* Get the transcript of a YouTube video
* @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
*/
router.post('/transcript', jsonParser, async (request, response) => {
try {
const id = request.body.id;
const lang = request.body.lang;
const json = request.body.json;
if (!id) {
console.log('Id is required for /transcript');
return response.sendStatus(400);
}
const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${id}`, {
headers: {
...(lang && { 'Accept-Language': lang }),
'User-Agent': visitHeaders['User-Agent'],
},
});
const videoPageBody = await videoPageResponse.text();
try {
const transcriptText = await extractTranscript(videoPageBody, lang);
return json
? response.json({ transcript: transcriptText, html: videoPageBody })
: response.send(transcriptText);
} catch (error) {
if (json) {
return response.json({ html: videoPageBody, transcript: '' });
}
throw error;
}
} catch (error) {
console.log(error);
return response.sendStatus(500);