Allow returning page if transcript extraction failed
This commit is contained in:
parent
6cb82fc21e
commit
777b2518bd
|
@ -22,6 +22,72 @@ const visitHeaders = {
|
||||||
'Sec-Fetch-User': '?1',
|
'Sec-Fetch-User': '?1',
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Extract the transcript of a YouTube video.
 *
 * Finds the JSON that follows `"captions":` in the video page HTML, selects a
 * caption track, downloads its XML and joins the decoded text segments.
 *
 * @param {string} videoPageBody HTML of the video page
 * @param {string} [lang] Language code; when falsy, the first listed track is used
 * @returns {Promise<string>} Transcript text (segments joined with single spaces)
 * @throws {Error} 'Too many requests' when the page contains a reCAPTCHA
 * @throws {Error} 'Video is not available' when the page has no playability status
 * @throws {Error} 'Transcript not available' when no caption track can be found
 * @throws {Error} 'Transcript disabled' when the captions JSON is missing or unparsable
 * @throws {Error} 'Transcript not available in this language' when no track matches `lang`
 * @throws {Error} 'Transcript request failed' when the caption download is not OK
 */
async function extractTranscript(videoPageBody, lang) {
    const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
    const splittedHTML = videoPageBody.split('"captions":');

    if (splittedHTML.length <= 1) {
        if (videoPageBody.includes('class="g-recaptcha"')) {
            throw new Error('Too many requests');
        }
        if (!videoPageBody.includes('"playabilityStatus":')) {
            throw new Error('Video is not available');
        }
        throw new Error('Transcript not available');
    }

    let captions;
    try {
        // Strip EVERY newline before parsing: a string pattern passed to
        // replace() only removes the first occurrence.
        const captionsJson = splittedHTML[1].split(',"videoDetails')[0].replace(/\n/g, '');
        captions = JSON.parse(captionsJson)?.['playerCaptionsTracklistRenderer'];
    } catch {
        // Unparsable captions are reported as "disabled" below.
        captions = undefined;
    }

    if (!captions) {
        throw new Error('Transcript disabled');
    }

    const captionTracks = captions.captionTracks;
    if (!Array.isArray(captionTracks)) {
        throw new Error('Transcript not available');
    }

    let track;
    if (lang) {
        track = captionTracks.find((candidate) => candidate.languageCode === lang);
        if (!track) {
            throw new Error('Transcript not available in this language');
        }
    } else {
        track = captionTracks[0];
        if (!track) {
            // Empty track list: nothing to download.
            throw new Error('Transcript not available');
        }
    }

    const transcriptResponse = await fetch(track.baseUrl, {
        headers: {
            ...(lang && { 'Accept-Language': lang }),
            'User-Agent': visitHeaders['User-Agent'],
        },
    });

    if (!transcriptResponse.ok) {
        throw new Error('Transcript request failed');
    }

    // Loaded only once there is something to decode.
    const he = require('he');
    const transcriptBody = await transcriptResponse.text();

    // The text is double-encoded, hence the two decode passes.
    return [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)]
        .map((match) => he.decode(he.decode(match[3])))
        .join(' ');
}
|
||||||
|
|
||||||
router.post('/serpapi', jsonParser, async (request, response) => {
|
router.post('/serpapi', jsonParser, async (request, response) => {
|
||||||
try {
|
try {
|
||||||
const key = readSecret(request.user.directories, SECRET_KEYS.SERPAPI);
|
const key = readSecret(request.user.directories, SECRET_KEYS.SERPAPI);
|
||||||
|
@ -56,8 +122,6 @@ router.post('/serpapi', jsonParser, async (request, response) => {
|
||||||
*/
|
*/
|
||||||
router.post('/transcript', jsonParser, async (request, response) => {
|
router.post('/transcript', jsonParser, async (request, response) => {
|
||||||
try {
|
try {
|
||||||
const he = require('he');
|
|
||||||
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
|
|
||||||
const id = request.body.id;
|
const id = request.body.id;
|
||||||
const lang = request.body.lang;
|
const lang = request.body.lang;
|
||||||
const json = request.body.json;
|
const json = request.body.json;
|
||||||
|
@ -75,64 +139,18 @@ router.post('/transcript', jsonParser, async (request, response) => {
|
||||||
});
|
});
|
||||||
|
|
||||||
const videoPageBody = await videoPageResponse.text();
|
const videoPageBody = await videoPageResponse.text();
|
||||||
const splittedHTML = videoPageBody.split('"captions":');
|
|
||||||
|
|
||||||
if (splittedHTML.length <= 1) {
|
try {
|
||||||
if (videoPageBody.includes('class="g-recaptcha"')) {
|
const transcriptText = await extractTranscript(videoPageBody, lang);
|
||||||
throw new Error('Too many requests');
|
return json
|
||||||
|
? response.json({ transcript: transcriptText, html: videoPageBody })
|
||||||
|
: response.send(transcriptText);
|
||||||
|
} catch (error) {
|
||||||
|
if (json) {
|
||||||
|
return response.json({ html: videoPageBody, transcript: '' });
|
||||||
}
|
}
|
||||||
if (!videoPageBody.includes('"playabilityStatus":')) {
|
throw error;
|
||||||
throw new Error('Video is not available');
|
|
||||||
}
|
|
||||||
throw new Error('Transcript not available');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const captions = (() => {
|
|
||||||
try {
|
|
||||||
return JSON.parse(splittedHTML[1].split(',"videoDetails')[0].replace('\n', ''));
|
|
||||||
} catch (e) {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
})()?.['playerCaptionsTracklistRenderer'];
|
|
||||||
|
|
||||||
if (!captions) {
|
|
||||||
throw new Error('Transcript disabled');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!('captionTracks' in captions)) {
|
|
||||||
throw new Error('Transcript not available');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lang && !captions.captionTracks.some(track => track.languageCode === lang)) {
|
|
||||||
throw new Error('Transcript not available in this language');
|
|
||||||
}
|
|
||||||
|
|
||||||
const transcriptURL = (lang ? captions.captionTracks.find(track => track.languageCode === lang) : captions.captionTracks[0]).baseUrl;
|
|
||||||
const transcriptResponse = await fetch(transcriptURL, {
|
|
||||||
headers: {
|
|
||||||
...(lang && { 'Accept-Language': lang }),
|
|
||||||
'User-Agent': visitHeaders['User-Agent'],
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!transcriptResponse.ok) {
|
|
||||||
throw new Error('Transcript request failed');
|
|
||||||
}
|
|
||||||
|
|
||||||
const transcriptBody = await transcriptResponse.text();
|
|
||||||
const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)];
|
|
||||||
const transcript = results.map((result) => ({
|
|
||||||
text: result[3],
|
|
||||||
duration: parseFloat(result[2]),
|
|
||||||
offset: parseFloat(result[1]),
|
|
||||||
lang: lang ?? captions.captionTracks[0].languageCode,
|
|
||||||
}));
|
|
||||||
// The text is double-encoded
|
|
||||||
const transcriptText = transcript.map((line) => he.decode(he.decode(line.text))).join(' ');
|
|
||||||
|
|
||||||
return json
|
|
||||||
? response.json({ transcript: transcriptText, html: videoPageBody })
|
|
||||||
: response.send(transcriptText);
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log(error);
|
console.log(error);
|
||||||
return response.sendStatus(500);
|
return response.sendStatus(500);
|
||||||
|
|
Loading…
Reference in New Issue