diff --git a/package-lock.json b/package-lock.json
index 04b122d5f..93c8ed8ee 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -27,6 +27,7 @@
"form-data": "^4.0.0",
"google-translate-api-browser": "^3.0.1",
"gpt3-tokenizer": "^1.1.5",
+ "he": "^1.2.0",
"helmet": "^7.1.0",
"ip-matching": "^2.1.2",
"ipaddr.js": "^2.0.1",
@@ -2800,6 +2801,14 @@
"url": "https://github.com/sponsors/ljharb"
}
},
+ "node_modules/he": {
+ "version": "1.2.0",
+ "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz",
+ "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
+ "bin": {
+ "he": "bin/he"
+ }
+ },
"node_modules/helmet": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/helmet/-/helmet-7.1.0.tgz",
diff --git a/package.json b/package.json
index 8f84c95b9..c40d267e4 100644
--- a/package.json
+++ b/package.json
@@ -17,6 +17,7 @@
"form-data": "^4.0.0",
"google-translate-api-browser": "^3.0.1",
"gpt3-tokenizer": "^1.1.5",
+ "he": "^1.2.0",
"helmet": "^7.1.0",
"ip-matching": "^2.1.2",
"ipaddr.js": "^2.0.1",
diff --git a/public/scripts/extensions/attachments/youtube-scrape.html b/public/scripts/extensions/attachments/youtube-scrape.html
new file mode 100644
index 000000000..298b08159
--- /dev/null
+++ b/public/scripts/extensions/attachments/youtube-scrape.html
@@ -0,0 +1,20 @@
+
+
+ Enter a video URL or ID to download its transcript.
+
+
+ Examples:
+
+
+ - https://www.youtube.com/watch?v=jV1vkHv4zq8
+ - https://youtu.be/nlLhw1mtCFA
+ - TDpxx5UqrVU
+
+
+
+
+
diff --git a/public/scripts/scrapers.js b/public/scripts/scrapers.js
index 5ac3c5dc5..7d6457ed2 100644
--- a/public/scripts/scrapers.js
+++ b/public/scripts/scrapers.js
@@ -93,8 +93,8 @@ class WebScraper {
* Check if the scraper is available.
* @returns {Promise<boolean>}
*/
- isAvailable() {
- return Promise.resolve(true);
+ async isAvailable() {
+ return true; // an async function wraps this in a Promise, so callers still receive a Promise resolving to true
}
/**
@@ -167,8 +167,8 @@ class FileScraper {
* Check if the scraper is available.
* @returns {Promise<boolean>}
*/
- isAvailable() {
- return Promise.resolve(true);
+ async isAvailable() {
+ return true; // an async function wraps this in a Promise, so callers still receive a Promise resolving to true
}
/**
@@ -199,6 +199,10 @@ class FandomScraper {
this.iconClass = 'fa-solid fa-fire';
}
+ /**
+ * Check if the scraper is available.
+ * @returns {Promise<boolean>}
+ */
async isAvailable() {
try {
const result = await fetch('/api/plugins/fandom/probe', {
@@ -289,6 +293,77 @@ class FandomScraper {
}
}
+/**
+ * Scrape transcript from a YouTube video.
+ * @implements {Scraper}
+ */
+class YouTubeScraper {
+    constructor() {
+        this.id = 'youtube';
+        this.name = 'YouTube';
+        this.description = 'Download a transcript from a YouTube video.';
+        this.iconClass = 'fa-solid fa-closed-captioning';
+    }
+
+    /**
+     * Check if the scraper is available.
+     * @returns {Promise<boolean>} Always resolves to true
+     */
+    async isAvailable() {
+        return true;
+    }
+
+    /**
+     * Parse the ID of a YouTube video from a URL.
+     * @param {string} url URL of the YouTube video
+     * @returns {string} ID of the YouTube video, or the input unchanged when it does not match a known URL shape
+     */
+    parseId(url) {
+        const regex = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|shorts\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/;
+        const match = url.match(regex);
+        // No match or an empty capture means the input is treated as a bare video ID.
+        return match?.[1] || url;
+    }
+
+    /**
+     * Scrape transcript from a YouTube video.
+     * Prompts for a video URL or ID, then posts `{ id, lang }` to `/api/serpapi/transcript`.
+     * @returns {Promise<File[]>} File attachments scraped from the YouTube video (undefined if the popup is dismissed without input)
+     * @throws {Error} With the response body as the message when the transcript request is not OK
+     */
+    async scrape() {
+        let lang = '';
+        const template = $(await renderExtensionTemplateAsync('attachments', 'youtube-scrape', {}));
+
+        // Attach the listener BEFORE awaiting the popup: the await only resolves once the popup
+        // is closed, so a listener registered afterwards would never see the user's input.
+        template.find('input[name="youtubeLanguageCode"]').on('input', function () {
+            lang = String($(this).val()).trim();
+        });
+
+        const videoUrl = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 2 });
+
+        if (!videoUrl) {
+            return;
+        }
+
+        const id = this.parseId(String(videoUrl).trim());
+        const toast = toastr.info('Working, please wait...');
+
+        try {
+            const result = await fetch('/api/serpapi/transcript', {
+                method: 'POST',
+                headers: getRequestHeaders(),
+                body: JSON.stringify({ id, lang }),
+            });
+
+            if (!result.ok) {
+                const error = await result.text();
+                throw new Error(error);
+            }
+
+            const transcript = await result.text();
+            const file = new File([transcript], `YouTube - ${id} - ${Date.now()}.txt`, { type: 'text/plain' });
+            return [file];
+        } finally {
+            // Dismiss the progress toast on failure as well as on success.
+            toastr.clear(toast);
+        }
+    }
+}
+
+
ScraperManager.registerDataBankScraper(new FileScraper());
ScraperManager.registerDataBankScraper(new WebScraper());
ScraperManager.registerDataBankScraper(new FandomScraper());
+ScraperManager.registerDataBankScraper(new YouTubeScraper()); // registered last, after the file, web and Fandom scrapers
diff --git a/src/endpoints/serpapi.js b/src/endpoints/serpapi.js
index faae11750..15d7d0e3c 100644
--- a/src/endpoints/serpapi.js
+++ b/src/endpoints/serpapi.js
@@ -48,6 +48,92 @@ router.post('/search', jsonParser, async (request, response) => {
}
});
+/**
+ * Get the transcript of a YouTube video.
+ *
+ * Request body: `{ id, lang }`, where `id` is the video ID and `lang` is an optional caption language code.
+ * Responds with the caption text joined by single spaces, with 400 when `id` is missing,
+ * or with 500 on any failure (the reason is logged to the console).
+ * @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
+ */
+router.post('/transcript', jsonParser, async (request, response) => {
+    try {
+        const he = require('he');
+        // One caption cue per <text> element; capture groups are start offset, duration and cue text.
+        const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
+        const id = request.body.id;
+        const lang = request.body.lang;
+
+        if (!id) {
+            console.log('Id is required for /transcript');
+            return response.sendStatus(400);
+        }
+
+        // Same headers for the watch page and the caption track request.
+        const requestHeaders = {
+            ...(lang && { 'Accept-Language': lang }),
+            'User-Agent': visitHeaders['User-Agent'],
+        };
+
+        // Encode the ID so a crafted value cannot append extra query parameters to the request.
+        const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${encodeURIComponent(id)}`, {
+            headers: requestHeaders,
+        });
+
+        const videoPageBody = await videoPageResponse.text();
+        const splittedHTML = videoPageBody.split('"captions":');
+
+        if (splittedHTML.length <= 1) {
+            if (videoPageBody.includes('class="g-recaptcha"')) {
+                throw new Error('Too many requests');
+            }
+            if (!videoPageBody.includes('"playabilityStatus":')) {
+                throw new Error('Video is not available');
+            }
+            throw new Error('Transcript not available');
+        }
+
+        // The captions object is embedded in the page as JSON, immediately followed by "videoDetails".
+        const captions = (() => {
+            try {
+                return JSON.parse(splittedHTML[1].split(',"videoDetails')[0].replace('\n', ''));
+            } catch (e) {
+                return undefined;
+            }
+        })()?.['playerCaptionsTracklistRenderer'];
+
+        if (!captions) {
+            throw new Error('Transcript disabled');
+        }
+
+        if (!('captionTracks' in captions)) {
+            throw new Error('Transcript not available');
+        }
+
+        // Use the requested language when one is given, otherwise the first listed track.
+        const track = lang
+            ? captions.captionTracks.find(candidate => candidate.languageCode === lang)
+            : captions.captionTracks[0];
+
+        if (!track) {
+            throw new Error(lang ? 'Transcript not available in this language' : 'Transcript not available');
+        }
+
+        const transcriptResponse = await fetch(track.baseUrl, {
+            headers: requestHeaders,
+        });
+
+        if (!transcriptResponse.ok) {
+            throw new Error('Transcript request failed');
+        }
+
+        const transcriptBody = await transcriptResponse.text();
+        // The text is double-encoded
+        const transcriptText = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)]
+            .map((cue) => he.decode(he.decode(cue[3])))
+            .join(' ');
+
+        return response.send(transcriptText);
+    } catch (error) {
+        console.log(error);
+        return response.sendStatus(500);
+    }
+});
+
+
router.post('/visit', jsonParser, async (request, response) => {
try {
const url = request.body.url;