mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Add YT script loader for data bank
This commit is contained in:
9
package-lock.json
generated
9
package-lock.json
generated
@ -27,6 +27,7 @@
|
||||
"form-data": "^4.0.0",
|
||||
"google-translate-api-browser": "^3.0.1",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
"he": "^1.2.0",
|
||||
"helmet": "^7.1.0",
|
||||
"ip-matching": "^2.1.2",
|
||||
"ipaddr.js": "^2.0.1",
|
||||
@ -2800,6 +2801,14 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/he": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz",
|
||||
"integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
|
||||
"bin": {
|
||||
"he": "bin/he"
|
||||
}
|
||||
},
|
||||
"node_modules/helmet": {
|
||||
"version": "7.1.0",
|
||||
"resolved": "https://registry.npmjs.org/helmet/-/helmet-7.1.0.tgz",
|
||||
|
@ -17,6 +17,7 @@
|
||||
"form-data": "^4.0.0",
|
||||
"google-translate-api-browser": "^3.0.1",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
"he": "^1.2.0",
|
||||
"helmet": "^7.1.0",
|
||||
"ip-matching": "^2.1.2",
|
||||
"ipaddr.js": "^2.0.1",
|
||||
|
20
public/scripts/extensions/attachments/youtube-scrape.html
Normal file
20
public/scripts/extensions/attachments/youtube-scrape.html
Normal file
@ -0,0 +1,20 @@
|
||||
<div>
|
||||
<strong data-i18n="Enter a video URL to download its transcript.">
|
||||
Enter a video URL or ID to download its transcript.
|
||||
</strong>
|
||||
<div data-i18n="Examples:" class="m-t-1">
|
||||
Examples:
|
||||
</div>
|
||||
<ul class="justifyLeft">
|
||||
<li>https://www.youtube.com/watch?v=jV1vkHv4zq8</li>
|
||||
<li>https://youtu.be/nlLhw1mtCFA</li>
|
||||
<li>TDpxx5UqrVU</li>
|
||||
</ul>
|
||||
<label>
|
||||
Language code (optional 2-letter ISO code):
|
||||
</label>
|
||||
<input type="text" class="text_pole" name="youtubeLanguageCode" placeholder="e.g. en">
|
||||
<label>
|
||||
Video URL or ID:
|
||||
</label>
|
||||
</div>
|
@ -93,8 +93,8 @@ class WebScraper {
|
||||
* Check if the scraper is available.
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
isAvailable() {
|
||||
return Promise.resolve(true);
|
||||
async isAvailable() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -167,8 +167,8 @@ class FileScraper {
|
||||
* Check if the scraper is available.
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
isAvailable() {
|
||||
return Promise.resolve(true);
|
||||
async isAvailable() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -199,6 +199,10 @@ class FandomScraper {
|
||||
this.iconClass = 'fa-solid fa-fire';
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the scraper is available.
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async isAvailable() {
|
||||
try {
|
||||
const result = await fetch('/api/plugins/fandom/probe', {
|
||||
@ -289,6 +293,77 @@ class FandomScraper {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Scrape transcript from a YouTube video.
 * @implements {Scraper}
 */
class YouTubeScraper {
    constructor() {
        this.id = 'youtube';
        this.name = 'YouTube';
        this.description = 'Download a transcript from a YouTube video.';
        this.iconClass = 'fa-solid fa-closed-captioning';
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>} Always resolves to true
     */
    async isAvailable() {
        return true;
    }

    /**
     * Parse the ID of a YouTube video from a URL.
     * @param {string} url URL of the YouTube video
     * @returns {string} ID of the YouTube video, or the input unchanged when no ID could be extracted from it
     */
    parseId(url) {
        const regex = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|shorts\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/;
        const match = url.match(regex);
        // An empty capture is treated like "no match" so that a bare ID is passed through as-is.
        return match?.[1] || url;
    }

    /**
     * Scrape transcript from a YouTube video.
     * Asks for a video URL/ID (and an optional language code) in a popup, then requests the transcript from the server.
     * @returns {Promise<File[]|undefined>} File attachments scraped from the YouTube video; undefined if the popup returned no value
     * @throws {Error} If the transcript request is not successful; the message is the response body
     */
    async scrape() {
        let lang = '';
        const template = $(await renderExtensionTemplateAsync('attachments', 'youtube-scrape', {}));

        // The listener has to be bound BEFORE the popup is awaited: the popup promise only
        // yields the entered value, so a listener bound afterwards would never see any typing
        // and the language code would always be sent empty.
        template.find('input[name="youtubeLanguageCode"]').on('input', function () {
            lang = String($(this).val()).trim();
        });

        const videoUrl = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 2 });

        if (!videoUrl) {
            return;
        }

        const id = this.parseId(String(videoUrl).trim());
        const toast = toastr.info('Working, please wait...');

        try {
            const result = await fetch('/api/serpapi/transcript', {
                method: 'POST',
                headers: getRequestHeaders(),
                body: JSON.stringify({ id, lang }),
            });

            if (!result.ok) {
                const error = await result.text();
                throw new Error(error);
            }

            const transcript = await result.text();
            const file = new File([transcript], `YouTube - ${id} - ${Date.now()}.txt`, { type: 'text/plain' });
            return [file];
        } finally {
            // Clear the progress toast on failure as well, not only on success.
            toastr.clear(toast);
        }
    }
}
|
||||
|
||||
// Register the built-in Data Bank scrapers. Each scraper is instantiated right before
// it is registered, in this order.
for (const ScraperClass of [FileScraper, WebScraper, FandomScraper, YouTubeScraper]) {
    ScraperManager.registerDataBankScraper(new ScraperClass());
}
|
||||
|
@ -48,6 +48,92 @@ router.post('/search', jsonParser, async (request, response) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Get the transcript of a YouTube video.
 *
 * Reads `id` (required) and `lang` (optional caption language code) from the JSON request body.
 * Responds with the transcript as a single space-joined string, with status 400 when `id` is
 * missing, or with status 500 when any step fails (the error is logged to the console).
 * @copyright https://github.com/Kakulukian/youtube-transcript (MIT License)
 */
router.post('/transcript', jsonParser, async (request, response) => {
    try {
        const he = require('he');
        const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
        const id = request.body.id;
        // Anything that is not a string is treated as "no language requested" instead of being sent as a header value.
        const lang = typeof request.body.lang === 'string' ? request.body.lang.trim() : '';

        if (!id) {
            console.log('Id is required for /transcript');
            return response.sendStatus(400);
        }

        // Same headers for the watch page and the transcript request.
        const requestHeaders = {
            ...(lang && { 'Accept-Language': lang }),
            'User-Agent': visitHeaders['User-Agent'],
        };

        // The ID comes from the client: encode it so it cannot add extra query parameters to the URL.
        const videoPageResponse = await fetch(`https://www.youtube.com/watch?v=${encodeURIComponent(id)}`, {
            headers: requestHeaders,
        });

        const videoPageBody = await videoPageResponse.text();
        const splittedHTML = videoPageBody.split('"captions":');

        if (splittedHTML.length <= 1) {
            if (videoPageBody.includes('class="g-recaptcha"')) {
                throw new Error('Too many requests');
            }
            if (!videoPageBody.includes('"playabilityStatus":')) {
                throw new Error('Video is not available');
            }
            throw new Error('Transcript not available');
        }

        // The captions object is embedded in the page between '"captions":' and ',"videoDetails'.
        let captions;
        try {
            const captionsJson = splittedHTML[1].split(',"videoDetails')[0].replace(/\n/g, '');
            captions = JSON.parse(captionsJson)?.['playerCaptionsTracklistRenderer'];
        } catch (error) {
            // Deliberate: unparsable captions data is reported as 'Transcript disabled' below.
            captions = undefined;
        }

        if (!captions) {
            throw new Error('Transcript disabled');
        }

        const captionTracks = captions.captionTracks;

        if (!Array.isArray(captionTracks) || captionTracks.length === 0) {
            throw new Error('Transcript not available');
        }

        // Use the requested language if given, otherwise the first listed track.
        const track = lang ? captionTracks.find(candidate => candidate.languageCode === lang) : captionTracks[0];

        if (!track) {
            throw new Error('Transcript not available in this language');
        }

        const transcriptResponse = await fetch(track.baseUrl, {
            headers: requestHeaders,
        });

        if (!transcriptResponse.ok) {
            throw new Error('Transcript request failed');
        }

        const transcriptBody = await transcriptResponse.text();
        // The text is double-encoded
        const transcriptText = Array.from(
            transcriptBody.matchAll(RE_XML_TRANSCRIPT),
            (match) => he.decode(he.decode(match[3])),
        ).join(' ');

        return response.send(transcriptText);
    } catch (error) {
        console.log(error);
        return response.sendStatus(500);
    }
});
|
||||
|
||||
router.post('/visit', jsonParser, async (request, response) => {
|
||||
try {
|
||||
const url = request.body.url;
|
||||
|
Reference in New Issue
Block a user