Implement generic interface for adding Data Bank scrapers

This commit is contained in:
Cohee 2024-04-18 00:14:41 +03:00
parent 47a06c14d9
commit 59bb04f1b3
6 changed files with 388 additions and 204 deletions

View File

@ -215,6 +215,7 @@ import { evaluateMacros } from './scripts/macros.js';
import { currentUser, setUserControls } from './scripts/user.js';
import { callGenericPopup } from './scripts/popup.js';
import { renderTemplate, renderTemplateAsync } from './scripts/templates.js';
import { ScraperManager } from './scripts/scrapers.js';
//exporting functions and vars for mods
export {
@ -7756,6 +7757,7 @@ window['SillyTavern'].getContext = function () {
*/
renderExtensionTemplate: renderExtensionTemplate,
renderExtensionTemplateAsync: renderExtensionTemplateAsync,
registerDataBankScraper: ScraperManager.registerDataBankScraper,
callPopup: callPopup,
callGenericPopup: callGenericPopup,
mainApi: main_api,

View File

@ -31,10 +31,10 @@ import {
getStringHash,
humanFileSize,
saveBase64AsFile,
isValidUrl,
} from './utils.js';
import { extension_settings, renderExtensionTemplateAsync, saveMetadataDebounced, writeExtensionField } from './extensions.js';
import { POPUP_RESULT, POPUP_TYPE, callGenericPopup } from './popup.js';
import { ScraperManager } from './scrapers.js';
/**
* @typedef {Object} FileAttachment
@ -417,7 +417,7 @@ export function decodeStyleTags(text) {
return text.replaceAll(styleDecodeRegex, (_, style) => {
try {
let styleCleaned = unescape(style).replaceAll(/<br\/>/g, '');
let styleCleaned = unescape(style).replaceAll(/<br\/>/g, '');
const ast = css.parse(styleCleaned);
const rules = ast?.stylesheet?.rules;
if (rules) {
@ -677,22 +677,71 @@ async function openAttachmentManager() {
/**
* Renders buttons for the attachment manager.
* @param {string} source Source of the buttons
*/
function renderButtons(source) {
function renderButtons() {
const sources = {
[ATTACHMENT_SOURCE.GLOBAL]: '.globalAttachmentsTitle',
[ATTACHMENT_SOURCE.CHARACTER]: '.characterAttachmentsTitle',
[ATTACHMENT_SOURCE.CHAT]: '.chatAttachmentsTitle',
};
const buttonsList = template.find('.actionButtonsTemplate .actionButtons').clone();
buttonsList.find('.menu_button').data('attachment-manager-target', source);
template.find(sources[source]).append(buttonsList);
const modal = template.find('.actionButtonsModal').hide();
const scrapers = ScraperManager.getDataBankScrapers();
for (const scraper of scrapers) {
const buttonTemplate = template.find('.actionButtonTemplate .actionButton').clone();
buttonTemplate.find('.actionButtonIcon').addClass(scraper.iconClass);
buttonTemplate.find('.actionButtonText').text(scraper.name);
buttonTemplate.attr('title', scraper.description);
buttonTemplate.on('click', () => {
const target = modal.attr('data-attachment-manager-target');
runScraper(scraper.id, target, renderAttachments);
});
modal.append(buttonTemplate);
}
const modalButtonData = Object.entries(sources).map(entry => {
const [source, selector] = entry;
const button = template.find(selector).find('.openActionModalButton').get(0);
if (!button) {
return;
}
const bodyListener = (e) => {
if (modal.is(':visible') && (!$(e.target).closest('.openActionModalButton').length)) {
modal.hide();
}
// Replay a click if the modal was already open by another button
if ($(e.target).closest('.openActionModalButton').length && !modal.is(':visible')) {
modal.show();
}
};
document.body.addEventListener('click', bodyListener);
const popper = Popper.createPopper(button, modal.get(0), { placement: 'bottom-end' });
button.addEventListener('click', () => {
modal.attr('data-attachment-manager-target', source);
modal.toggle();
popper.update();
});
return [popper, bodyListener];
}).filter(Boolean);
return () => {
modalButtonData.forEach(p => {
const [popper, bodyListener] = p;
popper.destroy();
document.body.removeEventListener('click', bodyListener);
});
modal.remove();
};
}
async function renderAttachments() {
/** @type {FileAttachment[]} */
/** @type {FileAttachment[]} */
const globalAttachments = extension_settings.attachments ?? [];
/** @type {FileAttachment[]} */
const chatAttachments = chat_metadata.attachments ?? [];
@ -718,26 +767,15 @@ async function openAttachmentManager() {
let sortField = localStorage.getItem('DataBank_sortField') || 'created';
let sortOrder = localStorage.getItem('DataBank_sortOrder') || 'desc';
let filterString = '';
const hasFandomPlugin = await isFandomPluginAvailable();
const template = $(await renderExtensionTemplateAsync('attachments', 'manager', {}));
renderButtons(ATTACHMENT_SOURCE.GLOBAL);
renderButtons(ATTACHMENT_SOURCE.CHARACTER);
renderButtons(ATTACHMENT_SOURCE.CHAT);
template.find('.scrapeWebpageButton').on('click', function () {
openWebpageScraper(String($(this).data('attachment-manager-target')), renderAttachments);
});
template.find('.scrapeFandomButton').toggle(hasFandomPlugin).on('click', function () {
openFandomScraper(String($(this).data('attachment-manager-target')), renderAttachments);
});
template.find('.uploadFileButton').on('click', function () {
openFileUploader(String($(this).data('attachment-manager-target')), renderAttachments);
});
template.find('.attachmentSearch').on('input', function () {
filterString = String($(this).val());
renderAttachments();
});
template.find('.attachmentSort').on('change', function () {
if (!(this instanceof HTMLSelectElement) || this.selectedOptions.length === 0) {
if (!(this instanceof HTMLSelectElement) || this.selectedOptions.length === 0) {
return;
}
@ -747,165 +785,48 @@ async function openAttachmentManager() {
localStorage.setItem('DataBank_sortOrder', sortOrder);
renderAttachments();
});
const cleanupFn = renderButtons();
await renderAttachments();
callGenericPopup(template, POPUP_TYPE.TEXT, '', { wide: true, large: true, okButton: 'Close' });
await callGenericPopup(template, POPUP_TYPE.TEXT, '', { wide: true, large: true, okButton: 'Close' });
cleanupFn();
}
/**
* Scrapes a webpage for attachments.
* Runs a known scraper on a source and saves the result as an attachment.
* @param {string} scraperId Id of the scraper
* @param {string} target Target for the attachment
* @param {function} callback Callback function
* @returns {Promise<void>} A promise that resolves when the source is scraped.
*/
async function openWebpageScraper(target, callback) {
const template = $(await renderExtensionTemplateAsync('attachments', 'web-scrape', {}));
const link = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });
if (!link) {
return;
}
async function runScraper(scraperId, target, callback) {
try {
if (!isValidUrl(link)) {
toastr.error('Invalid URL');
console.log(`Running scraper ${scraperId} for ${target}`);
const files = await ScraperManager.runDataBankScraper(scraperId);
if (!Array.isArray(files)) {
console.warn('Scraping returned nothing');
return;
}
const result = await fetch('/api/serpapi/visit', {
method: 'POST',
headers: getRequestHeaders(),
body: JSON.stringify({ url: link }),
});
const blob = await result.blob();
const domain = new URL(link).hostname;
const timestamp = Date.now();
const title = await getTitleFromHtmlBlob(blob) || 'webpage';
const file = new File([blob], `${title} - ${domain} - ${timestamp}.html`, { type: 'text/html' });
await uploadFileAttachmentToServer(file, target);
callback();
} catch (error) {
console.error('Scraping failed', error);
toastr.error('Check browser console for details.', 'Scraping failed');
}
}
/**
 * Extracts the contents of the first <title> element from an HTML blob.
 * @param {Blob} blob Blob of the HTML file
 * @returns {Promise<string>} Title of the HTML file with whitespace collapsed, or an empty string if none is found
 */
async function getTitleFromHtmlBlob(blob) {
    const text = await blob.text();
    // [^>]* tolerates attributes on the tag; [\s\S] lets the title span several lines.
    const titleMatch = text.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
    if (!titleMatch) {
        return '';
    }
    // The title ends up in a file name, so line breaks and padding are collapsed.
    return titleMatch[1].replace(/\s+/g, ' ').trim();
}
/**
* Scrapes a Fandom page for attachments.
* @param {string} target Target for the attachment
* @param {function} callback Callback function
*/
async function openFandomScraper(target, callback) {
if (!await isFandomPluginAvailable()) {
toastr.error('Fandom scraper plugin is not available');
return;
}
let fandom = '';
let filter = '';
let output = 'single';
const template = $(await renderExtensionTemplateAsync('attachments', 'fandom-scrape', {}));
template.find('input[name="fandomScrapeInput"]').on('input', function () {
fandom = String($(this).val());
});
template.find('input[name="fandomScrapeFilter"]').on('input', function () {
filter = String($(this).val());
});
template.find('input[name="fandomScrapeOutput"]').on('input', function () {
output = String($(this).val());
});
const confirm = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });
if (confirm !== POPUP_RESULT.AFFIRMATIVE) {
return;
}
if (!fandom) {
toastr.error('Fandom name is required');
return;
}
try {
const result = await fetch('/api/plugins/fandom/scrape', {
method: 'POST',
headers: getRequestHeaders(),
body: JSON.stringify({ fandom, filter }),
});
if (!result.ok) {
const error = await result.text();
throw new Error(error);
if (files.length === 0) {
console.warn('Scraping returned no files');
toastr.info('No files were scraped.', 'Data Bank');
return;
}
// Get domain name part if it's a URL
try {
const url = new URL(fandom);
const fandomId = url.hostname.split('.')[0] || fandom;
fandom = fandomId;
} catch {
// Ignore
}
const data = await result.json();
let numberOfAttachments;
if (output === 'multi') {
numberOfAttachments = data.length;
for (const attachment of data) {
const file = new File([String(attachment.content).trim()], `${String(attachment.title).trim()}.txt`, { type: 'text/plain' });
await uploadFileAttachmentToServer(file, target);
}
}
if (output === 'single') {
numberOfAttachments = 1;
const combinedContent = data.map((a) => String(a.title).trim() + '\n\n' + String(a.content).trim()).join('\n\n\n\n');
const file = new File([combinedContent], `${fandom}.txt`, { type: 'text/plain' });
for (const file of files) {
await uploadFileAttachmentToServer(file, target);
}
if (numberOfAttachments) {
toastr.success(`Scraped ${numberOfAttachments} attachments from ${fandom}`);
}
toastr.success(`Scraped ${files.length} files from ${scraperId} to ${target}.`, 'Data Bank');
callback();
} catch (error) {
console.error('Fandom scraping failed', error);
toastr.error('Check browser console for details.', 'Fandom scraping failed');
}
}
/**
* Uploads a file attachment.
* @param {string} target File upload target
* @param {function} callback Callback function
*/
async function openFileUploader(target, callback) {
const fileInput = document.createElement('input');
fileInput.type = 'file';
fileInput.accept = '.txt, .md, .pdf, .html, .htm';
fileInput.onchange = async function () {
const file = fileInput.files[0];
if (!file) return;
await uploadFileAttachmentToServer(file, target);
callback();
};
fileInput.click();
catch (error) {
console.error('Scraping failed', error);
toastr.error('Check browser console for details.', 'Scraping failed');
}
}
/**
@ -1008,24 +929,6 @@ export function getDataBankAttachments() {
return [...globalAttachments, ...chatAttachments, ...characterAttachments];
}
/**
 * Sends a POST probe to the Fandom plugin endpoint to find out whether it responds.
 * Any failure while probing is logged at debug level and reported as "not available".
 * @returns {Promise<boolean>} The `ok` flag of the probe response, or false if the probe threw.
 */
async function isFandomPluginAvailable() {
    const probeUrl = '/api/plugins/fandom/probe';

    try {
        const probeOptions = { method: 'POST', headers: getRequestHeaders() };
        const probeResponse = await fetch(probeUrl, probeOptions);
        return probeResponse.ok;
    } catch (probeError) {
        console.debug('Could not probe Fandom plugin', probeError);
        return false;
    }
}
jQuery(function () {
$(document).on('click', '.mes_hide', async function () {
const messageBlock = $(this).closest('.mes');

View File

@ -39,6 +39,10 @@
<span data-i18n="Global Attachments">
Global Attachments
</span>
<div class="openActionModalButton menu_button menu_button_icon">
<i class="fa-solid fa-plus"></i>
<span data-i18n="Add">Add</span>
</div>
</h3>
<small data-i18n="These files are available for all characters in all chats.">
These files are available for all characters in all chats.
@ -51,6 +55,10 @@
<span data-i18n="Character Attachments">
Character Attachments
</span>
<div class="openActionModalButton menu_button menu_button_icon">
<i class="fa-solid fa-plus"></i>
<span data-i18n="Add">Add</span>
</div>
</h3>
<div class="flex-container flexFlowColumn">
<strong><small class="characterAttachmentsName"></small></strong>
@ -66,6 +74,10 @@
<span data-i18n="Chat Attachments">
Chat Attachments
</span>
<div class="openActionModalButton menu_button menu_button_icon">
<i class="fa-solid fa-plus"></i>
<span data-i18n="Add">Add</span>
</div>
</h3>
<div class="flex-container flexFlowColumn">
<strong><small class="chatAttachmentsName"></small></strong>
@ -87,26 +99,12 @@
</div>
</div>
<div class="actionButtonsTemplate template_element">
<div class="actionButtons flex-container flexGap10">
<div class="scrapeWebpageButton menu_button_icon menu_button" data-attachment-manager-target="" title="Download a page from the web.">
<i class="fa-fw fa-solid fa-globe"></i>
<span data-i18n="From Web">
From Web
</span>
</div>
<div class="scrapeFandomButton menu_button_icon menu_button" data-attachment-manager-target="" title="Download a page from the Fandom wiki.">
<i class="fa-fw fa-solid fa-fire"></i>
<span data-i18n="From Fandom">
From Fandom
</span>
</div>
<div class="uploadFileButton menu_button_icon menu_button" data-attachment-manager-target="" title="Upload a file from your computer.">
<i class="fa-fw fa-solid fa-upload"></i>
<span data-i18n="From File">
From File
</span>
</div>
<div class="actionButtonTemplate">
<div class="actionButton list-group-item flex-container flexGap5" title="">
<i class="actionButtonIcon"></i>
<span class="actionButtonText"></span>
</div>
</div>
<div class="actionButtonsModal popper-modal options-content list-group"></div>
</div>

View File

@ -1,3 +1,3 @@
<div data-i18n="Enter a web address to scrape:">
Enter a web address to scrape:
<div data-i18n="Enter web URLs to scrape (one per line):">
Enter web URLs to scrape (one per line):
</div>

280
public/scripts/scrapers.js Normal file
View File

@ -0,0 +1,280 @@
import { getRequestHeaders } from '../script.js';
import { renderExtensionTemplateAsync } from './extensions.js';
import { POPUP_RESULT, POPUP_TYPE, callGenericPopup } from './popup.js';
import { isValidUrl } from './utils.js';
/**
* @typedef {Object} Scraper
* @property {string} id
* @property {string} name
* @property {string} description
* @property {string} iconClass
* @property {() => Promise<boolean>} isAvailable
* @property {() => Promise<File[]>} scrape
*/
/**
* @typedef {Object} ScraperInfo
* @property {string} id
* @property {string} name
* @property {string} description
* @property {string} iconClass
*/
/**
 * Static registry of Data Bank scrapers.
 *
 * Methods reference `ScraperManager` explicitly instead of `this`, because
 * `registerDataBankScraper` is also handed out as a detached function reference.
 */
export class ScraperManager {
    /**
     * Registered scrapers, in registration order.
     * @type {Scraper[]}
     */
    static #scrapers = [];

    /**
     * Register a scraper to be used by the Data Bank.
     * Malformed scrapers and duplicate IDs are rejected with a console warning.
     * @param {Scraper} scraper Instance of a scraper to register
     * @returns {void}
     */
    static registerDataBankScraper(scraper) {
        // Reject malformed input up front: without an ID the scraper cannot be
        // looked up, and without scrape() it would only fail later when run.
        const hasId = scraper != null && scraper.id != null && scraper.id !== '';
        if (!hasId || typeof scraper.scrape !== 'function') {
            console.warn('Cannot register an invalid scraper: an ID and a scrape() function are required', scraper);
            return;
        }

        if (ScraperManager.#scrapers.some(s => s.id === scraper.id)) {
            console.warn(`Scraper with ID ${scraper.id} already registered`);
            return;
        }

        ScraperManager.#scrapers.push(scraper);
    }

    /**
     * Gets a list of scrapers registered for the Data Bank.
     * Only display metadata is returned; `isAvailable()` is not consulted here.
     * @returns {ScraperInfo[]} Metadata of the registered scrapers, in registration order
     */
    static getDataBankScrapers() {
        return ScraperManager.#scrapers.map(({ id, name, description, iconClass }) => ({ id, name, description, iconClass }));
    }

    /**
     * Run a scraper to scrape data into the Data Bank.
     * Always returns a promise: an unknown ID resolves to undefined, and an error
     * thrown by the scraper (synchronously or not) becomes a rejection.
     * @param {string} scraperId ID of the scraper to run
     * @returns {Promise<File[]|undefined>} Whatever the scraper's scrape() produced, or undefined if the ID is unknown
     */
    static async runDataBankScraper(scraperId) {
        const scraper = ScraperManager.#scrapers.find(s => s.id === scraperId);
        if (!scraper) {
            console.warn(`Scraper with ID ${scraperId} not found`);
            return undefined;
        }
        return scraper.scrape();
    }
}
/**
 * Scrape data from a webpage.
 * Pages are downloaded through the `/api/serpapi/visit` endpoint and returned as HTML files.
 * @implements {Scraper}
 */
class WebScraper {
    constructor() {
        this.id = 'web';
        this.name = 'Web';
        this.description = 'Download a page from the web.';
        this.iconClass = 'fa-solid fa-globe';
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>} Always resolves to true
     */
    isAvailable() {
        return Promise.resolve(true);
    }

    /**
     * Parse the title of an HTML file from a Blob.
     * @param {Blob} blob Blob of the HTML file
     * @returns {Promise<string>} Title of the HTML file with whitespace collapsed, or an empty string if none is found
     */
    async getTitleFromHtmlBlob(blob) {
        const text = await blob.text();
        // [^>]* tolerates attributes on the tag; [\s\S] lets the title span several lines.
        const titleMatch = text.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
        if (!titleMatch) {
            return '';
        }
        // The title ends up in a file name, so line breaks and padding are collapsed.
        return titleMatch[1].replace(/\s+/g, ' ').trim();
    }

    /**
     * Scrape file attachments from one or more webpages.
     * Asks the user for URLs (one per line), drops lines that are not valid URLs,
     * and downloads the remaining pages one after another.
     * @returns {Promise<File[]>} One HTML file per downloaded page; undefined if the popup was cancelled or no valid URL was entered
     * @throws {Error} If a page request completes with a non-OK status
     */
    async scrape() {
        const template = $(await renderExtensionTemplateAsync('attachments', 'web-scrape', {}));
        const linksString = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 4 });

        if (!linksString) {
            return;
        }

        const links = String(linksString).split('\n').map(l => l.trim()).filter(l => l && isValidUrl(l));

        if (links.length === 0) {
            toastr.error('Invalid URL');
            return;
        }

        const toast = toastr.info('Working, please wait...');

        try {
            const files = [];

            for (const link of links) {
                const result = await fetch('/api/serpapi/visit', {
                    method: 'POST',
                    headers: getRequestHeaders(),
                    body: JSON.stringify({ url: link }),
                });

                // Without this check the body of an error response would be stored as if it were the page.
                if (!result.ok) {
                    throw new Error(`Failed to download ${link} (${result.status} ${result.statusText})`);
                }

                const blob = await result.blob();
                const domain = new URL(link).hostname;
                const timestamp = Date.now();
                const title = await this.getTitleFromHtmlBlob(blob) || 'webpage';
                const file = new File([blob], `${title} - ${domain} - ${timestamp}.html`, { type: 'text/html' });
                files.push(file);
            }

            return files;
        } finally {
            // Dismiss the progress toast on failure too, not only on success.
            toastr.clear(toast);
        }
    }
}
/**
 * Scrape data from a file selection.
 * @implements {Scraper}
 */
class FileScraper {
    constructor() {
        this.id = 'file';
        this.name = 'File';
        this.description = 'Upload a file from your computer.';
        this.iconClass = 'fa-solid fa-upload';
    }

    /**
     * Check if the scraper is available.
     * @returns {Promise<boolean>} Always resolves to true
     */
    isAvailable() {
        return Promise.resolve(true);
    }

    /**
     * Scrape file attachments from files picked by the user.
     * Opens a multi-select file picker on a detached input element.
     * @returns {Promise<File[]>} The selected files; an empty list if the picker was dismissed
     */
    async scrape() {
        return new Promise(resolve => {
            const fileInput = document.createElement('input');
            fileInput.type = 'file';
            fileInput.accept = '.txt, .md, .pdf, .html, .htm';
            fileInput.multiple = true;
            fileInput.onchange = () => resolve(Array.from(fileInput.files ?? []));
            // Without a cancel handler the promise would stay pending forever
            // whenever the user closes the picker without choosing a file.
            fileInput.oncancel = () => resolve([]);
            fileInput.click();
        });
    }
}
/**
 * Scrape data from a Fandom wiki.
 * Uses the endpoints under `/api/plugins/fandom` for probing and scraping.
 * @implements {Scraper}
 */
class FandomScraper {
    constructor() {
        this.id = 'fandom';
        this.name = 'Fandom';
        this.description = 'Download a page from the Fandom wiki.';
        this.iconClass = 'fa-solid fa-fire';
    }

    /**
     * Check if the scraper is available by probing the Fandom plugin endpoint.
     * @returns {Promise<boolean>} The `ok` flag of the probe response, or false if the probe threw
     */
    async isAvailable() {
        try {
            const result = await fetch('/api/plugins/fandom/probe', {
                method: 'POST',
                headers: getRequestHeaders(),
            });

            return result.ok;
        } catch (error) {
            console.debug('Could not probe Fandom plugin', error);
            return false;
        }
    }

    /**
     * Get the ID of a fandom from a URL or name.
     * For a parseable URL this is the first label of the host name; anything else is returned unchanged.
     * @param {string} fandom URL or name of the fandom
     * @returns {string} ID of the fandom
     */
    getFandomId(fandom) {
        try {
            const url = new URL(fandom);
            return url.hostname.split('.')[0] || fandom;
        } catch {
            return fandom;
        }
    }

    /**
     * Scrape file attachments from a Fandom wiki.
     * Asks for the fandom, an optional filter and the output mode, then requests the pages from the plugin.
     * @returns {Promise<File[]>} One text file per page ("multi") or a single combined file ("single");
     * undefined if the plugin is unavailable, the popup was not confirmed, or no fandom was entered
     * @throws {Error} If the scrape request completes with a non-OK status (message is the response text)
     */
    async scrape() {
        // Fail early with a specific message instead of a generic error after the form is filled in.
        if (!await this.isAvailable()) {
            toastr.error('Fandom scraper plugin is not available');
            return;
        }

        let fandom = '';
        let filter = '';
        let output = 'single';

        const template = $(await renderExtensionTemplateAsync('attachments', 'fandom-scrape', {}));
        template.find('input[name="fandomScrapeInput"]').on('input', function () {
            fandom = String($(this).val()).trim();
        });
        template.find('input[name="fandomScrapeFilter"]').on('input', function () {
            filter = String($(this).val());
        });
        template.find('input[name="fandomScrapeOutput"]').on('input', function () {
            output = String($(this).val());
        });

        const confirm = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });

        if (confirm !== POPUP_RESULT.AFFIRMATIVE) {
            return;
        }

        if (!fandom) {
            toastr.error('Fandom name is required');
            return;
        }

        const toast = toastr.info('Working, please wait...');
        let data;

        try {
            const result = await fetch('/api/plugins/fandom/scrape', {
                method: 'POST',
                headers: getRequestHeaders(),
                body: JSON.stringify({ fandom, filter }),
            });

            if (!result.ok) {
                const error = await result.text();
                throw new Error(error);
            }

            data = await result.json();
        } finally {
            // Dismiss the progress toast on failure too, not only on success.
            toastr.clear(toast);
        }

        if (output === 'multi') {
            return data.map((attachment) => new File([String(attachment.content).trim()], `${String(attachment.title).trim()}.txt`, { type: 'text/plain' }));
        }

        if (output === 'single') {
            const combinedContent = data.map((a) => `${String(a.title).trim()}\n\n${String(a.content).trim()}`).join('\n\n\n\n');
            // The user may have pasted a full wiki URL; name the file after the fandom ID instead.
            const fandomId = this.getFandomId(fandom);
            return [new File([combinedContent], `${fandomId}.txt`, { type: 'text/plain' })];
        }

        return [];
    }
}
// Built-in scrapers. Registration order is the order getDataBankScrapers() returns them in.
for (const builtInScraper of [new FileScraper(), new WebScraper(), new FandomScraper()]) {
    ScraperManager.registerDataBankScraper(builtInScraper);
}

View File

@ -709,7 +709,8 @@ body .panelControlBar {
}
#options,
#extensionsMenu {
#extensionsMenu,
.shadow_popup .popper-modal {
display: flex;
z-index: 29999;
background-color: var(--SmartThemeBlurTintColor);