[FEATURE_REQUEST] Sending PDF/HTML files? #1414

This commit is contained in:
Cohee 2023-11-29 17:51:30 +02:00
parent 1ce009b84e
commit e0bf2b8e3e
10 changed files with 74770 additions and 19 deletions

View File

@ -22,7 +22,8 @@
"droll",
"handlebars",
"highlight.js",
"localforage"
"localforage",
"pdfjs-dist"
]
}
}

17398
public/lib/pdf.mjs Normal file

File diff suppressed because it is too large Load Diff

1
public/lib/pdf.mjs.map Normal file

File diff suppressed because one or more lines are too long

57124
public/lib/pdf.worker.mjs Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -195,7 +195,7 @@ import { getBackgrounds, initBackgrounds } from "./scripts/backgrounds.js";
import { hideLoader, showLoader } from "./scripts/loader.js";
import { CharacterContextMenu, BulkEditOverlay } from "./scripts/BulkEditOverlay.js";
import { loadMancerModels } from "./scripts/mancer-settings.js";
import { hasPendingFileAttachment, populateFileAttachment } from "./scripts/chats.js";
import { getFileAttachment, hasPendingFileAttachment, populateFileAttachment } from "./scripts/chats.js";
import { replaceVariableMacros } from "./scripts/variables.js";
//exporting functions and vars for mods
@ -3019,22 +3019,27 @@ async function Generate(type, { automatic_trigger, force_name2, resolve, reject,
coreChat.pop();
}
coreChat = coreChat.map(chatItem => {
coreChat = await Promise.all(coreChat.map(async (chatItem) => {
let message = chatItem.mes;
let regexType = chatItem.is_user ? regex_placement.USER_INPUT : regex_placement.AI_OUTPUT;
let options = { isPrompt: true };
let regexedMessage = getRegexedString(message, regexType, options);
if (chatItem.extra?.file?.text) {
regexedMessage += `\n\n${chatItem.extra.file.text}`;
if (chatItem.extra?.file) {
const fileText = chatItem.extra.file.text || (await getFileAttachment(chatItem.extra.file.url));
if (fileText) {
chatItem.extra.fileStart = regexedMessage.length;
regexedMessage += `\n\n${fileText}`;
}
}
return {
...chatItem,
mes: regexedMessage,
};
});
}));
// Determine token limit
let this_max_context = getMaxContextSize();

View File

@ -8,14 +8,33 @@ import {
eventSource,
event_types,
getCurrentChatId,
getRequestHeaders,
hideSwipeButtons,
name2,
saveChatDebounced,
showSwipeButtons,
} from "../script.js";
import { getBase64Async, humanFileSize, saveBase64AsFile } from "./utils.js";
import {
extractTextFromHTML,
extractTextFromMarkdown,
extractTextFromPDF,
getBase64Async,
getStringHash,
humanFileSize,
saveBase64AsFile,
} from "./utils.js";
const fileSizeLimit = 1024 * 1024 * 1; // 1 MB
const fileSizeLimit = 1024 * 1024 * 10; // 10 MB
/**
 * Lookup table from a file's MIME type to the function that extracts plain text from it.
 * The extractor functions are the ones imported from ./utils.js above.
 * A MIME type that is not a key here has no converter (see isConvertible).
 */
const converters = {
'application/pdf': extractTextFromPDF,
'text/html': extractTextFromHTML,
'text/markdown': extractTextFromMarkdown,
}
/**
 * Tells whether a text converter is registered for the given MIME type.
 * @param {string} type MIME type to look up
 * @returns {boolean} True when the type is one of the keys of the converters table
 */
function isConvertible(type) {
    const supportedTypes = Object.keys(converters);
    return supportedTypes.some((mimeType) => mimeType === type);
}
/**
* Mark message as hidden (system message).
@ -70,7 +89,7 @@ export async function unhideChatMessage(messageId, messageBlock) {
/**
* Adds a file attachment to the message.
* @param {object} message Message object
* @returns {Promise<void>}
* @returns {Promise<void>} A promise that resolves when file is uploaded.
*/
export async function populateFileAttachment(message, inputId = 'file_form_input') {
try {
@ -81,18 +100,38 @@ export async function populateFileAttachment(message, inputId = 'file_form_input
const file = fileInput.files[0];
if (!file) return;
const fileBase64 = await getBase64Async(file);
let base64Data = fileBase64.split(',')[1];
// If file is image
if (file.type.startsWith('image/')) {
const base64Img = await getBase64Async(file);
const base64ImgData = base64Img.split(',')[1];
const extension = file.type.split('/')[1];
const imageUrl = await saveBase64AsFile(base64ImgData, name2, file.name, extension);
const imageUrl = await saveBase64AsFile(base64Data, name2, file.name, extension);
message.extra.image = imageUrl;
message.extra.inline_image = true;
} else {
const fileText = await file.text();
const slug = getStringHash(file.name);
const uniqueFileName = `${Date.now()}_${slug}.txt`;
if (isConvertible(file.type)) {
try {
const converter = converters[file.type];
const fileText = await converter(file);
base64Data = window.btoa(unescape(encodeURIComponent(fileText)));
} catch (error) {
toastr.error(error, 'Could not convert file');
console.error('Could not convert file', error);
}
}
const fileUrl = await uploadFileAttachment(uniqueFileName, base64Data);
if (!fileUrl) {
return;
}
message.extra.file = {
text: fileText,
url: fileUrl,
size: file.size,
name: file.name,
};
@ -105,6 +144,62 @@ export async function populateFileAttachment(message, inputId = 'file_form_input
}
}
/**
 * Posts a base64-encoded file to the '/api/file/upload' endpoint.
 * Any failure (non-OK response, network error, malformed reply) is reported
 * through a toastr error and console.error; nothing is returned in that case.
 * @param {string} fileName Name the file should be stored under
 * @param {string} base64Data Base64-encoded file contents
 * @returns {Promise<string>} Path from the response, with backslashes turned into forward slashes
 */
export async function uploadFileAttachment(fileName, base64Data) {
    try {
        const requestOptions = {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({ name: fileName, data: base64Data }),
        };
        const response = await fetch('/api/file/upload', requestOptions);

        if (!response.ok) {
            // The response body carries the server's error description
            throw new Error(await response.text());
        }

        const payload = await response.json();
        return payload.path.replace(/\\/g, '/');
    } catch (error) {
        toastr.error(error, 'Could not upload file');
        console.error('Could not upload file', error);
    }
}
/**
 * Fetches a previously uploaded file and returns its body as text.
 * The request asks the browser to prefer its HTTP cache ('force-cache').
 * Any failure is reported through a toastr error and console.error;
 * nothing is returned in that case.
 * @param {string} url File URL
 * @returns {Promise<string>} File text
 */
export async function getFileAttachment(url) {
    try {
        const response = await fetch(url, {
            method: 'GET',
            cache: 'force-cache',
            headers: getRequestHeaders(),
        });

        if (response.ok) {
            // Awaited here so a failing body read is still handled by the catch below
            return await response.text();
        }

        // The response body carries the server's error description
        throw new Error(await response.text());
    } catch (error) {
        toastr.error(error, 'Could not download file');
        console.error('Could not download file', error);
    }
}
/**
* Validates file to make sure it is not binary or not image.
* @param {File} file File object
@ -121,7 +216,7 @@ async function validateFile(file) {
}
// If file is binary
if (isBinary && !isImage) {
if (isBinary && !isImage && !isConvertible(file.type)) {
toastr.error('Binary files are not supported. Select a text file or image.');
return false;
}
@ -193,22 +288,23 @@ async function deleteMessageFile(messageId) {
* @param {number} messageId Message ID
*/
async function viewMessageFile(messageId) {
const messageText = chat[messageId]?.extra?.file?.text;
const messageFile = chat[messageId]?.extra?.file;
if (!messageText) {
if (!messageFile) {
console.debug('Message has no file or it is empty');
return;
}
const fileText = messageFile.text || (await getFileAttachment(messageFile.url));
const modalTemplate = $('<div><pre><code></code></pre></div>');
modalTemplate.find('code').addClass('txt').text(messageText);
modalTemplate.find('code').addClass('txt').text(fileText);
modalTemplate.addClass('file_modal');
addCopyToCodeBlocks(modalTemplate);
callPopup(modalTemplate, 'text');
}
/**
* Inserts a file embed into the message.
* @param {number} messageId

View File

@ -1,6 +1,7 @@
import { getContext } from "./extensions.js";
import { getRequestHeaders } from "../script.js";
import { isMobile } from "./RossAscends-mods.js";
import { collapseNewlines } from "./power-user.js";
/**
* Pagination status string template.
@ -1066,3 +1067,99 @@ export function uuidv4() {
return v.toString(16);
});
}
/**
 * Cleans up text extracted from a document so it can be appended to a prompt.
 * Line endings are normalized, blank lines are dropped, every line is trimmed,
 * non-breaking spaces become regular spaces and runs of spaces are collapsed.
 * @param {string} text Raw extracted text
 * @returns {string} Cleaned-up text with LF-only line endings
 */
function postProcessText(text) {
    // Normalize CRLF and lone CR to LF first. Simply deleting '\r' later would glue
    // together lines that were separated by a bare carriage return ("a\rb" -> "ab").
    const normalized = text.replace(/\r\n?/g, '\n');
    // Collapse multiple newlines into one (helper imported from power-user.js)
    const collapsed = collapseNewlines(normalized);
    // Trim leading and trailing whitespace on every line, and remove empty lines
    const lines = collapsed
        .split('\n')
        .map((line) => line.trim())
        .filter(Boolean);

    return lines
        .join('\n')
        // Normalize non-breaking spaces to regular spaces
        .replace(/\u00A0/g, ' ')
        // Collapse multiple spaces into one (newlines are left alone)
        .replace(/ {2,}/g, ' ')
        // Remove leading and trailing whitespace of the whole text
        .trim();
}
/**
 * Extracts the text of every page of a PDF using pdf.js.
 * When no `pdfjsLib` global is present yet, the pdf.js worker and library
 * scripts are injected into the page and awaited before parsing.
 * @param {Blob} blob PDF file blob
 * @returns {Promise<string>} A promise that resolves to the parsed text.
 */
export async function extractTextFromPDF(blob) {
    /**
     * Appends a module script tag to the document head.
     * @param {string} src Script URL
     * @returns {Promise<Event>} Resolves on the load event, rejects on the error event
     */
    function loadModuleScript(src) {
        return new Promise((resolve, reject) => {
            const scriptElement = document.createElement('script');
            scriptElement.type = 'module';
            scriptElement.async = true;
            scriptElement.src = src;
            scriptElement.onload = resolve;
            scriptElement.onerror = reject;
            document.head.appendChild(scriptElement);
        });
    }

    const isPdfJsLoaded = 'pdfjsLib' in window;
    if (!isPdfJsLoaded) {
        // Both tags are appended right away (worker first, then the library) and awaited together
        await Promise.all([
            loadModuleScript('lib/pdf.worker.mjs'),
            loadModuleScript('lib/pdf.mjs'),
        ]);
    }

    const fileBuffer = await getFileBuffer(blob);
    const pdfDocument = await pdfjsLib.getDocument(fileBuffer).promise;

    // pdf.js page numbers start at 1; pages are read one after another to keep their order
    const pageTexts = [];
    for (let pageNumber = 1; pageNumber <= pdfDocument.numPages; pageNumber += 1) {
        const pdfPage = await pdfDocument.getPage(pageNumber);
        const content = await pdfPage.getTextContent();
        const fragments = content.items.map((fragment) => fragment.str);
        pageTexts.push(fragments.join(' '));
    }

    const joinedPages = pageTexts.join('\n');
    return postProcessText(joinedPages);
}
/**
 * Extracts the plain text of an HTML document.
 * The markup is passed through DOMPurify, parsed with DOMParser, and the
 * text content of the resulting body is cleaned up with postProcessText.
 * @param {Blob} blob HTML content blob
 * @returns {Promise<string>} A promise that resolves to the parsed text.
 */
export async function extractTextFromHTML(blob) {
    const rawHtml = await blob.text();
    const parser = new DOMParser();
    // Named so that it does not shadow the global `document`
    const parsedDocument = parser.parseFromString(DOMPurify.sanitize(rawHtml), 'text/html');
    return postProcessText(parsedDocument.body.textContent);
}
/**
 * Extracts the plain text of a Markdown document.
 * The Markdown is rendered to HTML with showdown, passed through DOMPurify,
 * parsed with DOMParser, and the text content of the resulting body is
 * cleaned up with postProcessText.
 * @param {Blob} blob Markdown content blob
 * @returns {Promise<string>} A promise that resolves to the parsed text.
 */
export async function extractTextFromMarkdown(blob) {
    const source = await blob.text();
    const markdownConverter = new showdown.Converter();
    const renderedHtml = markdownConverter.makeHtml(source);
    const parser = new DOMParser();
    // Named so that it does not shadow the global `document`
    const parsedDocument = parser.parseFromString(DOMPurify.sanitize(renderedHtml), 'text/html');
    return postProcessText(parsedDocument.body.textContent);
}

View File

@ -3,6 +3,7 @@ const fs = require('fs');
const sanitize = require('sanitize-filename');
const fetch = require('node-fetch').default;
const { finished } = require('stream/promises');
const writeFileSyncAtomic = require('write-file-atomic').sync;
const { DIRECTORIES, UNSAFE_EXTENSIONS } = require('./constants');
const VALID_CATEGORIES = ["bgm", "ambient", "blip", "live2d"];
@ -297,6 +298,32 @@ function registerEndpoints(app, jsonParser) {
return response.sendStatus(500);
}
});
/**
 * POST /api/file/upload
 * Expects a JSON body with `name` (target file name) and `data` (base64 payload).
 * Writes the decoded payload under DIRECTORIES.files and replies with `{ path }`,
 * where the path has the leading 'public' folder stripped.
 * Replies 400 when name/data is missing or the name fails checkAssetFileName,
 * and 500 on any unexpected error.
 */
app.post('/api/file/upload', jsonParser, async (request, response) => {
    try {
        const { name, data } = request.body;

        if (!name) {
            return response.status(400).send("No upload name specified");
        }

        if (!data) {
            return response.status(400).send("No upload data specified");
        }

        // A falsy result means the requested name was rejected
        const safeInput = checkAssetFileName(name);
        if (!safeInput) {
            return response.status(400).send("Invalid upload name");
        }

        const pathToUpload = path.join(DIRECTORIES.files, safeInput);
        writeFileSyncAtomic(pathToUpload, data, 'base64');

        // Drop the leading 'public' folder from the path returned to the client
        const publicPrefix = 'public' + path.sep;
        const url = path.normalize(pathToUpload.replace(publicPrefix, ''));
        return response.send({ path: url });
    } catch (error) {
        console.log(error);
        return response.sendStatus(500);
    }
});
}
module.exports = {

View File

@ -24,6 +24,7 @@ const DIRECTORIES = {
quickreplies: 'public/QuickReplies',
assets: 'public/assets',
comfyWorkflows: 'public/user/workflows',
files: 'public/user/files',
};
const UNSAFE_EXTENSIONS = [