Add EPUB import for data bank

This commit is contained in:
Cohee 2024-04-20 01:24:46 +03:00
parent 3ff5884112
commit 78d1d48ea9
6 changed files with 68 additions and 4 deletions

1
public/lib/epub.min.js vendored Normal file

File diff suppressed because one or more lines are too long

13
public/lib/jszip.min.js vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -27,6 +27,7 @@ import {
extractTextFromHTML,
extractTextFromMarkdown,
extractTextFromPDF,
extractTextFromEpub,
getBase64Async,
getStringHash,
humanFileSize,
@ -56,6 +57,7 @@ const converters = {
'application/pdf': extractTextFromPDF,
'text/html': extractTextFromHTML,
'text/markdown': extractTextFromMarkdown,
'application/epub+zip': extractTextFromEpub,
};
/**

View File

@ -7,8 +7,8 @@
<div data-i18n="These files will be available for extensions that support attachments (e.g. Vector Storage).">
These files will be available for extensions that support attachments (e.g. Vector Storage).
</div>
<div data-i18n="Supported file types: Plain Text, PDF, Markdown, HTML." class="marginTopBot5">
Supported file types: Plain Text, PDF, Markdown, HTML.
<div data-i18n="Supported file types: Plain Text, PDF, Markdown, HTML, EPUB." class="marginTopBot5">
Supported file types: Plain Text, PDF, Markdown, HTML, EPUB.
</div>
<div class="flex-container marginTopBot5">
<input type="search" id="attachmentSearch" class="attachmentSearch text_pole margin0 flex1" placeholder="Search...">

View File

@ -179,7 +179,7 @@ class FileScraper {
return new Promise(resolve => {
const fileInput = document.createElement('input');
fileInput.type = 'file';
fileInput.accept = '.txt, .md, .pdf, .html, .htm';
fileInput.accept = '.txt, .md, .pdf, .html, .htm, .epub';
fileInput.multiple = true;
fileInput.onchange = () => resolve(Array.from(fileInput.files));
fileInput.click();

View File

@ -695,7 +695,7 @@ export function splitRecursive(input, length, delimiters = ['\n\n', '\n', ' ', '
const flatParts = parts.flatMap(p => {
if (p.length < length) return p;
return splitRecursive(input, length, delimiters.slice(1));
return splitRecursive(p, length, delimiters.slice(1));
});
// Merge short chunks
@ -1300,6 +1300,54 @@ export async function extractTextFromMarkdown(blob) {
return text;
}
/**
 * In-flight or completed load of the EPUB libraries, shared between callers so
 * that concurrent conversions never inject the script tags more than once.
 * Reset to null when loading fails so a later call can retry.
 * @type {Promise<void>|null}
 */
let epubLibrariesPromise = null;

/**
 * Injects a script tag into the document head and waits for it to load.
 * @param {string} src Script URL to load
 * @returns {Promise<void>} Resolves on load; rejects with an Error naming the script on failure
 */
function loadEpubScript(src) {
    return new Promise((resolve, reject) => {
        const script = document.createElement('script');
        script.async = true;
        script.src = src;
        script.onload = () => resolve();
        script.onerror = () => {
            // Do not leave a dead tag behind; a retry will inject a fresh one.
            script.remove();
            reject(new Error(`Failed to load script: ${src}`));
        };
        document.head.appendChild(script);
    });
}

/**
 * Loads JSZip and epub.js on demand, at most once at a time.
 * @returns {Promise<void>} Resolves when both libraries have been loaded
 */
function initEpubJs() {
    if (!epubLibrariesPromise) {
        epubLibrariesPromise = (async () => {
            // JSZip has to be present before epub.js is evaluated (per the epub.js
            // setup instructions for archived .epub files), so the two scripts
            // are loaded one after the other instead of in parallel.
            await loadEpubScript('lib/jszip.min.js');
            await loadEpubScript('lib/epub.min.js');
        })().catch((error) => {
            epubLibrariesPromise = null;
            throw error;
        });
    }

    return epubLibrariesPromise;
}

/**
 * Extracts the text of every spine section of an EPUB book.
 * The epub.js and JSZip libraries are lazy-loaded on first use.
 * @param {Blob} blob EPUB file contents; handed to ePub() as-is
 * @returns {Promise<string>} Non-empty section texts joined with newlines and passed
 * through postProcessText (presumably a string - confirm against postProcessText)
 */
export async function extractTextFromEpub(blob) {
    if (!('ePub' in window)) {
        await initEpubJs();
    }

    const book = ePub(blob);
    await book.ready;

    // Reads one spine section; yields an empty string when the section did not
    // load as a Document or has no body text.
    const readSection = async (section) => {
        const chapter = await book.load(section.href);

        if (!(chapter instanceof Document) || !chapter.body?.textContent) {
            return '';
        }

        return chapter.body.textContent.trim();
    };

    // Start every section load first and await them together.
    // Promise.all keeps the results in spine order.
    const sectionPromises = [];
    book.spine.each((section) => {
        sectionPromises.push(readSection(section));
    });

    const sections = await Promise.all(sectionPromises);
    const nonEmptySections = sections.filter((sectionText) => sectionText);
    return postProcessText(nonEmptySections.join('\n'), false);
}
/**
* Sets a value in an object by a path.
* @param {object} obj Object to set value in