[FEATURE_REQUEST] Sending PDF/HTML files? #1414

2025-06-05 21:59:27 +02:00 · 2023-11-29 17:51:30 +02:00
parent 1ce009b84e
commit e0bf2b8e3e
10 changed files with 74770 additions and 19 deletions
--- a/public/scripts/utils.js
+++ b/public/scripts/utils.js
@@ -1,6 +1,7 @@
 import { getContext } from "./extensions.js";
 import { getRequestHeaders } from "../script.js";
 import { isMobile } from "./RossAscends-mods.js";
+import { collapseNewlines } from "./power-user.js";

 /**
 * Pagination status string template.
@@ -1066,3 +1067,99 @@ export function uuidv4() {
        return v.toString(16);
    });
 }
+
+/**
+ * Cleans up text extracted from a document: unifies line endings, trims every
+ * line, drops empty lines and collapses runs of spaces.
+ * @param {string} text Raw extracted text
+ * @returns {string} The cleaned-up text
+ */
+function postProcessText(text) {
+    // Convert CRLF and lone CR line endings to LF before any line-based step.
+    // Stripping CRs after the split would glue together lines separated by a lone CR.
+    text = text.replace(/\r\n?/g, '\n');
+    // Collapse multiple newlines into one
+    text = collapseNewlines(text);
+    // Trim leading and trailing whitespace, and remove empty lines
+    text = text.split('\n').map(l => l.trim()).filter(Boolean).join('\n');
+    // Normalize unicode spaces
+    text = text.replace(/\u00A0/g, ' ');
+    // Collapse multiple spaces into one (except for newlines)
+    text = text.replace(/ {2,}/g, ' ');
+    // Remove leading and trailing spaces
+    text = text.trim();
+    return text;
+}
+
+/**
+ * Use pdf.js to load and parse text from PDF pages
+ * @param {Blob} blob PDF file blob
+ * @returns {Promise<string>} A promise that resolves to the parsed text.
+ */
+export async function extractTextFromPDF(blob) {
+    /**
+     * Appends a module script tag to the document head.
+     * @param {string} src Script URL
+     * @returns {Promise<Event>} Resolves with the load event, rejects with the error event.
+     */
+    function loadModuleScript(src) {
+        return new Promise((resolve, reject) => {
+            const script = document.createElement('script');
+            script.type = 'module';
+            script.async = true;
+            script.src = src;
+            script.onload = resolve;
+            script.onerror = reject;
+            document.head.appendChild(script);
+        });
+    }
+
+    // pdf.js is only loaded when it is not already available on window
+    if (!('pdfjsLib' in window)) {
+        await Promise.all([
+            loadModuleScript('lib/pdf.worker.mjs'),
+            loadModuleScript('lib/pdf.mjs'),
+        ]);
+    }
+
+    const buffer = await getFileBuffer(blob);
+    const loadingTask = pdfjsLib.getDocument(buffer);
+
+    try {
+        const pdf = await loadingTask.promise;
+        const pages = [];
+        // Pages are requested by number, from 1 up to numPages
+        for (let i = 1; i <= pdf.numPages; i++) {
+            const page = await pdf.getPage(i);
+            const textContent = await page.getTextContent();
+            const text = textContent.items.map(item => item.str).join(' ');
+            pages.push(text);
+        }
+        return postProcessText(pages.join('\n'));
+    } finally {
+        // Release the document and its worker; the extracted strings are all that is needed
+        await loadingTask.destroy();
+    }
+}
+
+/**
+ * Extracts the text content of an HTML document using DOMParser.
+ * @param {Blob} blob Blob holding the HTML markup
+ * @returns {Promise<string>} A promise that resolves to the post-processed text.
+ */
+export async function extractTextFromHTML(blob) {
+    const markup = await blob.text();
+    const parser = new DOMParser();
+    // The markup is sanitized before it is handed to the parser
+    const sanitizedMarkup = DOMPurify.sanitize(markup);
+    const parsedDocument = parser.parseFromString(sanitizedMarkup, 'text/html');
+    return postProcessText(parsedDocument.body.textContent);
+}
+
+/**
+ * Extracts the text content of a Markdown document by rendering it to HTML
+ * with showdown and reading the text of the parsed result.
+ * @param {Blob} blob Blob holding the Markdown source
+ * @returns {Promise<string>} A promise that resolves to the post-processed text.
+ */
+export async function extractTextFromMarkdown(blob) {
+    const source = await blob.text();
+    const renderedHtml = new showdown.Converter().makeHtml(source);
+    const parser = new DOMParser();
+    // The rendered HTML is sanitized before it is handed to the parser
+    const sanitizedHtml = DOMPurify.sanitize(renderedHtml);
+    const parsedDocument = parser.parseFromString(sanitizedHtml, 'text/html');
+    return postProcessText(parsedDocument.body.textContent);
+}