Use Readability to extract text from HTML

This commit is contained in:
Cohee
2024-02-29 16:37:52 +02:00
parent a2ac659056
commit eaeafde0e4
5 changed files with 2453 additions and 10 deletions

View File

@ -1111,11 +1111,13 @@ export function uuidv4() {
});
}
function postProcessText(text) {
function postProcessText(text, collapse = true) {
// Collapse multiple newlines into one
text = collapseNewlines(text);
// Trim leading and trailing whitespace, and remove empty lines
text = text.split('\n').map(l => l.trim()).filter(Boolean).join('\n');
if (collapse) {
text = collapseNewlines(text);
// Trim leading and trailing whitespace, and remove empty lines
text = text.split('\n').map(l => l.trim()).filter(Boolean).join('\n');
}
// Remove carriage returns
text = text.replace(/\r/g, '');
// Normalize unicode spaces
@ -1127,6 +1129,25 @@ function postProcessText(text) {
return text;
}
/**
 * Uses Readability.js to parse the text from a web page.
 * Falls back to the text of the elements matched by `textSelector` when the page
 * is not considered readerable, or when Readability does not produce an article.
 * @param {Document} document HTML document
 * @param {string} [textSelector='body'] The fallback selector for the text to parse.
 * @returns {Promise<string>} A promise that resolves to the parsed text.
 */
export async function getReadableText(document, textSelector = 'body') {
    if (isProbablyReaderable(document)) {
        // Readability.parse() rewrites the DOM it is given, so run it on a deep copy
        // to keep the original document intact for the selector fallback below.
        const documentClone = /** @type {Document} */ (document.cloneNode(true));
        const article = new Readability(documentClone).parse();
        // parse() returns null when no article content could be extracted;
        // in that case fall through to the selector-based extraction.
        if (article && typeof article.textContent === 'string') {
            return postProcessText(article.textContent, false);
        }
    }

    const elements = document.querySelectorAll(textSelector);
    const rawText = Array.from(elements).map(e => e.textContent).join('\n');
    return postProcessText(rawText);
}
/**
* Use pdf.js to load and parse text from PDF pages
* @param {Blob} blob PDF file blob
@ -1188,10 +1209,7 @@ export async function extractTextFromHTML(blob, textSelector = 'body') {
const html = await blob.text();
const domParser = new DOMParser();
const document = domParser.parseFromString(DOMPurify.sanitize(html), 'text/html');
const elements = document.querySelectorAll(textSelector);
const rawText = Array.from(elements).map(e => e.textContent).join('\n');
const text = postProcessText(rawText);
return text;
return await getReadableText(document, textSelector);
}
/**
@ -1205,6 +1223,6 @@ export async function extractTextFromMarkdown(blob) {
const html = converter.makeHtml(markdown);
const domParser = new DOMParser();
const document = domParser.parseFromString(DOMPurify.sanitize(html), 'text/html');
const text = postProcessText(document.body.textContent);
const text = postProcessText(document.body.textContent, false);
return text;
}