perf: terminate tesseract worker after a delay (#1449)

fixes #1447
This commit is contained in:
Nolan Lawson 2019-08-29 08:51:41 -07:00 committed by GitHub
parent b01191037e
commit 56f266cb93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 8 deletions

View File

@ -1,5 +1,7 @@
import { importTesseractWorker } from '../_utils/asyncModules'
const DESTROY_WORKER_DELAY = 300000 // 5 minutes
// TODO: it's flaky to try to estimate tesseract's total progress this way
const steps = [
{ status: 'loading tesseract core', proportion: 0.05 },
@ -9,6 +11,36 @@ const steps = [
{ status: 'recognizing text', proportion: 0.6 }
]
let worker
let destroyWorkerHandle
/**
 * Lazily create the shared tesseract worker and store it in the
 * module-level `worker` variable.
 *
 * `importTesseractWorker()` resolves to a factory function; calling that
 * factory produces the worker instance. If a worker already exists this
 * function does nothing.
 *
 * @returns {Promise<void>} resolves once `worker` is populated
 */
async function initWorker () {
  if (worker) {
    return
  }
  const createWorker = await importTesseractWorker()
  // A concurrent caller may have created the worker while the import was
  // pending. Re-check so we never overwrite (and thereby leak) a live worker.
  if (!worker) {
    worker = createWorker()
  }
}
/**
 * Terminate the shared tesseract worker, if there is one, and reset the
 * module-level `worker` reference to `null`.
 *
 * The log line is emitted on every call, whether or not a worker exists.
 */
function destroyWorker () {
  console.log('destroying tesseract worker')
  if (!worker) {
    return
  }
  worker.terminate()
  worker = null
}
/**
 * Arrange for the worker to be destroyed after DESTROY_WORKER_DELAY
 * milliseconds, to reduce memory usage.
 *
 * Any previously scheduled destruction is cancelled first, so at most one
 * timer is pending; its handle is kept in `destroyWorkerHandle`.
 */
function scheduleDestroyWorker () {
  cancelDestroyWorker()
  const timerHandle = setTimeout(destroyWorker, DESTROY_WORKER_DELAY)
  destroyWorkerHandle = timerHandle
}
/**
 * Cancel a pending worker destruction, if one is scheduled, and reset
 * `destroyWorkerHandle` to `null`. Does nothing when no handle is stored.
 */
function cancelDestroyWorker () {
  if (!destroyWorkerHandle) {
    return
  }
  clearTimeout(destroyWorkerHandle)
  destroyWorkerHandle = null
}
function getTotalProgress (progressInfo) {
const idx = steps.findIndex(({ status }) => progressInfo.status === status)
let total = 0
@ -19,9 +51,7 @@ function getTotalProgress (progressInfo) {
return total
}
export async function runTesseract (url, onProgress) {
const worker = await importTesseractWorker()
function recognize (url, onProgress) {
// TODO: have to trick tesseract into not creating a blob URL because that would break our CSP
// see https://github.com/naptha/tesseract.js/pull/322
let promise
@ -38,6 +68,16 @@ export async function runTesseract (url, onProgress) {
onProgress(getTotalProgress(progressInfo))
}
})
const res = await promise
return res.text
return promise
}
/**
 * Run text recognition on `url` and resolve with the recognized text.
 *
 * Any pending worker destruction is cancelled before the worker is
 * initialized. Once recognition settles, successfully or not, destruction
 * is scheduled again.
 *
 * @param url - passed through to `recognize`
 * @param onProgress - passed through to `recognize`
 * @returns {Promise} the `text` property of the recognition result
 */
export async function runTesseract (url, onProgress) {
  cancelDestroyWorker()
  await initWorker()
  try {
    const result = await recognize(url, onProgress)
    return result.text
  } finally {
    // Always re-arm the destroy timer, even when recognition fails.
    scheduleDestroyWorker()
  }
}

View File

@ -12,10 +12,9 @@ import { TesseractWorker } from 'tesseract.js'
// which seems excessive. So we just live with the bug for now.
// https://github.com/naptha/tesseract.js/issues/325
const { origin } = location
const tesseractWorker = new TesseractWorker({
export default () => new TesseractWorker({
workerPath: `${origin}/${workerPath}`,
langPath: `${origin}/`,
corePath: `${origin}/${corePath}`
})
export default tesseractWorker