Install sentencepiece tokenizer from npm

SillyLossy
2023-05-23 22:27:37 +03:00
parent f813d5c225
commit 4feebd0ba1
4 changed files with 44 additions and 17 deletions

package-lock.json (generated)

@@ -35,6 +35,7 @@
         "png-chunks-extract": "^1.0.0",
         "rimraf": "^3.0.2",
         "sanitize-filename": "^1.6.3",
+        "sentencepiece-js": "^1.1.0",
         "uniqolor": "^1.1.0",
         "webp-converter": "2.3.2",
         "ws": "^8.13.0",
@@ -634,6 +635,14 @@
       "version": "1.1.0",
       "license": "MIT"
     },
+    "node_modules/app-root-path": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/app-root-path/-/app-root-path-3.1.0.tgz",
+      "integrity": "sha512-biN3PwB2gUtjaYy/isrU3aNWI5w+fAfvHkSvCKeQGxhmYpwKFUxudR3Yya+KqVRHBmEDYh+/lTozYCFbmzX4nA==",
+      "engines": {
+        "node": ">= 6.0.0"
+      }
+    },
     "node_modules/append-field": {
       "version": "1.0.0",
       "license": "MIT"
@@ -2781,6 +2790,14 @@
       "version": "2.1.3",
       "license": "MIT"
     },
+    "node_modules/sentencepiece-js": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/sentencepiece-js/-/sentencepiece-js-1.1.0.tgz",
+      "integrity": "sha512-HN6teKCRO9tz37zbaNI3i+vMZ/JRWDt6kmZ7OVpzQv1jZHyYNmf5tE7CFpIYN86+y9TLB0cuscMdA3OHhT/MhQ==",
+      "dependencies": {
+        "app-root-path": "^3.1.0"
+      }
+    },
     "node_modules/serve-static": {
       "version": "1.15.0",
       "license": "MIT",

package.json

@@ -26,6 +26,7 @@
     "png-chunks-extract": "^1.0.0",
     "rimraf": "^3.0.2",
     "sanitize-filename": "^1.6.3",
+    "sentencepiece-js": "^1.1.0",
    "uniqolor": "^1.1.0",
     "webp-converter": "2.3.2",
     "ws": "^8.13.0",

server.js

@@ -133,10 +133,27 @@ let response_getstatus_openai;
 const delay = ms => new Promise(resolve => setTimeout(resolve, ms))
-const { SentencePieceProcessor, cleanText } = require("./src/sentencepiece/sentencepiece.min.js");
-let spp = new SentencePieceProcessor();
+const { SentencePieceProcessor, cleanText } = require("sentencepiece-js");
+let spp;
+
+async function loadSentencepieceTokenizer() {
+    try {
+        const spp = new SentencePieceProcessor();
+        await spp.load("src/sentencepiece/tokenizer.model");
+        return spp;
+    } catch (error) {
+        console.error("Sentencepiece tokenizer failed to load.");
+        return null;
+    }
+};
+
 async function countTokensLlama(text) {
+    // Fallback to strlen estimation
+    if (!spp) {
+        return Math.ceil(text.length / 3.35);
+    }
+
     let cleaned = cleanText(text);
     let ids = spp.encodeIds(cleaned);
@@ -2676,7 +2693,7 @@ const setupTasks = async function () {
     // Colab users could run the embedded tool
     if (!is_colab) await convertWebp();
-    await spp.load(`./src/sentencepiece/tokenizer.model`);
+    spp = await loadSentencepieceTokenizer();
     console.log('Launching...');

File diff suppressed because one or more lines are too long