Add proper LLaMA tokenizer and tokenizer switch. Remove legacy code.

This commit is contained in:
SillyLossy
2023-04-26 00:09:45 +03:00
parent 655885b1a2
commit 2ae28023c0
6 changed files with 77 additions and 193 deletions

View File

@ -1037,6 +1037,15 @@
</div> </div>
<div name="ContextFormatting"> <div name="ContextFormatting">
<h4>Context Formatting</h4> <h4>Context Formatting</h4>
<div>
<h4>Tokenizer</h4>
<select id="tokenizer">
<option value="0">None / Estimated</option>
<option value="1">GPT-3 (OpenAI)</option>
<option value="2">GPT-3 (Alternative / Classic)</option>
<option value="3">Sentencepiece (LLaMA)</option>
</select>
</div>
<label class="checkbox_label" for="always-force-name2-checkbox"> <label class="checkbox_label" for="always-force-name2-checkbox">
<input id="always-force-name2-checkbox" type="checkbox" /> <input id="always-force-name2-checkbox" type="checkbox" />
Always add character's name to prompt Always add character's name to prompt

View File

@ -48,6 +48,7 @@ import {
sortCharactersList, sortCharactersList,
power_user, power_user,
pygmalion_options, pygmalion_options,
tokenizers,
} from "./scripts/power-user.js"; } from "./scripts/power-user.js";
import { import {
@ -335,14 +336,31 @@ $(document).ajaxError(function myErrorHandler(_, xhr) {
}); });
/**
 * Counts the tokens in a string using the tokenizer selected in the power-user settings.
 *
 * @param {string} str - Text to measure.
 * @param {number} [padding=0] - Extra tokens added to the result.
 * @returns {number} Token count plus padding.
 */
function getTokenCount(str, padding = 0) {
    // Length-based estimate; also used as the fallback when no real count is available.
    const estimate = () => Math.ceil(str.length / CHARACTERS_PER_TOKEN_RATIO) + padding;

    switch (power_user.tokenizer) {
        case tokenizers.NONE:
            return estimate();
        case tokenizers.GPT3:
            return gpt3.encode(str).bpe.length + padding;
        case tokenizers.CLASSIC:
            return encode(str).length + padding;
        case tokenizers.LLAMA: {
            let tokenCount = null;
            // Synchronous on purpose: callers expect a plain number, not a promise.
            jQuery.ajax({
                async: false,
                type: 'POST',
                url: `/tokenize_llama`,
                data: JSON.stringify({ text: str }),
                dataType: "json",
                contentType: "application/json",
                success: function (data) {
                    tokenCount = data.count;
                }
            });
            // A failed request must not be reported as "0 tokens"; estimate instead.
            return typeof tokenCount === 'number' ? tokenCount + padding : estimate();
        }
        default:
            // Unknown or corrupted setting value: never return undefined to callers.
            return estimate();
    }
}
// Characters-per-token ratio that getTokenCount uses to estimate a token count from string length.
const CHARACTERS_PER_TOKEN_RATIO = 3.35;
const talkativeness_default = 0.5; const talkativeness_default = 0.5;
var is_advanced_char_open = false; var is_advanced_char_open = false;

View File

@ -6,8 +6,6 @@ import {
token, token,
getStatus, getStatus,
} from "../script.js"; } from "../script.js";
import { delay } from "./utils.js";
export { export {
loadPowerUserSettings, loadPowerUserSettings,
@ -16,6 +14,7 @@ export {
sortCharactersList, sortCharactersList,
power_user, power_user,
pygmalion_options, pygmalion_options,
tokenizers,
}; };
const avatar_styles = { const avatar_styles = {
@ -39,7 +38,15 @@ const pygmalion_options = {
ENABLED: 1, ENABLED: 1,
} }
/**
 * Identifiers of the selectable tokenizers.
 * The numeric values match the `value` attributes of the `#tokenizer` <select> options.
 * Frozen so the shared, exported mapping cannot be altered by accident.
 */
const tokenizers = Object.freeze({
    NONE: 0,
    GPT3: 1,
    CLASSIC: 2,
    LLAMA: 3,
});
let power_user = { let power_user = {
tokenizer: tokenizers.CLASSIC,
collapse_newlines: false, collapse_newlines: false,
pygmalion_formatting: pygmalion_options.AUTO, pygmalion_formatting: pygmalion_options.AUTO,
pin_examples: false, pin_examples: false,
@ -97,7 +104,6 @@ const storage_keys = {
movingUI: "TavernAI_movingUI", movingUI: "TavernAI_movingUI",
}; };
const chat = document.getElementById('chat');
let browser_has_focus = true; let browser_has_focus = true;
function playMessageSound() { function playMessageSound() {
@ -260,6 +266,7 @@ function loadPowerUserSettings(settings, data) {
power_user.font_scale = Number(localStorage.getItem(storage_keys.font_scale) ?? 1); power_user.font_scale = Number(localStorage.getItem(storage_keys.font_scale) ?? 1);
power_user.blur_strength = Number(localStorage.getItem(storage_keys.blur_strength) ?? 10); power_user.blur_strength = Number(localStorage.getItem(storage_keys.blur_strength) ?? 10);
$(`#tokenizer option[value="${power_user.tokenizer}"]`).attr('selected', true);
$(`#pygmalion_formatting option[value=${power_user.pygmalion_formatting}]`).attr("selected", true); $(`#pygmalion_formatting option[value=${power_user.pygmalion_formatting}]`).attr("selected", true);
$("#collapse-newlines-checkbox").prop("checked", power_user.collapse_newlines); $("#collapse-newlines-checkbox").prop("checked", power_user.collapse_newlines);
$("#pin-examples-checkbox").prop("checked", power_user.pin_examples); $("#pin-examples-checkbox").prop("checked", power_user.pin_examples);
@ -584,6 +591,12 @@ $(document).ready(() => {
saveSettingsDebounced(); saveSettingsDebounced();
}); });
// Store the tokenizer picked in the dropdown (as a number) and schedule a settings save.
$("#tokenizer").on('change', function () {
    const selectedOption = $(this).find(':selected');
    power_user.tokenizer = Number(selectedOption.val());
    saveSettingsDebounced();
});
$(window).on('focus', function () { $(window).on('focus', function () {
browser_has_focus = true; browser_has_focus = true;
}); });

206
server.js
View File

@ -119,6 +119,16 @@ let api_key_openai;
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)) const delay = ms => new Promise(resolve => setTimeout(resolve, ms))
// SentencePiece bindings used for LLaMA token counting (see countTokensLlama).
const { SentencePieceProcessor, cleanText } = require("./src/sentencepiece/sentencepiece.min.js");
// Shared processor instance; its model file is loaded via spp.load() in setupTasks.
let spp = new SentencePieceProcessor();
/**
 * Counts the tokens the shared SentencePiece processor produces for a text.
 *
 * @param {string} text - Raw text; it is passed through cleanText() before encoding.
 * @returns {Promise<number>} Number of token ids returned by spp.encodeIds().
 */
async function countTokensLlama(text) {
    return spp.encodeIds(cleanText(text)).length;
}
function humanizedISO8601DateTime() { function humanizedISO8601DateTime() {
let baseDate = new Date(Date.now()); let baseDate = new Date(Date.now());
let humanYear = baseDate.getFullYear(); let humanYear = baseDate.getFullYear();
@ -2273,6 +2283,15 @@ app.post("/savepreset_openai", jsonParser, function (request, response) {
return response.send({ name }); return response.send({ name });
}); });
/**
 * POST /tokenize_llama
 * Body: { text: string }. Responds with { count } — the LLaMA token count of `text`.
 * Responds 400 when the body or `text` is missing/not a string, 500 when tokenization fails.
 */
app.post("/tokenize_llama", jsonParser, async function (request, response) {
    if (!request.body || typeof request.body.text !== 'string') {
        return response.sendStatus(400);
    }

    try {
        const count = await countTokensLlama(request.body.text);
        return response.send({ count });
    } catch (error) {
        // A rejection inside an async handler would otherwise leave the request
        // unanswered, so always reply.
        console.error(error);
        return response.sendStatus(500);
    }
});
// ** REST CLIENT ASYNC WRAPPERS ** // ** REST CLIENT ASYNC WRAPPERS **
function deleteAsync(url, args) { function deleteAsync(url, args) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
@ -2338,6 +2357,8 @@ const setupTasks = async function () {
// Colab users could run the embedded tool // Colab users could run the embedded tool
if (!is_colab) await convertWebp(); if (!is_colab) await convertWebp();
await spp.load(`./src/sentencepiece/tokenizer.model`);
console.log('Launching...'); console.log('Launching...');
if (autorun) open(autorunUrl.toString()); if (autorun) open(autorunUrl.toString());
@ -2346,10 +2367,6 @@ const setupTasks = async function () {
!config.whitelistMode && !config.whitelistMode &&
!config.basicAuthMode) !config.basicAuthMode)
console.log('Your SillyTavern is currently open to the public. To increase security, consider enabling whitelisting or basic authentication.') console.log('Your SillyTavern is currently open to the public. To increase security, consider enabling whitelisting or basic authentication.')
if (fs.existsSync('public/characters/update.txt') && !is_colab) {
convertStage1();
}
} }
if (true === cliArguments.ssl) if (true === cliArguments.ssl)
@ -2370,187 +2387,6 @@ else
setupTasks setupTasks
); );
//#####################CONVERTING IN NEW FORMAT########################
// Module-level scratch state shared by the conversion helpers below.
// Raw JSON text of each collected legacy character file, keyed by a running index.
var charactersB = {};//B - Backup
// Next free index into charactersB / directoriesB.
var character_ib = 0;
// Directory name of each collected character, keyed by the same index as charactersB.
var directoriesB = {};
/**
 * First stage of the legacy character-format conversion.
 * Lists the directories under public/characters, resets the collected
 * character state and starts the per-directory walk in getCharacterFile2.
 */
function convertStage1() {
    const directories = getDirectories2("public/characters");
    charactersB = {};
    character_ib = 0;
    getCharacterFile2(directories, 0);
}
/**
 * Second stage of the legacy character-format conversion.
 * For every entry collected in charactersB/directoriesB it stamps a create_date,
 * hands the character to charaWrite(), and rewrites each file in that character's
 * "chats" folder as newline-delimited JSON under chatsPath + character name.
 * Afterwards it deletes the public/characters/update.txt marker and prints the
 * old folders that were processed.
 */
function convertStage2() {
// True until the first character is processed; `mes == false` later means "something was converted".
var mes = true;
for (const key in directoriesB) {
if (mes) {
console.log('***');
console.log('The update of the character format has begun...');
console.log('***');
mes = false;
}
// charactersB holds the raw JSON text; parse, stamp the creation date, and store it back as text.
var char = JSON.parse(charactersB[key]);
char.create_date = humanizedISO8601DateTime();
charactersB[key] = JSON.stringify(char);
// Default avatar unless the character declares one.
var avatar = 'public/img/ai4.png';
if (char.avatar !== 'none') {
// NOTE(review): this path uses char.name while the chats below use directoriesB[key] — confirm they always match.
avatar = 'public/characters/' + char.name + '/avatars/' + char.avatar;
}
charaWrite(avatar, charactersB[key], directoriesB[key]);
const files = fs.readdirSync('public/characters/' + directoriesB[key] + '/chats');
// Make sure the destination chat folder exists.
if (!fs.existsSync(chatsPath + char.name)) {
fs.mkdirSync(chatsPath + char.name);
}
files.forEach(function (file) {
// Read the contents of the file
const fileContents = fs.readFileSync('public/characters/' + directoriesB[key] + '/chats/' + file, 'utf8');
// Parse the whole file as one JSON value; it is used below as an array of message strings.
let chat_data = JSON.parse(fileContents);
let new_chat_data = [];
let this_chat_user_name = 'You';
// Set when the first entry is a '<username-holder>' header carrying the user's name.
let is_pass_0 = false;
if (chat_data[0].indexOf('<username-holder>') !== -1) {
this_chat_user_name = chat_data[0].substr('<username-holder>'.length, chat_data[0].length);
is_pass_0 = true;
}
// i counts source entries (starting at 1), ii is the next write index in new_chat_data.
let i = 0;
let ii = 0;
// First output line is a header record.
// NOTE(review): user_name is always 'You' here even when this_chat_user_name was parsed — confirm intended.
new_chat_data[i] = { user_name: 'You', character_name: char.name, create_date: humanizedISO8601DateTime() };
i++;
ii++;
// The callback parameter `mes` shadows the outer `mes` flag inside this function.
chat_data.forEach(function (mes) {
// Skip the first source entry when it was the username header.
if (!(i === 1 && is_pass_0)) {
// Skip marker lines; everything else becomes a message record.
if (mes.indexOf('<username-holder>') === -1 && mes.indexOf('<username-idkey>') === -1) {
new_chat_data[ii] = {};
let is_name = false;
// A line not starting with "<user name>:" is attributed to the character.
if (mes.trim().indexOf(this_chat_user_name + ':') !== 0) {
if (mes.trim().indexOf(char.name + ':') === 0) {
mes = mes.replace(char.name + ':', '');
is_name = true;
}
new_chat_data[ii]['name'] = char.name;
new_chat_data[ii]['is_user'] = false;
new_chat_data[ii]['is_name'] = is_name;
new_chat_data[ii]['send_date'] = humanizedISO8601DateTime(); //Date.now();
} else {
mes = mes.replace(this_chat_user_name + ':', '');
new_chat_data[ii]['name'] = 'You';
new_chat_data[ii]['is_user'] = true;
new_chat_data[ii]['is_name'] = true;
new_chat_data[ii]['send_date'] = humanizedISO8601DateTime(); //Date.now();
}
new_chat_data[ii]['mes'] = mes.trim();
ii++;
}
}
i++;
});
// One JSON object per line.
const jsonlData = new_chat_data.map(JSON.stringify).join('\n');
// Write the contents to the destination folder
// The file keeps its name with an extra 'l' appended (presumably .json -> .jsonl; verify against the source files).
//console.log('convertstage2 writing a file: '+chatsPath+char.name+'/' + file+'l');
fs.writeFileSync(chatsPath + char.name + '/' + file + 'l', jsonlData);
});
//fs.rmSync('public/characters/'+directoriesB[key],{ recursive: true });
console.log(char.name + ' update!');
}
//removeFolders('public/characters');
// Remove the marker file; the caller checks for it before starting the conversion.
fs.unlinkSync('public/characters/update.txt');
if (mes == false) {
console.log('***');
console.log('Сharacter format update completed successfully!');
console.log('***');
console.log('Now you can delete these folders, they are no longer used by TavernAI:');
}
// List the legacy folders that were processed.
for (const key in directoriesB) {
console.log('public/characters/' + directoriesB[key]);
}
}
/**
 * Recursively removes every sub-directory of `folder`.
 * Files are left untouched, so a sub-directory that contains files cannot be
 * removed and fs.rmdirSync will throw for it.
 *
 * @param {string} folder - Directory whose sub-directories are removed.
 */
function removeFolders(folder) {
    for (const entry of fs.readdirSync(folder)) {
        const entryPath = `${folder}/${entry}`;
        if (!fs.statSync(entryPath).isDirectory()) {
            continue;
        }
        // Empty the sub-directory of its own sub-directories first, then drop it.
        removeFolders(entryPath);
        fs.rmdirSync(entryPath);
    }
}
/**
 * Recursively copies the contents of `src` into the existing directory `dest`.
 * Sub-directories are created with fs.mkdirSync, which throws if they already exist.
 *
 * @param {string} src - Directory to copy from.
 * @param {string} dest - Existing directory to copy into.
 */
function copyFolder(src, dest) {
    for (const entry of fs.readdirSync(src)) {
        const from = `${src}/${entry}`;
        const to = `${dest}/${entry}`;
        const info = fs.statSync(from);

        if (info.isFile()) {
            fs.copyFileSync(from, to);
            continue;
        }

        if (info.isDirectory()) {
            fs.mkdirSync(to);
            copyFolder(from, to);
        }
    }
}
/**
 * Lists the sub-directories of `path`, most recently modified first.
 * Each entry is stat'ed exactly once; the previous version called statSync
 * again inside the sort comparator for every comparison.
 *
 * @param {string} path - Directory to scan.
 * @returns {string[]} Names of the sub-directories, newest modification time first.
 */
function getDirectories2(path) {
    const found = [];
    for (const name of fs.readdirSync(path)) {
        const stat = fs.statSync(path + '/' + name);
        if (stat.isDirectory()) {
            found.push({ name, modified: stat.mtime.getTime() });
        }
    }

    // Ascending stable sort followed by reverse keeps the original ordering, ties included.
    return found
        .sort((a, b) => a.modified - b.modified)
        .reverse()
        .map((entry) => entry.name);
}
/**
 * Walks `directories` one entry at a time starting at index `i`, collecting
 * legacy character JSON files into charactersB/directoriesB.
 * A directory is collected when public/characters/<dir>/<dir>.json can be read
 * and no public/characters/<dir>.png exists. When the list is exhausted,
 * convertStage2() is called. A read error is logged and stops the walk.
 *
 * @param {string[]} directories - Directory names under public/characters.
 * @param {number} i - Index of the directory to inspect next.
 */
function getCharacterFile2(directories, i) {
    // Past the last directory: hand over to the second conversion stage.
    if (!(directories.length > i)) {
        convertStage2();
        return;
    }

    const dirName = directories[i];
    const jsonPath = 'public/characters/' + dirName + '/' + dirName + ".json";

    fs.stat(jsonPath, (statError) => {
        // No readable JSON file for this directory: move on to the next one.
        if (statError != null) {
            getCharacterFile2(directories, i + 1);
            return;
        }

        fs.readFile(jsonPath, 'utf8', (readError, contents) => {
            if (readError) {
                console.error(readError);
                return;
            }
            // Only collect characters that do not have a .png next to their folder.
            if (!fs.existsSync('public/characters/' + dirName + '.png')) {
                charactersB[character_ib] = contents;
                directoriesB[character_ib] = dirName;
                character_ib++;
            }
            getCharacterFile2(directories, i + 1);
        });
    });
}
async function convertWebp() { async function convertWebp() {
const files = fs.readdirSync(directories.characters).filter(e => e.endsWith(".webp")); const files = fs.readdirSync(directories.characters).filter(e => e.endsWith(".webp"));

File diff suppressed because one or more lines are too long

Binary file not shown.