mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Add proper LLaMA tokenizer and tokenizer switch. Remove legacy code.
This commit is contained in:
@ -1037,6 +1037,15 @@
|
|||||||
</div>
|
</div>
|
||||||
<div name="ContextFormatting">
|
<div name="ContextFormatting">
|
||||||
<h4>Context Formatting</h4>
|
<h4>Context Formatting</h4>
|
||||||
|
<div>
|
||||||
|
<h4>Tokenizer</h4>
|
||||||
|
<select id="tokenizer">
|
||||||
|
<option value="0">None / Estimated</option>
|
||||||
|
<option value="1">GPT-3 (OpenAI)</option>
|
||||||
|
<option value="2">GPT-3 (Alternative / Classic)</option>
|
||||||
|
<option value="3">Sentencepiece (LLaMA)</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
<label class="checkbox_label" for="always-force-name2-checkbox">
|
<label class="checkbox_label" for="always-force-name2-checkbox">
|
||||||
<input id="always-force-name2-checkbox" type="checkbox" />
|
<input id="always-force-name2-checkbox" type="checkbox" />
|
||||||
Always add character's name to prompt
|
Always add character's name to prompt
|
||||||
|
@ -48,6 +48,7 @@ import {
|
|||||||
sortCharactersList,
|
sortCharactersList,
|
||||||
power_user,
|
power_user,
|
||||||
pygmalion_options,
|
pygmalion_options,
|
||||||
|
tokenizers,
|
||||||
} from "./scripts/power-user.js";
|
} from "./scripts/power-user.js";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
@ -335,14 +336,31 @@ $(document).ajaxError(function myErrorHandler(_, xhr) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
function getTokenCount(str, padding = 0) {
|
function getTokenCount(str, padding = 0) {
|
||||||
if (main_api == 'poe' || main_api == 'openai') {
|
switch (power_user.tokenizer) {
|
||||||
|
case tokenizers.NONE:
|
||||||
|
return Math.ceil(str.length / CHARACTERS_PER_TOKEN_RATIO) + padding;
|
||||||
|
case tokenizers.GPT3:
|
||||||
return gpt3.encode(str).bpe.length + padding;
|
return gpt3.encode(str).bpe.length + padding;
|
||||||
}
|
case tokenizers.CLASSIC:
|
||||||
else {
|
|
||||||
return encode(str).length + padding;
|
return encode(str).length + padding;
|
||||||
|
case tokenizers.LLAMA:
|
||||||
|
let tokenCount = 0;
|
||||||
|
jQuery.ajax({
|
||||||
|
async: false,
|
||||||
|
type: 'POST', //
|
||||||
|
url: `/tokenize_llama`,
|
||||||
|
data: JSON.stringify({ text: str }),
|
||||||
|
dataType: "json",
|
||||||
|
contentType: "application/json",
|
||||||
|
success: function (data) {
|
||||||
|
tokenCount = data.count;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return tokenCount + padding;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const CHARACTERS_PER_TOKEN_RATIO = 3.35;
|
||||||
const talkativeness_default = 0.5;
|
const talkativeness_default = 0.5;
|
||||||
|
|
||||||
var is_advanced_char_open = false;
|
var is_advanced_char_open = false;
|
||||||
|
@ -6,8 +6,6 @@ import {
|
|||||||
token,
|
token,
|
||||||
getStatus,
|
getStatus,
|
||||||
} from "../script.js";
|
} from "../script.js";
|
||||||
import { delay } from "./utils.js";
|
|
||||||
|
|
||||||
|
|
||||||
export {
|
export {
|
||||||
loadPowerUserSettings,
|
loadPowerUserSettings,
|
||||||
@ -16,6 +14,7 @@ export {
|
|||||||
sortCharactersList,
|
sortCharactersList,
|
||||||
power_user,
|
power_user,
|
||||||
pygmalion_options,
|
pygmalion_options,
|
||||||
|
tokenizers,
|
||||||
};
|
};
|
||||||
|
|
||||||
const avatar_styles = {
|
const avatar_styles = {
|
||||||
@ -39,7 +38,15 @@ const pygmalion_options = {
|
|||||||
ENABLED: 1,
|
ENABLED: 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const tokenizers = {
|
||||||
|
NONE: 0,
|
||||||
|
GPT3: 1,
|
||||||
|
CLASSIC: 2,
|
||||||
|
LLAMA: 3,
|
||||||
|
}
|
||||||
|
|
||||||
let power_user = {
|
let power_user = {
|
||||||
|
tokenizer: tokenizers.CLASSIC,
|
||||||
collapse_newlines: false,
|
collapse_newlines: false,
|
||||||
pygmalion_formatting: pygmalion_options.AUTO,
|
pygmalion_formatting: pygmalion_options.AUTO,
|
||||||
pin_examples: false,
|
pin_examples: false,
|
||||||
@ -97,7 +104,6 @@ const storage_keys = {
|
|||||||
movingUI: "TavernAI_movingUI",
|
movingUI: "TavernAI_movingUI",
|
||||||
};
|
};
|
||||||
|
|
||||||
const chat = document.getElementById('chat');
|
|
||||||
let browser_has_focus = true;
|
let browser_has_focus = true;
|
||||||
|
|
||||||
function playMessageSound() {
|
function playMessageSound() {
|
||||||
@ -260,6 +266,7 @@ function loadPowerUserSettings(settings, data) {
|
|||||||
power_user.font_scale = Number(localStorage.getItem(storage_keys.font_scale) ?? 1);
|
power_user.font_scale = Number(localStorage.getItem(storage_keys.font_scale) ?? 1);
|
||||||
power_user.blur_strength = Number(localStorage.getItem(storage_keys.blur_strength) ?? 10);
|
power_user.blur_strength = Number(localStorage.getItem(storage_keys.blur_strength) ?? 10);
|
||||||
|
|
||||||
|
$(`#tokenizer option[value="${power_user.tokenizer}"]`).attr('selected', true);
|
||||||
$(`#pygmalion_formatting option[value=${power_user.pygmalion_formatting}]`).attr("selected", true);
|
$(`#pygmalion_formatting option[value=${power_user.pygmalion_formatting}]`).attr("selected", true);
|
||||||
$("#collapse-newlines-checkbox").prop("checked", power_user.collapse_newlines);
|
$("#collapse-newlines-checkbox").prop("checked", power_user.collapse_newlines);
|
||||||
$("#pin-examples-checkbox").prop("checked", power_user.pin_examples);
|
$("#pin-examples-checkbox").prop("checked", power_user.pin_examples);
|
||||||
@ -584,6 +591,12 @@ $(document).ready(() => {
|
|||||||
saveSettingsDebounced();
|
saveSettingsDebounced();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
$("#tokenizer").on('change', function () {
|
||||||
|
const value = $(this).find(':selected').val();
|
||||||
|
power_user.tokenizer = Number(value);
|
||||||
|
saveSettingsDebounced();
|
||||||
|
});
|
||||||
|
|
||||||
$(window).on('focus', function () {
|
$(window).on('focus', function () {
|
||||||
browser_has_focus = true;
|
browser_has_focus = true;
|
||||||
});
|
});
|
||||||
|
206
server.js
206
server.js
@ -119,6 +119,16 @@ let api_key_openai;
|
|||||||
|
|
||||||
const delay = ms => new Promise(resolve => setTimeout(resolve, ms))
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms))
|
||||||
|
|
||||||
|
const { SentencePieceProcessor, cleanText } = require("./src/sentencepiece/sentencepiece.min.js");
|
||||||
|
let spp = new SentencePieceProcessor();
|
||||||
|
|
||||||
|
async function countTokensLlama(text) {
|
||||||
|
let cleaned = cleanText(text);
|
||||||
|
|
||||||
|
let ids = spp.encodeIds(cleaned);
|
||||||
|
return ids.length;
|
||||||
|
}
|
||||||
|
|
||||||
function humanizedISO8601DateTime() {
|
function humanizedISO8601DateTime() {
|
||||||
let baseDate = new Date(Date.now());
|
let baseDate = new Date(Date.now());
|
||||||
let humanYear = baseDate.getFullYear();
|
let humanYear = baseDate.getFullYear();
|
||||||
@ -2273,6 +2283,15 @@ app.post("/savepreset_openai", jsonParser, function (request, response) {
|
|||||||
return response.send({ name });
|
return response.send({ name });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
app.post("/tokenize_llama", jsonParser, async function (request, response) {
|
||||||
|
if (!request.body) {
|
||||||
|
return response.sendStatus(400);
|
||||||
|
}
|
||||||
|
|
||||||
|
const count = await countTokensLlama(request.body.text);
|
||||||
|
return response.send({ count });
|
||||||
|
});
|
||||||
|
|
||||||
// ** REST CLIENT ASYNC WRAPPERS **
|
// ** REST CLIENT ASYNC WRAPPERS **
|
||||||
function deleteAsync(url, args) {
|
function deleteAsync(url, args) {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
@ -2338,6 +2357,8 @@ const setupTasks = async function () {
|
|||||||
// Colab users could run the embedded tool
|
// Colab users could run the embedded tool
|
||||||
if (!is_colab) await convertWebp();
|
if (!is_colab) await convertWebp();
|
||||||
|
|
||||||
|
await spp.load(`./src/sentencepiece/tokenizer.model`);
|
||||||
|
|
||||||
console.log('Launching...');
|
console.log('Launching...');
|
||||||
|
|
||||||
if (autorun) open(autorunUrl.toString());
|
if (autorun) open(autorunUrl.toString());
|
||||||
@ -2346,10 +2367,6 @@ const setupTasks = async function () {
|
|||||||
!config.whitelistMode &&
|
!config.whitelistMode &&
|
||||||
!config.basicAuthMode)
|
!config.basicAuthMode)
|
||||||
console.log('Your SillyTavern is currently open to the public. To increase security, consider enabling whitelisting or basic authentication.')
|
console.log('Your SillyTavern is currently open to the public. To increase security, consider enabling whitelisting or basic authentication.')
|
||||||
|
|
||||||
if (fs.existsSync('public/characters/update.txt') && !is_colab) {
|
|
||||||
convertStage1();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (true === cliArguments.ssl)
|
if (true === cliArguments.ssl)
|
||||||
@ -2370,187 +2387,6 @@ else
|
|||||||
setupTasks
|
setupTasks
|
||||||
);
|
);
|
||||||
|
|
||||||
//#####################CONVERTING IN NEW FORMAT########################
|
|
||||||
|
|
||||||
var charactersB = {};//B - Backup
|
|
||||||
var character_ib = 0;
|
|
||||||
|
|
||||||
var directoriesB = {};
|
|
||||||
|
|
||||||
|
|
||||||
function convertStage1() {
|
|
||||||
//if (!fs.existsSync('public/charactersBackup')) {
|
|
||||||
//fs.mkdirSync('public/charactersBackup');
|
|
||||||
//copyFolder('public/characters/', 'public/charactersBackup');
|
|
||||||
//}
|
|
||||||
|
|
||||||
var directories = getDirectories2("public/characters");
|
|
||||||
//console.log(directories[0]);
|
|
||||||
charactersB = {};
|
|
||||||
character_ib = 0;
|
|
||||||
var folderForDel = {};
|
|
||||||
getCharacterFile2(directories, 0);
|
|
||||||
}
|
|
||||||
function convertStage2() {
|
|
||||||
var mes = true;
|
|
||||||
for (const key in directoriesB) {
|
|
||||||
if (mes) {
|
|
||||||
console.log('***');
|
|
||||||
console.log('The update of the character format has begun...');
|
|
||||||
console.log('***');
|
|
||||||
mes = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
var char = JSON.parse(charactersB[key]);
|
|
||||||
char.create_date = humanizedISO8601DateTime();
|
|
||||||
charactersB[key] = JSON.stringify(char);
|
|
||||||
var avatar = 'public/img/ai4.png';
|
|
||||||
if (char.avatar !== 'none') {
|
|
||||||
avatar = 'public/characters/' + char.name + '/avatars/' + char.avatar;
|
|
||||||
}
|
|
||||||
|
|
||||||
charaWrite(avatar, charactersB[key], directoriesB[key]);
|
|
||||||
|
|
||||||
const files = fs.readdirSync('public/characters/' + directoriesB[key] + '/chats');
|
|
||||||
if (!fs.existsSync(chatsPath + char.name)) {
|
|
||||||
fs.mkdirSync(chatsPath + char.name);
|
|
||||||
}
|
|
||||||
files.forEach(function (file) {
|
|
||||||
// Read the contents of the file
|
|
||||||
|
|
||||||
const fileContents = fs.readFileSync('public/characters/' + directoriesB[key] + '/chats/' + file, 'utf8');
|
|
||||||
|
|
||||||
|
|
||||||
// Iterate through the array of strings and parse each line as JSON
|
|
||||||
let chat_data = JSON.parse(fileContents);
|
|
||||||
let new_chat_data = [];
|
|
||||||
let this_chat_user_name = 'You';
|
|
||||||
let is_pass_0 = false;
|
|
||||||
if (chat_data[0].indexOf('<username-holder>') !== -1) {
|
|
||||||
this_chat_user_name = chat_data[0].substr('<username-holder>'.length, chat_data[0].length);
|
|
||||||
is_pass_0 = true;
|
|
||||||
}
|
|
||||||
let i = 0;
|
|
||||||
let ii = 0;
|
|
||||||
new_chat_data[i] = { user_name: 'You', character_name: char.name, create_date: humanizedISO8601DateTime() };
|
|
||||||
i++;
|
|
||||||
ii++;
|
|
||||||
chat_data.forEach(function (mes) {
|
|
||||||
if (!(i === 1 && is_pass_0)) {
|
|
||||||
if (mes.indexOf('<username-holder>') === -1 && mes.indexOf('<username-idkey>') === -1) {
|
|
||||||
new_chat_data[ii] = {};
|
|
||||||
let is_name = false;
|
|
||||||
if (mes.trim().indexOf(this_chat_user_name + ':') !== 0) {
|
|
||||||
if (mes.trim().indexOf(char.name + ':') === 0) {
|
|
||||||
mes = mes.replace(char.name + ':', '');
|
|
||||||
is_name = true;
|
|
||||||
}
|
|
||||||
new_chat_data[ii]['name'] = char.name;
|
|
||||||
new_chat_data[ii]['is_user'] = false;
|
|
||||||
new_chat_data[ii]['is_name'] = is_name;
|
|
||||||
new_chat_data[ii]['send_date'] = humanizedISO8601DateTime(); //Date.now();
|
|
||||||
|
|
||||||
} else {
|
|
||||||
mes = mes.replace(this_chat_user_name + ':', '');
|
|
||||||
new_chat_data[ii]['name'] = 'You';
|
|
||||||
new_chat_data[ii]['is_user'] = true;
|
|
||||||
new_chat_data[ii]['is_name'] = true;
|
|
||||||
new_chat_data[ii]['send_date'] = humanizedISO8601DateTime(); //Date.now();
|
|
||||||
|
|
||||||
}
|
|
||||||
new_chat_data[ii]['mes'] = mes.trim();
|
|
||||||
ii++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
|
|
||||||
});
|
|
||||||
const jsonlData = new_chat_data.map(JSON.stringify).join('\n');
|
|
||||||
// Write the contents to the destination folder
|
|
||||||
//console.log('convertstage2 writing a file: '+chatsPath+char.name+'/' + file+'l');
|
|
||||||
fs.writeFileSync(chatsPath + char.name + '/' + file + 'l', jsonlData);
|
|
||||||
});
|
|
||||||
//fs.rmSync('public/characters/'+directoriesB[key],{ recursive: true });
|
|
||||||
console.log(char.name + ' update!');
|
|
||||||
}
|
|
||||||
//removeFolders('public/characters');
|
|
||||||
fs.unlinkSync('public/characters/update.txt');
|
|
||||||
if (mes == false) {
|
|
||||||
console.log('***');
|
|
||||||
console.log('Сharacter format update completed successfully!');
|
|
||||||
console.log('***');
|
|
||||||
console.log('Now you can delete these folders, they are no longer used by TavernAI:');
|
|
||||||
}
|
|
||||||
for (const key in directoriesB) {
|
|
||||||
console.log('public/characters/' + directoriesB[key]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
function removeFolders(folder) {
|
|
||||||
const files = fs.readdirSync(folder);
|
|
||||||
files.forEach(function (file) {
|
|
||||||
const filePath = folder + '/' + file;
|
|
||||||
const stat = fs.statSync(filePath);
|
|
||||||
if (stat.isDirectory()) {
|
|
||||||
removeFolders(filePath);
|
|
||||||
fs.rmdirSync(filePath);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
function copyFolder(src, dest) {
|
|
||||||
const files = fs.readdirSync(src);
|
|
||||||
files.forEach(function (file) {
|
|
||||||
const filePath = src + '/' + file;
|
|
||||||
const stat = fs.statSync(filePath);
|
|
||||||
if (stat.isFile()) {
|
|
||||||
fs.copyFileSync(filePath, dest + '/' + file);
|
|
||||||
} else if (stat.isDirectory()) {
|
|
||||||
fs.mkdirSync(dest + '/' + file);
|
|
||||||
copyFolder(filePath, dest + '/' + file);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function getDirectories2(path) {
|
|
||||||
return fs.readdirSync(path)
|
|
||||||
.filter(function (file) {
|
|
||||||
return fs.statSync(path + '/' + file).isDirectory();
|
|
||||||
})
|
|
||||||
.sort(function (a, b) {
|
|
||||||
return new Date(fs.statSync(path + '/' + a).mtime) - new Date(fs.statSync(path + '/' + b).mtime);
|
|
||||||
})
|
|
||||||
.reverse();
|
|
||||||
}
|
|
||||||
function getCharacterFile2(directories, i) {
|
|
||||||
if (directories.length > i) {
|
|
||||||
fs.stat('public/characters/' + directories[i] + '/' + directories[i] + ".json", function (err, stat) {
|
|
||||||
if (err == null) {
|
|
||||||
fs.readFile('public/characters/' + directories[i] + '/' + directories[i] + ".json", 'utf8', (err, data) => {
|
|
||||||
if (err) {
|
|
||||||
console.error(err);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
//console.log(data);
|
|
||||||
if (!fs.existsSync('public/characters/' + directories[i] + '.png')) {
|
|
||||||
charactersB[character_ib] = {};
|
|
||||||
charactersB[character_ib] = data;
|
|
||||||
directoriesB[character_ib] = directories[i];
|
|
||||||
character_ib++;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
getCharacterFile2(directories, i);
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
i++;
|
|
||||||
getCharacterFile2(directories, i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
convertStage2();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function convertWebp() {
|
async function convertWebp() {
|
||||||
const files = fs.readdirSync(directories.characters).filter(e => e.endsWith(".webp"));
|
const files = fs.readdirSync(directories.characters).filter(e => e.endsWith(".webp"));
|
||||||
|
|
||||||
|
8
src/sentencepiece/sentencepiece.min.js
vendored
Normal file
8
src/sentencepiece/sentencepiece.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
BIN
src/sentencepiece/tokenizer.model
Normal file
BIN
src/sentencepiece/tokenizer.model
Normal file
Binary file not shown.
Reference in New Issue
Block a user