Use mistral and yi tokenizers for custom token bans

Cohee 2023-11-21 01:04:27 +02:00
parent 9b75e49b54
commit 1ebfddf07e
3 changed files with 13 additions and 3 deletions

View File

@@ -1306,7 +1306,7 @@
 <hr data-newbie-hidden class="width100p">
 <h4 class="range-block-title justifyCenter">
     <span data-i18n="Banned Tokens">Banned Tokens</span>
-    <div class="margin5 fa-solid fa-circle-info opacity50p " title="LLaMA models only.&#13;Sequences you don't want to appear in the output.&#13;One per line. Text or [token ids].&#13;Most tokens have a leading space."></div>
+    <div class="margin5 fa-solid fa-circle-info opacity50p " title="LLaMA / Mistral / Yi models only. Make sure to select an appropriate tokenizer first.&#13;Sequences you don't want to appear in the output.&#13;One per line. Text or [token ids].&#13;Most tokens have a leading space. Use token counter if unsure."></div>
 </h4>
 <div class="wide100p">
     <textarea id="banned_tokens_textgenerationwebui" class="text_pole textarea_compact" name="banned_tokens_textgenerationwebui" rows="3" placeholder="Example:&#10;some text&#10;[42, 69, 1337]"></textarea>

View File

@@ -14,7 +14,7 @@ import {
     power_user,
     registerDebugFunction,
 } from "./power-user.js";
-import { getTextTokens, tokenizers } from "./tokenizers.js";
+import { SENTENCEPIECE_TOKENIZERS, getTextTokens, tokenizers } from "./tokenizers.js";
 import { onlyUnique } from "./utils.js";

 export {
@@ -187,6 +187,7 @@ function getCustomTokenBans() {
         return '';
     }

+    const tokenizer = SENTENCEPIECE_TOKENIZERS.includes(power_user.tokenizer) ? power_user.tokenizer : tokenizers.LLAMA;
     const result = [];
     const sequences = textgenerationwebui_settings.banned_tokens
         .split('\n')
@@ -218,7 +219,7 @@ function getCustomTokenBans() {
             }
         } else {
             try {
-                const tokens = getTextTokens(tokenizers.LLAMA, line);
+                const tokens = getTextTokens(tokenizer, line);
                 result.push(...tokens);
             } catch {
                 console.log(`Could not tokenize raw text: ${line}`);
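
Taken together, the hunks above resolve the tokenizer once per call and then use it for every raw-text line. A simplified sketch of the resulting flow; the elided parts of the function (the bracketed-ids branch and the final de-duplicated join) are reconstructions, not verbatim code:

    function getCustomTokenBans() {
        if (!textgenerationwebui_settings.banned_tokens) {
            return '';
        }

        // New in this commit: honor the user's tokenizer when it is a
        // SentencePiece one, otherwise keep the old LLaMA fallback.
        const tokenizer = SENTENCEPIECE_TOKENIZERS.includes(power_user.tokenizer)
            ? power_user.tokenizer
            : tokenizers.LLAMA;

        const result = [];
        const sequences = textgenerationwebui_settings.banned_tokens
            .split('\n')
            .map(x => x.trim())
            .filter(x => x.length > 0);

        for (const line of sequences) {
            if (line.startsWith('[') && line.endsWith(']')) {
                // Assumed branch: "[42, 69, 1337]" lines are literal token ids.
                result.push(...JSON.parse(line));
            } else {
                try {
                    // Changed line: previously hard-coded tokenizers.LLAMA.
                    const tokens = getTextTokens(tokenizer, line);
                    result.push(...tokens);
                } catch {
                    console.log(`Could not tokenize raw text: ${line}`);
                }
            }
        }

        return result.filter(onlyUnique).join(',');
    }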

View File

@@ -22,6 +22,15 @@ export const tokenizers = {
     BEST_MATCH: 99,
 };

+export const SENTENCEPIECE_TOKENIZERS = [
+    tokenizers.LLAMA,
+    tokenizers.MISTRAL,
+    tokenizers.YI,
+    // uncomment when NovelAI releases Kayra and Clio weights, lol
+    //tokenizers.NERD,
+    //tokenizers.NERD2,
+];
+
 const objectStore = new localforage.createInstance({ name: "SillyTavern_ChatCompletions" });

 let tokenCache = {};
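
The new export gives other modules a single list of the tokenizers with client-side SentencePiece implementations, i.e. the ones that can return raw token ids in the browser. A hypothetical consumer mirroring the check added in the script file above (the function name is illustrative, not from this commit):

    import { SENTENCEPIECE_TOKENIZERS, getTextTokens, tokenizers } from "./tokenizers.js";

    // Hypothetical helper: gate id-based features on the shared list,
    // using the same LLaMA fallback as the banned-tokens code path.
    function tryGetTokenIds(tokenizerId, text) {
        const effective = SENTENCEPIECE_TOKENIZERS.includes(tokenizerId)
            ? tokenizerId
            : tokenizers.LLAMA;
        return getTextTokens(effective, text);
    }

Keeping NERD and NERD2 commented out follows the inline note: presumably token bans for those tokenizers only become useful once the corresponding NovelAI models can be run against a local backend.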