mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-01-20 13:38:49 +01:00
Add token ids viewer to tokenizer plugin
This commit is contained in:
parent
632d55f6af
commit
f0c0949aa0
@ -3935,7 +3935,7 @@ function promptItemize(itemizedPrompts, requestedMesId) {
|
||||
var promptBiasTokensPercentage = ((oaiBiasTokens / (finalPromptTokens)) * 100).toFixed(2);
|
||||
var worldInfoStringTokensPercentage = ((worldInfoStringTokens / (finalPromptTokens)) * 100).toFixed(2);
|
||||
var allAnchorsTokensPercentage = ((allAnchorsTokens / (finalPromptTokens)) * 100).toFixed(2);
|
||||
var selectedTokenizer = `tiktoken (${getTokenizerModel()})`;
|
||||
var selectedTokenizer = getTokenizerModel();
|
||||
var oaiSystemTokens = oaiImpersonateTokens + oaiJailbreakTokens + oaiNudgeTokens + oaiStartTokens + oaiNsfwTokens + oaiMainTokens;
|
||||
var oaiSystemTokensPercentage = ((oaiSystemTokens / (finalPromptTokens)) * 100).toFixed(2);
|
||||
|
||||
|
@ -1,12 +1,21 @@
|
||||
import { callPopup, main_api } from "../../../script.js";
|
||||
import { getContext } from "../../extensions.js";
|
||||
import { registerSlashCommand } from "../../slash-commands.js";
|
||||
import { getTokenCount, getTokenizerModel } from "../../tokenizers.js";
|
||||
import { getTextTokens, getTokenCount, getTokenizerBestMatch, getTokenizerModel, tokenizers } from "../../tokenizers.js";
|
||||
|
||||
async function doTokenCounter() {
|
||||
const tokenizerOption = $("#tokenizer").find(':selected');
|
||||
let tokenizerId = Number(tokenizerOption.val());
|
||||
let tokenizerName = tokenizerOption.text();
|
||||
|
||||
if (main_api !== 'openai' && tokenizerId === tokenizers.BEST_MATCH) {
|
||||
tokenizerId = getTokenizerBestMatch();
|
||||
tokenizerName = $(`#tokenizer option[value="${tokenizerId}"]`).text();
|
||||
}
|
||||
|
||||
const selectedTokenizer = main_api == 'openai'
|
||||
? `tiktoken (${getTokenizerModel()})`
|
||||
: $("#tokenizer").find(':selected').text();
|
||||
? getTokenizerModel()
|
||||
: tokenizerName;
|
||||
const html = `
|
||||
<div class="wide100p">
|
||||
<h3>Token Counter</h3>
|
||||
@ -15,15 +24,26 @@ async function doTokenCounter() {
|
||||
<p>Selected tokenizer: ${selectedTokenizer}</p>
|
||||
<textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="20"></textarea>
|
||||
<div>Tokens: <span id="token_counter_result">0</span></div>
|
||||
<br>
|
||||
<div>Token IDs (if applicable):</div>
|
||||
<textarea id="token_counter_ids" disabled rows="10"></textarea>
|
||||
</div>
|
||||
</div>`;
|
||||
|
||||
const dialog = $(html);
|
||||
dialog.find('#token_counter_textarea').on('input', () => {
|
||||
const text = $('#token_counter_textarea').val();
|
||||
const context = getContext();
|
||||
const count = context.getTokenCount(text);
|
||||
$('#token_counter_result').text(count);
|
||||
const text = String($('#token_counter_textarea').val());
|
||||
const ids = main_api == 'openai' ? getTextTokens(tokenizers.OPENAI, text) : getTextTokens(tokenizerId, text);
|
||||
|
||||
if (Array.isArray(ids) && ids.length > 0) {
|
||||
$('#token_counter_ids').text(JSON.stringify(ids));
|
||||
$('#token_counter_result').text(ids.length);
|
||||
} else {
|
||||
const context = getContext();
|
||||
const count = context.getTokenCount(text);
|
||||
$('#token_counter_ids').text('—');
|
||||
$('#token_counter_result').text(count);
|
||||
}
|
||||
});
|
||||
|
||||
$('#dialogue_popup').addClass('wide_dialogue_popup');
|
||||
|
@ -11,10 +11,7 @@ const TOKENIZER_WARNING_KEY = 'tokenizationWarningShown';
|
||||
export const tokenizers = {
|
||||
NONE: 0,
|
||||
GPT2: 1,
|
||||
/**
|
||||
* @deprecated Use GPT2 instead.
|
||||
*/
|
||||
LEGACY: 2,
|
||||
OPENAI: 2,
|
||||
LLAMA: 3,
|
||||
NERD: 4,
|
||||
NERD2: 5,
|
||||
@ -65,7 +62,7 @@ async function resetTokenCache() {
|
||||
}
|
||||
}
|
||||
|
||||
function getTokenizerBestMatch() {
|
||||
export function getTokenizerBestMatch() {
|
||||
if (main_api === 'novel') {
|
||||
if (nai_settings.model_novel.includes('clio')) {
|
||||
return tokenizers.NERD;
|
||||
@ -363,9 +360,14 @@ function countTokensRemote(endpoint, str, padding) {
|
||||
* Calls the underlying tokenizer model to encode a string to tokens.
|
||||
* @param {string} endpoint API endpoint.
|
||||
* @param {string} str String to tokenize.
|
||||
* @param {string} model Tokenizer model.
|
||||
* @returns {number[]} Array of token ids.
|
||||
*/
|
||||
function getTextTokensRemote(endpoint, str) {
|
||||
function getTextTokensRemote(endpoint, str, model = '') {
|
||||
if (model) {
|
||||
endpoint += `?model=${model}`;
|
||||
}
|
||||
|
||||
let ids = [];
|
||||
jQuery.ajax({
|
||||
async: false,
|
||||
@ -418,6 +420,9 @@ export function getTextTokens(tokenizerType, str) {
|
||||
return getTextTokensRemote('/api/tokenize/nerdstash', str);
|
||||
case tokenizers.NERD2:
|
||||
return getTextTokensRemote('/api/tokenize/nerdstash_v2', str);
|
||||
case tokenizers.OPENAI:
|
||||
const model = getTokenizerModel();
|
||||
return getTextTokensRemote('/api/tokenize/openai-encode', str, model);
|
||||
default:
|
||||
console.warn("Calling getTextTokens with unsupported tokenizer type", tokenizerType);
|
||||
return [];
|
||||
|
@ -292,6 +292,30 @@ function registerEndpoints(app, jsonParser) {
|
||||
app.post("/api/decode/nerdstash_v2", jsonParser, createSentencepieceDecodingHandler(() => spp_nerd_v2));
|
||||
app.post("/api/decode/gpt2", jsonParser, createTiktokenDecodingHandler('gpt2'));
|
||||
|
||||
/**
 * POST /api/tokenize/openai-encode
 *
 * Encodes the request text into token ids, choosing the tokenizer from the
 * `model` query-string parameter:
 *   - contains "llama"  -> delegates to a sentencepiece encoding handler backed by `spp_llama`;
 *   - contains "claude" -> encodes `req.body.text` with `claude_tokenizer` and replies `{ ids, count }`;
 *   - anything else     -> delegates to a tiktoken encoding handler for the model
 *                          resolved by `getTokenizerModel`.
 *
 * Any error thrown synchronously while dispatching is logged and answered with
 * an empty result (`{ ids: [], count: 0 }`) rather than an error status.
 */
app.post("/api/tokenize/openai-encode", jsonParser, async function (req, res) {
    try {
        // A missing query parameter is normalised to an empty string so the
        // substring checks below are always safe to call.
        const requestedModel = String(req.query.model || '');

        if (requestedModel.includes('llama')) {
            const encodeWithLlama = createSentencepieceEncodingHandler(() => spp_llama);
            return encodeWithLlama(req, res);
        }

        if (requestedModel.includes('claude')) {
            const input = req.body.text || '';
            // Object.values turns the encoder output into a plain array of ids.
            const ids = Object.values(claude_tokenizer.encode(input));
            return res.send({ ids: ids, count: ids.length });
        }

        // Fall back to tiktoken for every other model name.
        const tiktokenModel = getTokenizerModel(requestedModel);
        const encodeWithTiktoken = createTiktokenEncodingHandler(tiktokenModel);
        return encodeWithTiktoken(req, res);
    } catch (error) {
        console.log(error);
        return res.send({ ids: [], count: 0 });
    }
});
|
||||
|
||||
app.post("/api/tokenize/openai", jsonParser, async function (req, res) {
|
||||
try {
|
||||
if (!req.body) return res.sendStatus(400);
|
||||
|
Loading…
Reference in New Issue
Block a user