Add text chunks display to token counter
parent f248367ca3
commit e8ba328a14
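At a glance: the server-side tokenize handlers are extended to return the decoded text piece for each token id alongside the ids, and the token counter dialog paints those pieces as colored chunks. Below is a minimal sketch of the payload shape the handlers in this diff imply; the field names come from the diff, but the consuming helper is hypothetical and not part of the commit.

// Sketch only: `describeTokenization` is a hypothetical helper, not part of the commit.
// It assumes the response shape { ids, count, chunks } produced by the handlers below,
// where chunks[i] is the decoded text of ids[i].
function describeTokenization(data) {
    const { ids = [], count = 0, chunks = [] } = data;
    const pairs = ids.map((id, i) => `${id} -> ${JSON.stringify(chunks[i] ?? '')}`);
    return `${count} tokens\n${pairs.join('\n')}`;
}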
@@ -22,11 +22,15 @@ async function doTokenCounter() {
 <div class="justifyLeft">
 <h4>Type / paste in the box below to see the number of tokens in the text.</h4>
 <p>Selected tokenizer: ${selectedTokenizer}</p>
-<textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="15"></textarea>
+<div>Input:</div>
+<textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="10"></textarea>
 <div>Tokens: <span id="token_counter_result">0</span></div>
 <br>
-<div>Token IDs (if applicable):</div>
-<textarea id="token_counter_ids" disabled rows="10"></textarea>
+<div>Tokenized text:</div>
+<div id="tokenized_chunks_display" class="wide100p">—</div>
+<br>
+<div>Token IDs:</div>
+<textarea id="token_counter_ids" disabled rows="10">—</textarea>
 </div>
 </div>`;
 
@@ -36,13 +40,18 @@ async function doTokenCounter() {
 const ids = main_api == 'openai' ? getTextTokens(tokenizers.OPENAI, text) : getTextTokens(tokenizerId, text);
 
 if (Array.isArray(ids) && ids.length > 0) {
-$('#token_counter_ids').text(JSON.stringify(ids));
+$('#token_counter_ids').text(`[${ids.join(', ')}]`);
 $('#token_counter_result').text(ids.length);
 
+if (Object.hasOwnProperty.call(ids, 'chunks')) {
+drawChunks(Object.getOwnPropertyDescriptor(ids, 'chunks').value, ids);
+}
 } else {
 const context = getContext();
 const count = context.getTokenCount(text);
 $('#token_counter_ids').text('—');
 $('#token_counter_result').text(count);
+$('#tokenized_chunks_display').text('—');
 }
 });
 
@@ -50,6 +59,44 @@ async function doTokenCounter() {
 callPopup(dialog, 'text', '', { wide: true, large: true });
 }
 
+/**
+ * Draws the tokenized chunks in the UI
+ * @param {string[]} chunks
+ * @param {number[]} ids
+ */
+function drawChunks(chunks, ids) {
+const pastelRainbow = [
+'#FFB3BA',
+'#FFDFBA',
+'#FFFFBA',
+'#BFFFBF',
+'#BAE1FF',
+'#FFBAF3',
+];
+$('#tokenized_chunks_display').empty();
+
+for (let i = 0; i < chunks.length; i++) {
+let chunk = chunks[i].replace(/▁/g, ' '); // This is a leading space in sentencepiece. More info: Lower one eighth block (U+2581)
+
+// If <0xHEX>, decode it
+if (/^<0x[0-9A-F]+>$/i.test(chunk)) {
+const code = parseInt(chunk.substring(3, chunk.length - 1), 16);
+chunk = String.fromCodePoint(code);
+}
+
+// If newline - insert a line break
+if (chunk === '\n') {
+$('#tokenized_chunks_display').append('<br>');
+continue;
+}
+
+const color = pastelRainbow[i % pastelRainbow.length];
+const chunkHtml = $(`<code style="background-color: ${color};">${chunk}</code>`);
+chunkHtml.attr('title', ids[i]);
+$('#tokenized_chunks_display').append(chunkHtml);
+}
+}
+
 function doCount() {
 // get all of the messages in the chat
 const context = getContext();
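The `<0xHEX>` branch in drawChunks above handles sentencepiece byte-fallback tokens, which arrive as literal strings like `<0x0A>` rather than printable text, while the `▁` replacement swaps sentencepiece's U+2581 word-boundary marker for a plain space. A small standalone sketch of the same decoding step (the sample token is illustrative, not taken from the commit):

// Illustrative only: decode a sentencepiece byte-fallback token the same way drawChunks does.
const chunk = '<0x0A>';                                              // sample byte-fallback token
if (/^<0x[0-9A-F]+>$/i.test(chunk)) {
    const code = parseInt(chunk.substring(3, chunk.length - 1), 16); // "0A" -> 10
    console.log(JSON.stringify(String.fromCodePoint(code)));         // "\n"
}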
@@ -0,0 +1,4 @@
+#tokenized_chunks_display > code {
+color: black;
+text-shadow: none;
+}
@@ -385,6 +385,11 @@ function getTextTokensRemote(endpoint, str, model = '') {
 contentType: "application/json",
 success: function (data) {
 ids = data.ids;
+
+// Don't want to break reverse compatibility, so sprinkle in some of the JS magic
+if (Array.isArray(data.chunks)) {
+Object.defineProperty(ids, 'chunks', { value: data.chunks });
+}
 }
 });
 return ids;
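The `Object.defineProperty` call above is what the comment means by keeping reverse compatibility: `chunks` is attached as a non-enumerable property of the returned `ids` array, so existing callers that only iterate or serialize the array see no change, while the token counter can pick the chunks back up via `Object.hasOwnProperty.call(ids, 'chunks')`. A self-contained sketch of the same pattern (sample values are illustrative):

// Illustrative only: mirror the pattern used by getTextTokensRemote above.
const ids = [1, 2, 3];                                               // sample token ids
Object.defineProperty(ids, 'chunks', { value: ['a', 'b', 'c'] });    // non-enumerable by default

console.log(JSON.stringify(ids));                                    // [1,2,3] (chunks not serialized)
console.log(Object.hasOwnProperty.call(ids, 'chunks'));              // true
console.log(Object.getOwnPropertyDescriptor(ids, 'chunks').value);   // ['a', 'b', 'c']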
@@ -78,6 +78,39 @@ async function countSentencepieceTokens(spp, text) {
 };
 }
 
+async function countSentencepieceArrayTokens(tokenizer, array) {
+const jsonBody = array.flatMap(x => Object.values(x)).join('\n\n');
+const result = await countSentencepieceTokens(tokenizer, jsonBody);
+const num_tokens = result.count;
+return num_tokens;
+}
+
+async function getTiktokenChunks(tokenizer, ids) {
+const decoder = new TextDecoder();
+const chunks = [];
+
+for (let i = 0; i < ids.length; i++) {
+const id = ids[i];
+const chunkTextBytes = await tokenizer.decode(new Uint32Array([id]));
+const chunkText = decoder.decode(chunkTextBytes);
+chunks.push(chunkText);
+}
+
+return chunks;
+}
+
+async function getWebTokenizersChunks(tokenizer, ids) {
+const chunks = [];
+
+for (let i = 0; i < ids.length; i++) {
+const id = ids[i];
+const chunkText = await tokenizer.decode(new Uint32Array([id]));
+chunks.push(chunkText);
+}
+
+return chunks;
+}
+
 /**
 * Gets the tokenizer model by the model name.
 * @param {string} requestModel Models to use for tokenization
@@ -169,10 +202,11 @@ function createSentencepieceEncodingHandler(getTokenizerFn) {
 const text = request.body.text || '';
 const tokenizer = getTokenizerFn();
 const { ids, count } = await countSentencepieceTokens(tokenizer, text);
-return response.send({ ids, count });
+const chunks = await tokenizer.encodePieces(text);
+return response.send({ ids, count, chunks });
 } catch (error) {
 console.log(error);
-return response.send({ ids: [], count: 0 });
+return response.send({ ids: [], count: 0, chunks: [] });
 }
 };
 }
@@ -215,10 +249,11 @@ function createTiktokenEncodingHandler(modelId) {
 const text = request.body.text || '';
 const tokenizer = getTiktokenTokenizer(modelId);
 const tokens = Object.values(tokenizer.encode(text));
-return response.send({ ids: tokens, count: tokens.length });
+const chunks = await getTiktokenChunks(tokenizer, tokens);
+return response.send({ ids: tokens, count: tokens.length, chunks });
 } catch (error) {
 console.log(error);
-return response.send({ ids: [], count: 0 });
+return response.send({ ids: [], count: 0, chunks: [] });
 }
 }
 }
@@ -317,7 +352,8 @@ function registerEndpoints(app, jsonParser) {
 if (queryModel.includes('claude')) {
 const text = req.body.text || '';
 const tokens = Object.values(claude_tokenizer.encode(text));
-return res.send({ ids: tokens, count: tokens.length });
+const chunks = await getWebTokenizersChunks(claude_tokenizer, tokens);
+return res.send({ ids: tokens, count: tokens.length, chunks });
 }
 
 const model = getTokenizerModel(queryModel);
@@ -325,7 +361,7 @@ function registerEndpoints(app, jsonParser) {
 return handler(req, res);
 } catch (error) {
 console.log(error);
-return res.send({ ids: [], count: 0 });
+return res.send({ ids: [], count: 0, chunks: [] });
 }
 });
 
@@ -343,16 +379,12 @@ function registerEndpoints(app, jsonParser) {
 }
 
 if (model == 'llama') {
-const jsonBody = req.body.flatMap(x => Object.values(x)).join('\n\n');
-const llamaResult = await countSentencepieceTokens(spp_llama, jsonBody);
-num_tokens = llamaResult.count;
+num_tokens = await countSentencepieceArrayTokens(spp_llama, req.body);
 return res.send({ "token_count": num_tokens });
 }
 
 if (model == 'mistral') {
-const jsonBody = req.body.flatMap(x => Object.values(x)).join('\n\n');
-const mistralResult = await countSentencepieceTokens(spp_mistral, jsonBody);
-num_tokens = mistralResult.count;
+num_tokens = await countSentencepieceArrayTokens(spp_mistral, req.body);
 return res.send({ "token_count": num_tokens });
 }
 
@@ -407,3 +439,4 @@ module.exports = {
 loadTokenizers,
 registerEndpoints,
 }
 