Add token ids viewer to tokenizer plugin

Cohee 2023-11-05 22:45:37 +02:00
parent 632d55f6af
commit f0c0949aa0
4 changed files with 63 additions and 14 deletions

View File

@@ -3935,7 +3935,7 @@ function promptItemize(itemizedPrompts, requestedMesId) {
var promptBiasTokensPercentage = ((oaiBiasTokens / (finalPromptTokens)) * 100).toFixed(2);
var worldInfoStringTokensPercentage = ((worldInfoStringTokens / (finalPromptTokens)) * 100).toFixed(2);
var allAnchorsTokensPercentage = ((allAnchorsTokens / (finalPromptTokens)) * 100).toFixed(2);
-var selectedTokenizer = `tiktoken (${getTokenizerModel()})`;
+var selectedTokenizer = getTokenizerModel();
var oaiSystemTokens = oaiImpersonateTokens + oaiJailbreakTokens + oaiNudgeTokens + oaiStartTokens + oaiNsfwTokens + oaiMainTokens;
var oaiSystemTokensPercentage = ((oaiSystemTokens / (finalPromptTokens)) * 100).toFixed(2);

View File

@@ -1,12 +1,21 @@
import { callPopup, main_api } from "../../../script.js";
import { getContext } from "../../extensions.js";
import { registerSlashCommand } from "../../slash-commands.js";
-import { getTokenCount, getTokenizerModel } from "../../tokenizers.js";
+import { getTextTokens, getTokenCount, getTokenizerBestMatch, getTokenizerModel, tokenizers } from "../../tokenizers.js";
async function doTokenCounter() {
+const tokenizerOption = $("#tokenizer").find(':selected');
+let tokenizerId = Number(tokenizerOption.val());
+let tokenizerName = tokenizerOption.text();
+if (main_api !== 'openai' && tokenizerId === tokenizers.BEST_MATCH) {
+    tokenizerId = getTokenizerBestMatch();
+    tokenizerName = $(`#tokenizer option[value="${tokenizerId}"]`).text();
+}
const selectedTokenizer = main_api == 'openai'
-    ? `tiktoken (${getTokenizerModel()})`
-    : $("#tokenizer").find(':selected').text();
+    ? getTokenizerModel()
+    : tokenizerName;
const html = `
<div class="wide100p">
<h3>Token Counter</h3>
@@ -15,15 +24,26 @@ async function doTokenCounter() {
<p>Selected tokenizer: ${selectedTokenizer}</p>
<textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="20"></textarea>
<div>Tokens: <span id="token_counter_result">0</span></div>
+<br>
+<div>Token IDs (if applicable):</div>
+<textarea id="token_counter_ids" disabled rows="10"></textarea>
</div>
</div>`;
const dialog = $(html);
dialog.find('#token_counter_textarea').on('input', () => {
-const text = $('#token_counter_textarea').val();
+const text = String($('#token_counter_textarea').val());
+const ids = main_api == 'openai' ? getTextTokens(tokenizers.OPENAI, text) : getTextTokens(tokenizerId, text);
+if (Array.isArray(ids) && ids.length > 0) {
+    $('#token_counter_ids').text(JSON.stringify(ids));
+    $('#token_counter_result').text(ids.length);
+} else {
+    const context = getContext();
+    const count = context.getTokenCount(text);
+    $('#token_counter_ids').text('—');
+    $('#token_counter_result').text(count);
+}
});
$('#dialogue_popup').addClass('wide_dialogue_popup');
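For reference, a minimal sketch of how the resolved tokenizer id drives the new ids display (getTextTokens and tokenizers come from ../../tokenizers.js as imported above; the chosen tokenizer and sample text are purely illustrative):

    // Illustrative: assume the user picked the LLaMA tokenizer in the dropdown.
    const tokenizerId = tokenizers.LLAMA;
    const text = 'The quick brown fox';

    // getTextTokens returns an array of token ids for tokenizers that expose
    // direct encoding, or an empty array for ones that only report counts.
    const ids = getTextTokens(tokenizerId, text);
    if (Array.isArray(ids) && ids.length > 0) {
        console.log(`${ids.length} tokens:`, JSON.stringify(ids));
    } else {
        console.log('Token ids are not available for this tokenizer');
    }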

View File

@@ -11,10 +11,7 @@ const TOKENIZER_WARNING_KEY = 'tokenizationWarningShown';
export const tokenizers = {
NONE: 0,
GPT2: 1,
-/**
- * @deprecated Use GPT2 instead.
- */
-LEGACY: 2,
+OPENAI: 2,
LLAMA: 3,
NERD: 4,
NERD2: 5,
@@ -65,7 +62,7 @@ async function resetTokenCache() {
}
}
-function getTokenizerBestMatch() {
+export function getTokenizerBestMatch() {
if (main_api === 'novel') {
if (nai_settings.model_novel.includes('clio')) {
return tokenizers.NERD;
@@ -363,9 +360,14 @@ function countTokensRemote(endpoint, str, padding) {
* Calls the underlying tokenizer model to encode a string to tokens.
* @param {string} endpoint API endpoint.
* @param {string} str String to tokenize.
+ * @param {string} model Tokenizer model.
* @returns {number[]} Array of token ids.
*/
-function getTextTokensRemote(endpoint, str) {
+function getTextTokensRemote(endpoint, str, model = '') {
+    if (model) {
+        endpoint += `?model=${model}`;
+    }
let ids = [];
jQuery.ajax({
async: false,
@@ -418,6 +420,9 @@ export function getTextTokens(tokenizerType, str) {
return getTextTokensRemote('/api/tokenize/nerdstash', str);
case tokenizers.NERD2:
return getTextTokensRemote('/api/tokenize/nerdstash_v2', str);
+case tokenizers.OPENAI:
+    const model = getTokenizerModel();
+    return getTextTokensRemote('/api/tokenize/openai-encode', str, model);
default:
console.warn("Calling getTextTokens with unsupported tokenizer type", tokenizerType);
return [];
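To make the new OPENAI branch concrete, here is a rough sketch of the synchronous request getTextTokensRemote ends up issuing for it. The model name 'gpt-4' and the exact jQuery options are illustrative; only the ?model= query string and the ids response field are taken from the code above.

    // Sketch: what getTextTokens(tokenizers.OPENAI, 'Hello world') boils down to,
    // assuming getTokenizerModel() currently reports 'gpt-4'.
    let ids = [];
    jQuery.ajax({
        async: false,
        type: 'POST',
        url: '/api/tokenize/openai-encode?model=gpt-4',
        data: JSON.stringify({ text: 'Hello world' }),
        contentType: 'application/json',
        dataType: 'json',
        success: (data) => { ids = data.ids; },
    });
    // ids now holds the token ids produced by the model-specific tokenizer.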

View File

@@ -292,6 +292,30 @@ function registerEndpoints(app, jsonParser) {
app.post("/api/decode/nerdstash_v2", jsonParser, createSentencepieceDecodingHandler(() => spp_nerd_v2));
app.post("/api/decode/gpt2", jsonParser, createTiktokenDecodingHandler('gpt2'));
app.post("/api/tokenize/openai-encode", jsonParser, async function (req, res) {
try {
const queryModel = String(req.query.model || '');
if (queryModel.includes('llama')) {
const handler = createSentencepieceEncodingHandler(() => spp_llama);
return handler(req, res);
}
if (queryModel.includes('claude')) {
const text = req.body.text || '';
const tokens = Object.values(claude_tokenizer.encode(text));
return res.send({ ids: tokens, count: tokens.length });
}
const model = getTokenizerModel(queryModel);
const handler = createTiktokenEncodingHandler(model);
return handler(req, res);
} catch (error) {
console.log(error);
return res.send({ ids: [], count: 0 });
}
});
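A quick way to exercise the new route by hand, as a sketch: the model value and sample text are illustrative, and the ids/count response shape is taken from the claude branch above (the other handlers are assumed to answer in the same shape). Run it from the app's own origin and include whatever auth/CSRF headers the rest of the API already requires.

    // Sketch: request token ids for a string with an explicit model hint.
    const response = await fetch('/api/tokenize/openai-encode?model=claude', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: 'Hello world' }),
    });
    const { ids, count } = await response.json();
    console.log(count, ids); // actual values depend on the selected tokenizer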
app.post("/api/tokenize/openai", jsonParser, async function (req, res) {
try {
if (!req.body) return res.sendStatus(400);