Add token ids viewer to tokenizer plugin

parent 632d55f6af
commit f0c0949aa0
@@ -3935,7 +3935,7 @@ function promptItemize(itemizedPrompts, requestedMesId) {
         var promptBiasTokensPercentage = ((oaiBiasTokens / (finalPromptTokens)) * 100).toFixed(2);
         var worldInfoStringTokensPercentage = ((worldInfoStringTokens / (finalPromptTokens)) * 100).toFixed(2);
         var allAnchorsTokensPercentage = ((allAnchorsTokens / (finalPromptTokens)) * 100).toFixed(2);
-        var selectedTokenizer = `tiktoken (${getTokenizerModel()})`;
+        var selectedTokenizer = getTokenizerModel();
         var oaiSystemTokens = oaiImpersonateTokens + oaiJailbreakTokens + oaiNudgeTokens + oaiStartTokens + oaiNsfwTokens + oaiMainTokens;
         var oaiSystemTokensPercentage = ((oaiSystemTokens / (finalPromptTokens)) * 100).toFixed(2);

@@ -1,12 +1,21 @@
 import { callPopup, main_api } from "../../../script.js";
 import { getContext } from "../../extensions.js";
 import { registerSlashCommand } from "../../slash-commands.js";
-import { getTokenCount, getTokenizerModel } from "../../tokenizers.js";
+import { getTextTokens, getTokenCount, getTokenizerBestMatch, getTokenizerModel, tokenizers } from "../../tokenizers.js";

 async function doTokenCounter() {
+    const tokenizerOption = $("#tokenizer").find(':selected');
+    let tokenizerId = Number(tokenizerOption.val());
+    let tokenizerName = tokenizerOption.text();
+
+    if (main_api !== 'openai' && tokenizerId === tokenizers.BEST_MATCH) {
+        tokenizerId = getTokenizerBestMatch();
+        tokenizerName = $(`#tokenizer option[value="${tokenizerId}"]`).text();
+    }
+
     const selectedTokenizer = main_api == 'openai'
-        ? `tiktoken (${getTokenizerModel()})`
-        : $("#tokenizer").find(':selected').text();
+        ? getTokenizerModel()
+        : tokenizerName;
     const html = `
     <div class="wide100p">
         <h3>Token Counter</h3>
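The lines added above resolve the dropdown selection once, up front: outside the OpenAI API, the BEST_MATCH sentinel is replaced with a concrete tokenizer id so the ids viewer can encode with it directly. A minimal sketch of that resolution, assuming the names imported in this hunk (the helper name is hypothetical; the plugin inlines this logic):

    // Hypothetical helper mirroring the resolution logic above.
    function resolveViewerTokenizer(mainApi, selectedId) {
        if (mainApi === 'openai') {
            return tokenizers.OPENAI; // OpenAI-style APIs always use the remote encoder
        }
        if (selectedId === tokenizers.BEST_MATCH) {
            return getTokenizerBestMatch(); // e.g. NERD for NovelAI 'clio' models
        }
        return selectedId; // an explicit dropdown choice is used as-is
    }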
@@ -15,15 +24,26 @@ async function doTokenCounter() {
         <p>Selected tokenizer: ${selectedTokenizer}</p>
         <textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="20"></textarea>
         <div>Tokens: <span id="token_counter_result">0</span></div>
+        <br>
+        <div>Token IDs (if applicable):</div>
+        <textarea id="token_counter_ids" disabled rows="10"></textarea>
     </div>
 </div>`;

     const dialog = $(html);
     dialog.find('#token_counter_textarea').on('input', () => {
-        const text = $('#token_counter_textarea').val();
+        const text = String($('#token_counter_textarea').val());
+        const ids = main_api == 'openai' ? getTextTokens(tokenizers.OPENAI, text) : getTextTokens(tokenizerId, text);
+
+        if (Array.isArray(ids) && ids.length > 0) {
+            $('#token_counter_ids').text(JSON.stringify(ids));
+            $('#token_counter_result').text(ids.length);
+        } else {
             const context = getContext();
             const count = context.getTokenCount(text);
+            $('#token_counter_ids').text('—');
             $('#token_counter_result').text(count);
+        }
     });

     $('#dialogue_popup').addClass('wide_dialogue_popup');
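With that wiring, typing into the textarea now fills two fields: the token count and, when the selected tokenizer can produce ids, the raw id array. An illustrative run, assuming the LLaMA tokenizer is selected (the ids shown are examples and depend on the tokenizer build):

    const ids = getTextTokens(tokenizers.LLAMA, 'Hello world');
    // ids might be [15043, 3186]; the disabled textarea then shows "[15043,3186]"
    // and the counter shows 2. Tokenizers that return no ids fall back to
    // context.getTokenCount(text) and display '—' in the ids box.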
@@ -11,10 +11,7 @@ const TOKENIZER_WARNING_KEY = 'tokenizationWarningShown';
 export const tokenizers = {
     NONE: 0,
     GPT2: 1,
-    /**
-     * @deprecated Use GPT2 instead.
-     */
-    LEGACY: 2,
+    OPENAI: 2,
     LLAMA: 3,
     NERD: 4,
     NERD2: 5,
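Note the numeric slot: value 2 previously belonged to the deprecated LEGACY alias and is reused here for OPENAI. Reading between the lines (this rationale is an assumption, not stated in the commit), the reuse is tolerable only because LEGACY was already marked @deprecated; any persisted setting holding 2 silently changes meaning:

    console.assert(tokenizers.OPENAI === 2); // the slot formerly held tokenizers.LEGACY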
@@ -65,7 +62,7 @@ async function resetTokenCache() {
     }
 }

-function getTokenizerBestMatch() {
+export function getTokenizerBestMatch() {
     if (main_api === 'novel') {
         if (nai_settings.model_novel.includes('clio')) {
             return tokenizers.NERD;
@@ -363,9 +360,14 @@ function countTokensRemote(endpoint, str, padding) {
  * Calls the underlying tokenizer model to encode a string to tokens.
  * @param {string} endpoint API endpoint.
  * @param {string} str String to tokenize.
+ * @param {string} model Tokenizer model.
  * @returns {number[]} Array of token ids.
  */
-function getTextTokensRemote(endpoint, str) {
+function getTextTokensRemote(endpoint, str, model = '') {
+    if (model) {
+        endpoint += `?model=${model}`;
+    }
+
     let ids = [];
     jQuery.ajax({
         async: false,
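The model travels as a query string rather than in the request body, so one endpoint can serve several encoders. A sketch of the resulting request, with illustrative argument values (the `{ text }` body follows the server handler shown later; a model id containing reserved URL characters would additionally need encodeURIComponent):

    getTextTokensRemote('/api/tokenize/openai-encode', 'Hello world', 'gpt-4');
    // -> synchronous POST /api/tokenize/openai-encode?model=gpt-4
    //    with a JSON body containing { text: 'Hello world' }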
@@ -418,6 +420,9 @@ export function getTextTokens(tokenizerType, str) {
             return getTextTokensRemote('/api/tokenize/nerdstash', str);
         case tokenizers.NERD2:
             return getTextTokensRemote('/api/tokenize/nerdstash_v2', str);
+        case tokenizers.OPENAI:
+            const model = getTokenizerModel();
+            return getTextTokensRemote('/api/tokenize/openai-encode', str, model);
         default:
             console.warn("Calling getTextTokens with unsupported tokenizer type", tokenizerType);
             return [];
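The new case routes OpenAI-family encodes through the remote endpoint with the active model attached. Calling it from the plugin side looks like this (the ids shown are what a cl100k_base model would be expected to produce and are illustrative):

    const ids = getTextTokens(tokenizers.OPENAI, 'Hello world');
    // e.g. [9906, 1917] under cl100k_base; on a server-side failure the
    // endpoint replies { ids: [], count: 0 }, so callers get an empty array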
@@ -292,6 +292,30 @@ function registerEndpoints(app, jsonParser) {
     app.post("/api/decode/nerdstash_v2", jsonParser, createSentencepieceDecodingHandler(() => spp_nerd_v2));
     app.post("/api/decode/gpt2", jsonParser, createTiktokenDecodingHandler('gpt2'));

+    app.post("/api/tokenize/openai-encode", jsonParser, async function (req, res) {
+        try {
+            const queryModel = String(req.query.model || '');
+
+            if (queryModel.includes('llama')) {
+                const handler = createSentencepieceEncodingHandler(() => spp_llama);
+                return handler(req, res);
+            }
+
+            if (queryModel.includes('claude')) {
+                const text = req.body.text || '';
+                const tokens = Object.values(claude_tokenizer.encode(text));
+                return res.send({ ids: tokens, count: tokens.length });
+            }
+
+            const model = getTokenizerModel(queryModel);
+            const handler = createTiktokenEncodingHandler(model);
+            return handler(req, res);
+        } catch (error) {
+            console.log(error);
+            return res.send({ ids: [], count: 0 });
+        }
+    });
+
     app.post("/api/tokenize/openai", jsonParser, async function (req, res) {
         try {
             if (!req.body) return res.sendStatus(400);
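Server side, the new /api/tokenize/openai-encode endpoint dispatches on the requested model: sentencepiece for llama-family models, the bundled Claude tokenizer for claude, and a tiktoken handler for everything else, with a harmless { ids: [], count: 0 } reply on error. A hedged sketch of calling it directly, using fetch instead of the plugin's jQuery and omitting any auth/CSRF headers the app may require (the model name is illustrative):

    const res = await fetch('/api/tokenize/openai-encode?model=claude-2', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: 'Hello world' }),
    });
    const { ids, count } = await res.json(); // count === ids.length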