Add text chunks display to token counter
parent f248367ca3
commit e8ba328a14
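At a glance: the server-side tokenize handlers are extended to return the decoded text piece for each token id alongside the ids, and the token counter dialog paints those pieces as colored chunks. Below is a minimal sketch of the payload shape the handlers in this diff imply; the field names come from the diff, but the consuming helper is hypothetical and not part of the commit.

// Sketch only: `describeTokenization` is a hypothetical helper, not part of the commit.
// It assumes the response shape { ids, count, chunks } produced by the handlers below,
// where chunks[i] is the decoded text of ids[i].
function describeTokenization(data) {
    const { ids = [], count = 0, chunks = [] } = data;
    const pairs = ids.map((id, i) => `${id} -> ${JSON.stringify(chunks[i] ?? '')}`);
    return `${count} tokens\n${pairs.join('\n')}`;
}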
@@ -22,11 +22,15 @@ async function doTokenCounter() {
 <div class="justifyLeft">
 <h4>Type / paste in the box below to see the number of tokens in the text.</h4>
 <p>Selected tokenizer: ${selectedTokenizer}</p>
-<textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="15"></textarea>
+<div>Input:</div>
+<textarea id="token_counter_textarea" class="wide100p textarea_compact margin-bot-10px" rows="10"></textarea>
 <div>Tokens: <span id="token_counter_result">0</span></div>
 <br>
-<div>Token IDs (if applicable):</div>
-<textarea id="token_counter_ids" disabled rows="10"></textarea>
+<div>Tokenized text:</div>
+<div id="tokenized_chunks_display" class="wide100p">—</div>
+<br>
+<div>Token IDs:</div>
+<textarea id="token_counter_ids" disabled rows="10">—</textarea>
 </div>
 </div>`;
 
@@ -36,13 +40,18 @@ async function doTokenCounter() {
 const ids = main_api == 'openai' ? getTextTokens(tokenizers.OPENAI, text) : getTextTokens(tokenizerId, text);
 
 if (Array.isArray(ids) && ids.length > 0) {
-$('#token_counter_ids').text(JSON.stringify(ids));
+$('#token_counter_ids').text(`[${ids.join(', ')}]`);
 $('#token_counter_result').text(ids.length);
 
+if (Object.hasOwnProperty.call(ids, 'chunks')) {
+drawChunks(Object.getOwnPropertyDescriptor(ids, 'chunks').value, ids);
+}
 } else {
 const context = getContext();
 const count = context.getTokenCount(text);
 $('#token_counter_ids').text('—');
 $('#token_counter_result').text(count);
+$('#tokenized_chunks_display').text('—');
 }
 });
 
@@ -50,6 +59,44 @@ async function doTokenCounter() {
 callPopup(dialog, 'text', '', { wide: true, large: true });
 }
 
+/**
+ * Draws the tokenized chunks in the UI
+ * @param {string[]} chunks
+ * @param {number[]} ids
+ */
+function drawChunks(chunks, ids) {
+const pastelRainbow = [
+'#FFB3BA',
+'#FFDFBA',
+'#FFFFBA',
+'#BFFFBF',
+'#BAE1FF',
+'#FFBAF3',
+];
+$('#tokenized_chunks_display').empty();
+
+for (let i = 0; i < chunks.length; i++) {
+let chunk = chunks[i].replace(/▁/g, ' '); // This is a leading space in sentencepiece. More info: Lower one eighth block (U+2581)
+
+// If <0xHEX>, decode it
+if (/^<0x[0-9A-F]+>$/i.test(chunk)) {
+const code = parseInt(chunk.substring(3, chunk.length - 1), 16);
+chunk = String.fromCodePoint(code);
+}
+
+// If newline - insert a line break
+if (chunk === '\n') {
+$('#tokenized_chunks_display').append('<br>');
+continue;
+}
+
+const color = pastelRainbow[i % pastelRainbow.length];
+const chunkHtml = $(`<code style="background-color: ${color};">${chunk}</code>`);
+chunkHtml.attr('title', ids[i]);
+$('#tokenized_chunks_display').append(chunkHtml);
+}
+}
+
 function doCount() {
 // get all of the messages in the chat
 const context = getContext();
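The `<0xHEX>` branch in drawChunks above handles sentencepiece byte-fallback tokens, which arrive as literal strings like `<0x0A>` rather than printable text, while the `▁` replacement swaps sentencepiece's U+2581 word-boundary marker for a plain space. A small standalone sketch of the same decoding step (the sample token is illustrative, not taken from the commit):

// Illustrative only: decode a sentencepiece byte-fallback token the same way drawChunks does.
const chunk = '<0x0A>';                                              // sample byte-fallback token
if (/^<0x[0-9A-F]+>$/i.test(chunk)) {
    const code = parseInt(chunk.substring(3, chunk.length - 1), 16); // "0A" -> 10
    console.log(JSON.stringify(String.fromCodePoint(code)));         // "\n"
}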
@@ -0,0 +1,4 @@
+#tokenized_chunks_display > code {
+color: black;
+text-shadow: none;
+}
@@ -385,6 +385,11 @@ function getTextTokensRemote(endpoint, str, model = '') {
 contentType: "application/json",
 success: function (data) {
 ids = data.ids;
+
+// Don't want to break reverse compatibility, so sprinkle in some of the JS magic
+if (Array.isArray(data.chunks)) {
+Object.defineProperty(ids, 'chunks', { value: data.chunks });
+}
 }
 });
 return ids;
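The `Object.defineProperty` call above is what the comment means by keeping reverse compatibility: `chunks` is attached as a non-enumerable property of the returned `ids` array, so existing callers that only iterate or serialize the array see no change, while the token counter can pick the chunks back up via `Object.hasOwnProperty.call(ids, 'chunks')`. A self-contained sketch of the same pattern (sample values are illustrative):

// Illustrative only: mirror the pattern used by getTextTokensRemote above.
const ids = [1, 2, 3];                                               // sample token ids
Object.defineProperty(ids, 'chunks', { value: ['a', 'b', 'c'] });    // non-enumerable by default

console.log(JSON.stringify(ids));                                    // [1,2,3] (chunks not serialized)
console.log(Object.hasOwnProperty.call(ids, 'chunks'));              // true
console.log(Object.getOwnPropertyDescriptor(ids, 'chunks').value);   // ['a', 'b', 'c']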
@@ -78,6 +78,39 @@ async function countSentencepieceTokens(spp, text) {
 };
 }
 
+async function countSentencepieceArrayTokens(tokenizer, array) {
+const jsonBody = array.flatMap(x => Object.values(x)).join('\n\n');
+const result = await countSentencepieceTokens(tokenizer, jsonBody);
+const num_tokens = result.count;
+return num_tokens;
+}
+
+async function getTiktokenChunks(tokenizer, ids) {
+const decoder = new TextDecoder();
+const chunks = [];
+
+for (let i = 0; i < ids.length; i++) {
+const id = ids[i];
+const chunkTextBytes = await tokenizer.decode(new Uint32Array([id]));
+const chunkText = decoder.decode(chunkTextBytes);
+chunks.push(chunkText);
+}
+
+return chunks;
+}
+
+async function getWebTokenizersChunks(tokenizer, ids) {
+const chunks = [];
+
+for (let i = 0; i < ids.length; i++) {
+const id = ids[i];
+const chunkText = await tokenizer.decode(new Uint32Array([id]));
+chunks.push(chunkText);
+}
+
+return chunks;
+}
+
 /**
 * Gets the tokenizer model by the model name.
 * @param {string} requestModel Models to use for tokenization
@@ -169,10 +202,11 @@ function createSentencepieceEncodingHandler(getTokenizerFn) {
 const text = request.body.text || '';
 const tokenizer = getTokenizerFn();
 const { ids, count } = await countSentencepieceTokens(tokenizer, text);
-return response.send({ ids, count });
+const chunks = await tokenizer.encodePieces(text);
+return response.send({ ids, count, chunks });
 } catch (error) {
 console.log(error);
-return response.send({ ids: [], count: 0 });
+return response.send({ ids: [], count: 0, chunks: [] });
 }
 };
 }
@@ -215,10 +249,11 @@ function createTiktokenEncodingHandler(modelId) {
 const text = request.body.text || '';
 const tokenizer = getTiktokenTokenizer(modelId);
 const tokens = Object.values(tokenizer.encode(text));
-return response.send({ ids: tokens, count: tokens.length });
+const chunks = await getTiktokenChunks(tokenizer, tokens);
+return response.send({ ids: tokens, count: tokens.length, chunks });
 } catch (error) {
 console.log(error);
-return response.send({ ids: [], count: 0 });
+return response.send({ ids: [], count: 0, chunks: [] });
 }
 }
 }
@@ -317,7 +352,8 @@ function registerEndpoints(app, jsonParser) {
 if (queryModel.includes('claude')) {
 const text = req.body.text || '';
 const tokens = Object.values(claude_tokenizer.encode(text));
-return res.send({ ids: tokens, count: tokens.length });
+const chunks = await getWebTokenizersChunks(claude_tokenizer, tokens);
+return res.send({ ids: tokens, count: tokens.length, chunks });
 }
 
 const model = getTokenizerModel(queryModel);
@@ -325,7 +361,7 @@ function registerEndpoints(app, jsonParser) {
 return handler(req, res);
 } catch (error) {
 console.log(error);
-return res.send({ ids: [], count: 0 });
+return res.send({ ids: [], count: 0, chunks: [] });
 }
 });
 
@@ -343,16 +379,12 @@ function registerEndpoints(app, jsonParser) {
 }
 
 if (model == 'llama') {
-const jsonBody = req.body.flatMap(x => Object.values(x)).join('\n\n');
-const llamaResult = await countSentencepieceTokens(spp_llama, jsonBody);
-num_tokens = llamaResult.count;
+num_tokens = await countSentencepieceArrayTokens(spp_llama, req.body);
 return res.send({ "token_count": num_tokens });
 }
 
 if (model == 'mistral') {
-const jsonBody = req.body.flatMap(x => Object.values(x)).join('\n\n');
-const mistralResult = await countSentencepieceTokens(spp_mistral, jsonBody);
-num_tokens = mistralResult.count;
+num_tokens = await countSentencepieceArrayTokens(spp_mistral, req.body);
 return res.send({ "token_count": num_tokens });
 }
 
@@ -407,3 +439,4 @@ module.exports = {
 loadTokenizers,
 registerEndpoints,
 }
 