Correctly interpret some alternate whitespace markers (Ġ, Ċ) in token names

2025-06-05 21:59:27 +02:00 · 2024-10-19 00:24:35 -04:00
parent 1ac6780e9c
commit 5d5e552cbd
2 changed files with 5 additions and 5 deletions
--- a/public/scripts/extensions/token-counter/index.js
+++ b/public/scripts/extensions/token-counter/index.js
@@ -89,7 +89,7 @@ function drawChunks(chunks, ids) {
    $('#tokenized_chunks_display').empty();

    for (let i = 0; i < chunks.length; i++) {
-        let chunk = chunks[i].replace(/▁/g, ' '); // This is a leading space in sentencepiece. More info: Lower one eighth block (U+2581)
+        let chunk = chunks[i].replace(/[▁Ġ]/g, ' '); // Tokenizer space markers: ▁ (U+2581, lower one eighth block) is a leading space in sentencepiece; Ġ (U+0120) is how byte-level BPE vocabularies (GPT-2 style) spell a space

        // If <0xHEX>, decode it
        if (/^<0x[0-9A-F]+>$/i.test(chunk)) {
--- a/public/scripts/logprobs.js
+++ b/public/scripts/logprobs.js
@@ -160,7 +160,7 @@ function renderTopLogprobs() {
    let matched = false;
    for (const [token, probability, log] of candidates) {
        const container = $('<button class="flex-container flexFlowColumn logprobs_top_candidate"></button>');
-        const tokenNormalized = String(token).replace(/^▁/g, ' ');
+        const tokenNormalized = String(token).replace(/^[▁Ġ]/g, ' ');

        if (token === selectedToken || tokenNormalized === selectedToken) {
            matched = true;
@@ -230,7 +230,7 @@ function onAlternativeClicked(tokenLogprobs, alternative) {
    const replaceIndex = messageLogprobs.findIndex(x => x === tokenLogprobs);

    const tokens = messageLogprobs.slice(0, replaceIndex + 1).map(({ token }) => token);
-    tokens[replaceIndex] = String(alternative).replace(/^▁/g, ' ');
+    tokens[replaceIndex] = String(alternative).replace(/^[▁Ġ]/g, ' ').replace(/Ċ/g, '\n');

    const prefix = continueFrom || '';
    const prompt = prefix + tokens.join('');
@@ -343,7 +343,7 @@ function createSwipe(messageId, prompt) {
 * @returns {string}
 */
 function toVisibleWhitespace(input) {
-    return input.replace(/ /g, '·').replace(/▁/g, '·').replace(/\n/g, '↵');
+    return input.replace(/ /g, '·').replace(/[▁Ġ]/g, '·').replace(/[Ċ\n]/g, '↵');
 }

 /**
@@ -362,7 +362,7 @@ function withVirtualWhitespace(text, span) {
    if (text.match(/\s$/)) {
        result.push($(document.createTextNode('\u200b')));
    }
-    if (text.match(/^▁/)) {
+    if (text.match(/^[▁Ġ]/)) {
        result.unshift(document.createTextNode('\u200b'));
    }
    // line breaks are trickier. we don't currently handle consecutive line