correctly interpret some alternate whitespaces in token names

This commit is contained in:
50h100a 2024-10-19 00:24:35 -04:00
parent 1ac6780e9c
commit 5d5e552cbd
2 changed files with 5 additions and 5 deletions

View File

@ -89,7 +89,7 @@ function drawChunks(chunks, ids) {
$('#tokenized_chunks_display').empty();
for (let i = 0; i < chunks.length; i++) {
let chunk = chunks[i].replace(/▁/g, ' '); // This is a leading space in sentencepiece. More info: Lower one eighth block (U+2581)
let chunk = chunks[i].replace(/[Ġ]/g, ' '); // This is a leading space in sentencepiece. More info: Lower one eighth block (U+2581)
// If <0xHEX>, decode it
if (/^<0x[0-9A-F]+>$/i.test(chunk)) {

View File

@ -160,7 +160,7 @@ function renderTopLogprobs() {
let matched = false;
for (const [token, probability, log] of candidates) {
const container = $('<button class="flex-container flexFlowColumn logprobs_top_candidate"></button>');
const tokenNormalized = String(token).replace(/^▁/g, ' ');
const tokenNormalized = String(token).replace(/^[Ġ]/g, ' ');
if (token === selectedToken || tokenNormalized === selectedToken) {
matched = true;
@ -230,7 +230,7 @@ function onAlternativeClicked(tokenLogprobs, alternative) {
const replaceIndex = messageLogprobs.findIndex(x => x === tokenLogprobs);
const tokens = messageLogprobs.slice(0, replaceIndex + 1).map(({ token }) => token);
tokens[replaceIndex] = String(alternative).replace(/^▁/g, ' ');
tokens[replaceIndex] = String(alternative).replace(/^[Ġ]/g, ' ').replace(/Ċ/g, '\n');
const prefix = continueFrom || '';
const prompt = prefix + tokens.join('');
@ -343,7 +343,7 @@ function createSwipe(messageId, prompt) {
* @returns {string}
*/
function toVisibleWhitespace(input) {
    // Render every spelling of a space as a middle dot: a literal space, the
    // sentencepiece marker '▁' (U+2581), and the alternate marker 'Ġ' that
    // some tokenizers use in token names.
    // Render line breaks as '↵': a literal '\n' as well as the 'Ċ' marker,
    // which is treated as a newline elsewhere in this file.
    // Note: the newline pattern must be a character class; a bare '\n]'
    // would only match a newline that is followed by a literal ']'.
    return input.replace(/[ ▁Ġ]/g, '·').replace(/[Ċ\n]/g, '↵');
}
/**
@ -362,7 +362,7 @@ function withVirtualWhitespace(text, span) {
if (text.match(/\s$/)) {
result.push($(document.createTextNode('\u200b')));
}
if (text.match(/^▁/)) {
if (text.match(/^[Ġ]/)) {
result.unshift(document.createTextNode('\u200b'));
}
// line breaks are trickier. we don't currently handle consecutive line