Remove "remote" language from tokenizer functions

We'll be making a distinction between tokenizing *on* the server itself,
and tokenizing by having the server ask the AI service to do it. It makes
more sense to reserve the term "remote" for the latter.
valadaptive
2023-12-09 19:43:33 -05:00
parent 04e92efe29
commit ddd73a204a
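
To make the distinction concrete, here is a minimal, self-contained sketch of the naming rule this commit adopts. The endpoint paths come from the diff below; the helper name pickTokenizerEndpoint, its return shape, and the example tokenizer labels are hypothetical and only illustrate the idea.

// Hypothetical illustration of the server-vs-remote split described above.
// "FromServer": this app's own server runs the tokenizer locally.
// "remote": the server only forwards the request so the AI service tokenizes it.
function pickTokenizerEndpoint(tokenizer) {
    // Tokenizers the server can run itself (handled locally, "FromServer" in the diff).
    const serverSide = {
        llama: '/api/tokenizers/llama/encode',
        mistral: '/api/tokenizers/mistral/encode',
    };
    if (tokenizer in serverSide) {
        return { url: serverSide[tokenizer], remote: false };
    }
    // Anything else is deferred to the AI service backing the current API,
    // which is what "remote" now means.
    return { url: '/api/tokenizers/remote/encode', remote: true };
}

console.log(pickTokenizerEndpoint('llama'));
// { url: '/api/tokenizers/llama/encode', remote: false }
console.log(pickTokenizerEndpoint('some-api-backend'));
// { url: '/api/tokenizers/remote/encode', remote: true }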


@@ -161,19 +161,19 @@ function callTokenizer(type, str, padding) {
         case tokenizers.NONE:
             return guesstimate(str) + padding;
         case tokenizers.GPT2:
-            return countTokensRemote('/api/tokenizers/gpt2/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/gpt2/encode', str, padding);
         case tokenizers.LLAMA:
-            return countTokensRemote('/api/tokenizers/llama/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/llama/encode', str, padding);
         case tokenizers.NERD:
-            return countTokensRemote('/api/tokenizers/nerdstash/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/nerdstash/encode', str, padding);
         case tokenizers.NERD2:
-            return countTokensRemote('/api/tokenizers/nerdstash_v2/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/nerdstash_v2/encode', str, padding);
         case tokenizers.MISTRAL:
-            return countTokensRemote('/api/tokenizers/mistral/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/mistral/encode', str, padding);
         case tokenizers.YI:
-            return countTokensRemote('/api/tokenizers/yi/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/yi/encode', str, padding);
         case tokenizers.API:
-            return countTokensRemote('/api/tokenizers/remote/encode', str, padding);
+            return countTokensFromServer('/api/tokenizers/remote/encode', str, padding);
         default:
             console.warn('Unknown tokenizer type', type);
             return callTokenizer(tokenizers.NONE, str, padding);
@@ -391,7 +391,7 @@ function getTokenCacheObject() {
     return tokenCache[String(chatId)];
 }

-function getRemoteTokenizationParams(str) {
+function getServerTokenizationParams(str) {
     return {
         text: str,
         main_api,
@@ -404,20 +404,20 @@ function getRemoteTokenizationParams(str) {
 }

 /**
- * Counts token using the remote server API.
+ * Counts token using the server API.
  * @param {string} endpoint API endpoint.
  * @param {string} str String to tokenize.
  * @param {number} padding Number of padding tokens.
  * @returns {number} Token count with padding.
  */
-function countTokensRemote(endpoint, str, padding) {
+function countTokensFromServer(endpoint, str, padding) {
     let tokenCount = 0;
     jQuery.ajax({
         async: false,
         type: 'POST',
         url: endpoint,
-        data: JSON.stringify(getRemoteTokenizationParams(str)),
+        data: JSON.stringify(getServerTokenizationParams(str)),
         dataType: 'json',
         contentType: 'application/json',
         success: function (data) {
@@ -450,7 +450,7 @@ function countTokensRemote(endpoint, str, padding) {
  * @param {string} model Tokenizer model.
  * @returns {number[]} Array of token ids.
  */
-function getTextTokensRemote(endpoint, str, model = '') {
+function getTextTokensFromServer(endpoint, str, model = '') {
     if (model) {
         endpoint += `?model=${model}`;
     }
@@ -460,7 +460,7 @@ function getTextTokensRemote(endpoint, str, model = '') {
         async: false,
         type: 'POST',
         url: endpoint,
-        data: JSON.stringify(getRemoteTokenizationParams(str)),
+        data: JSON.stringify(getServerTokenizationParams(str)),
         dataType: 'json',
         contentType: 'application/json',
         success: function (data) {
@@ -480,7 +480,7 @@ function getTextTokensRemote(endpoint, str, model = '') {
  * @param {string} endpoint API endpoint.
  * @param {number[]} ids Array of token ids
  */
-function decodeTextTokensRemote(endpoint, ids, model = '') {
+function decodeTextTokensFromServer(endpoint, ids, model = '') {
     if (model) {
         endpoint += `?model=${model}`;
     }
@@ -501,7 +501,7 @@ function decodeTextTokensRemote(endpoint, ids, model = '') {
 }

 /**
- * Encodes a string to tokens using the remote server API.
+ * Encodes a string to tokens using the server API.
  * @param {number} tokenizerType Tokenizer type.
  * @param {string} str String to tokenize.
  * @returns {number[]} Array of token ids.
@@ -509,23 +509,23 @@ function decodeTextTokensRemote(endpoint, ids, model = '') {
 export function getTextTokens(tokenizerType, str) {
     switch (tokenizerType) {
         case tokenizers.GPT2:
-            return getTextTokensRemote('/api/tokenizers/gpt2/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/gpt2/encode', str);
         case tokenizers.LLAMA:
-            return getTextTokensRemote('/api/tokenizers/llama/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/llama/encode', str);
         case tokenizers.NERD:
-            return getTextTokensRemote('/api/tokenizers/nerdstash/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/nerdstash/encode', str);
         case tokenizers.NERD2:
-            return getTextTokensRemote('/api/tokenizers/nerdstash_v2/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/nerdstash_v2/encode', str);
         case tokenizers.MISTRAL:
-            return getTextTokensRemote('/api/tokenizers/mistral/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/mistral/encode', str);
         case tokenizers.YI:
-            return getTextTokensRemote('/api/tokenizers/yi/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/yi/encode', str);
         case tokenizers.OPENAI: {
             const model = getTokenizerModel();
-            return getTextTokensRemote('/api/tokenizers/openai/encode', str, model);
+            return getTextTokensFromServer('/api/tokenizers/openai/encode', str, model);
         }
         case tokenizers.API:
-            return getTextTokensRemote('/api/tokenizers/remote/encode', str);
+            return getTextTokensFromServer('/api/tokenizers/remote/encode', str);
         default:
             console.warn('Calling getTextTokens with unsupported tokenizer type', tokenizerType);
             return [];
@@ -533,27 +533,27 @@ export function getTextTokens(tokenizerType, str) {
 }

 /**
- * Decodes token ids to text using the remote server API.
+ * Decodes token ids to text using the server API.
  * @param {number} tokenizerType Tokenizer type.
  * @param {number[]} ids Array of token ids
  */
 export function decodeTextTokens(tokenizerType, ids) {
     switch (tokenizerType) {
         case tokenizers.GPT2:
-            return decodeTextTokensRemote('/api/tokenizers/gpt2/decode', ids);
+            return decodeTextTokensFromServer('/api/tokenizers/gpt2/decode', ids);
         case tokenizers.LLAMA:
-            return decodeTextTokensRemote('/api/tokenizers/llama/decode', ids);
+            return decodeTextTokensFromServer('/api/tokenizers/llama/decode', ids);
         case tokenizers.NERD:
-            return decodeTextTokensRemote('/api/tokenizers/nerdstash/decode', ids);
+            return decodeTextTokensFromServer('/api/tokenizers/nerdstash/decode', ids);
         case tokenizers.NERD2:
-            return decodeTextTokensRemote('/api/tokenizers/nerdstash_v2/decode', ids);
+            return decodeTextTokensFromServer('/api/tokenizers/nerdstash_v2/decode', ids);
         case tokenizers.MISTRAL:
-            return decodeTextTokensRemote('/api/tokenizers/mistral/decode', ids);
+            return decodeTextTokensFromServer('/api/tokenizers/mistral/decode', ids);
         case tokenizers.YI:
-            return decodeTextTokensRemote('/api/tokenizers/yi/decode', ids);
+            return decodeTextTokensFromServer('/api/tokenizers/yi/decode', ids);
         case tokenizers.OPENAI: {
             const model = getTokenizerModel();
-            return decodeTextTokensRemote('/api/tokenizers/openai/decode', ids, model);
+            return decodeTextTokensFromServer('/api/tokenizers/openai/decode', ids, model);
         }
         default:
             console.warn('Calling decodeTextTokens with unsupported tokenizer type', tokenizerType);
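
For reference, a hedged usage sketch of the exported helpers whose signatures appear in the diff. The import path, the assumption that the tokenizers enum is exported from the same module, and the sample string are mine, not part of this commit.

// Hypothetical caller; only the getTextTokens/decodeTextTokens signatures come from the diff.
import { tokenizers, getTextTokens, decodeTextTokens } from './tokenizers.js';

// Encode a string with the server-side Llama tokenizer; returns an array of token ids.
const ids = getTextTokens(tokenizers.LLAMA, 'Hello there!');

// Decode the same ids back through the server-side decode endpoint.
const decoded = decodeTextTokens(tokenizers.LLAMA, ids);

console.log(ids, decoded);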