mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Update GPT tokenizer
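Fixes a bug where words that match inherited object property names (e.g. `constructor`, `toString`) could not be tokenized, by replacing the `in` checks on the cache and BPE rank tables with `Object.hasOwn`.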
This commit is contained in:
@@ -5,20 +5,20 @@ GPT-2/3 byte pair encoder/decoder/tokenizer based on [@latitudegames/GPT-3-Encod
|
|||||||
See also: [JS byte pair encoder for OpenAI's CLIP model](https://github.com/josephrocca/clip-bpe-js).
|
See also: [JS byte pair encoder for OpenAI's CLIP model](https://github.com/josephrocca/clip-bpe-js).
|
||||||
|
|
||||||
```js
|
```js
|
||||||
import {encode, decode} from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js";
|
import {encode, decode} from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js";
|
||||||
let text = "hello world";
|
let text = "hello world";
|
||||||
console.log(encode(text)); // [258, 18798, 995]
|
console.log(encode(text)); // [258, 18798, 995]
|
||||||
console.log(decode(encode(text))); // "hello world"
|
console.log(decode(encode(text))); // "hello world"
|
||||||
```
|
```
|
||||||
or:
|
or:
|
||||||
```js
|
```js
|
||||||
let mod = await import("https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js");
|
let mod = await import("https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js");
|
||||||
mod.encode("hello world"); // [258, 18798, 995]
|
mod.encode("hello world"); // [258, 18798, 995]
|
||||||
```
|
```
|
||||||
or to include it as a global variable in the browser:
|
or to include it as a global variable in the browser:
|
||||||
```html
|
```html
|
||||||
<script type=module>
|
<script type=module>
|
||||||
import tokenizer from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js";
|
import tokenizer from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js";
|
||||||
window.tokenizer = tokenizer;
|
window.tokenizer = tokenizer;
|
||||||
</script>
|
</script>
|
||||||
```
|
```
|
||||||
|
@@ -81,7 +81,7 @@ const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
|
|||||||
const cache = {}
|
const cache = {}
|
||||||
|
|
||||||
function bpe(token) {
|
function bpe(token) {
|
||||||
if (token in cache) {
|
if (Object.hasOwn(cache, token)) {
|
||||||
return cache[token]
|
return cache[token]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,7 +107,7 @@ function bpe(token) {
|
|||||||
}
|
}
|
||||||
))]
|
))]
|
||||||
|
|
||||||
if (!(bigram in bpe_ranks)) {
|
if (!(Object.hasOwn(bpe_ranks, bigram))) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user