mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Update GPT tokenizer
Fixes being unable to tokenize words like constructor/toString/etc. (inherited `Object.prototype` keys shadowed cache/rank lookups; replaced `in` checks with `Object.hasOwn`)
This commit is contained in:
@ -5,20 +5,20 @@ GPT-2/3 byte pair encoder/decoder/tokenizer based on [@latitudegames/GPT-3-Encod
|
||||
See also: [JS byte pair encoder for OpenAI's CLIP model](https://github.com/josephrocca/clip-bpe-js).
|
||||
|
||||
```js
|
||||
import {encode, decode} from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js";
|
||||
import {encode, decode} from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js";
|
||||
let text = "hello world";
|
||||
console.log(encode(text)); // [258, 18798, 995]
|
||||
console.log(decode(encode(text))); // "hello world"
|
||||
```
|
||||
or:
|
||||
```js
|
||||
let mod = await import("https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js");
|
||||
let mod = await import("https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js");
|
||||
mod.encode("hello world"); // [258, 18798, 995]
|
||||
```
|
||||
or to include it as a global variable in the browser:
|
||||
```html
|
||||
<script type=module>
|
||||
import tokenizer from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js";
|
||||
import tokenizer from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js";
|
||||
window.tokenizer = tokenizer;
|
||||
</script>
|
||||
```
|
||||
|
@ -81,7 +81,7 @@ const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
|
||||
const cache = {}
|
||||
|
||||
function bpe(token) {
|
||||
if (token in cache) {
|
||||
if (Object.hasOwn(cache, token)) {
|
||||
return cache[token]
|
||||
}
|
||||
|
||||
@ -107,7 +107,7 @@ function bpe(token) {
|
||||
}
|
||||
))]
|
||||
|
||||
if (!(bigram in bpe_ranks)) {
|
||||
if (!(Object.hasOwn(bpe_ranks, bigram))) {
|
||||
break
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user