Update GPT tokenizer

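Fixes a failure to tokenize words that match inherited object property names, such as "constructor" and "toString": the `in` operator also matched properties on the prototype chain, so lookups now use `Object.hasOwn`.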
This commit is contained in:
SillyLossy
2023-02-23 15:07:05 +02:00
parent e13c1a3877
commit 5bea179f3b
2 changed files with 5 additions and 5 deletions

View File

@@ -5,20 +5,20 @@ GPT-2/3 byte pair encoder/decoder/tokenizer based on [@latitudegames/GPT-3-Encod
See also: [JS byte pair encoder for OpenAI's CLIP model](https://github.com/josephrocca/clip-bpe-js). See also: [JS byte pair encoder for OpenAI's CLIP model](https://github.com/josephrocca/clip-bpe-js).
```js ```js
import {encode, decode} from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js"; import {encode, decode} from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js";
let text = "hello world"; let text = "hello world";
console.log(encode(text)); // [258, 18798, 995] console.log(encode(text)); // [258, 18798, 995]
console.log(decode(encode(text))); // "hello world" console.log(decode(encode(text))); // "hello world"
``` ```
or: or:
```js ```js
let mod = await import("https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js"); let mod = await import("https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js");
mod.encode("hello world"); // [258, 18798, 995] mod.encode("hello world"); // [258, 18798, 995]
``` ```
or to include it as a global variable in the browser: or to include it as a global variable in the browser:
```html ```html
<script type=module> <script type=module>
import tokenizer from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.1/mod.js"; import tokenizer from "https://deno.land/x/gpt_2_3_tokenizer@v0.0.2/mod.js";
window.tokenizer = tokenizer; window.tokenizer = tokenizer;
</script> </script>
``` ```

View File

@@ -81,7 +81,7 @@ const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
const cache = {} const cache = {}
function bpe(token) { function bpe(token) {
if (token in cache) { if (Object.hasOwn(cache, token)) {
return cache[token] return cache[token]
} }
@@ -107,7 +107,7 @@ function bpe(token) {
} }
))] ))]
if (!(bigram in bpe_ranks)) { if (!(Object.hasOwn(bpe_ranks, bigram))) {
break break
} }