// Source: SillyTavern/public/scripts/gpt-3-tokenizer/gpt3-tokenizer.js
// (JavaScript, 271 lines, 1.6 MiB; history entry dated 2023-07-20 19:32:15 +02:00)
import { ArrayKeyedMap } from './array-keyed-map.js';
/**
 * Wires up "loose" prototype inheritance between two constructor functions.
 *
 * The subclass receives a fresh prototype object that delegates to the
 * superclass prototype, and the subclass constructor itself is made to
 * delegate to the superclass constructor so static members are inherited.
 *
 * @param {Function} subClass - Constructor that should inherit.
 * @param {Function} superClass - Constructor to inherit from.
 */
function _inheritsLoose(subClass, superClass) {
    var derivedPrototype = Object.create(superClass.prototype);
    derivedPrototype.constructor = subClass;
    subClass.prototype = derivedPrototype;
    // Static side of the inheritance chain: subClass.__proto__ -> superClass.
    _setPrototypeOf(subClass, superClass);
}
/**
 * Sets the prototype of `o` to `p` and returns `o`.
 *
 * On first use the function replaces itself with the concrete implementation:
 * a bound `Object.setPrototypeOf` when that exists, otherwise a fallback that
 * assigns `__proto__` directly. Later calls go straight to that implementation.
 *
 * @param {object} o - Object whose prototype is changed.
 * @param {object|null} p - New prototype.
 * @returns {object} The object `o`.
 */
function _setPrototypeOf(o, p) {
    if (Object.setPrototypeOf) {
        _setPrototypeOf = Object.setPrototypeOf.bind();
    } else {
        _setPrototypeOf = function _setPrototypeOf(target, proto) {
            target.__proto__ = proto;
            return target;
        };
    }
    return _setPrototypeOf(o, p);
}
/**
 * Best-effort conversion of a value to a real array when it cannot be
 * iterated through `Symbol.iterator`.
 *
 * Strings, `arguments` objects and integer typed arrays are copied by index;
 * `Map` and `Set` instances go through `Array.from`. Anything else (including
 * falsy input) yields `undefined`.
 *
 * @param {*} o - Value to convert.
 * @param {number} [minLen] - Upper bound on copied elements for indexed copies.
 * @returns {Array|undefined} The array copy, or `undefined` if unsupported.
 */
function _unsupportedIterableToArray(o, minLen) {
    if (!o) {
        return;
    }
    if (typeof o === "string") {
        return _arrayLikeToArray(o, minLen);
    }

    // "[object Xyz]" -> "Xyz"
    var tag = Object.prototype.toString.call(o).slice(8, -1);
    if (tag === "Object" && o.constructor) {
        // Plain-looking objects are classified by their constructor's name instead.
        tag = o.constructor.name;
    }

    var isKeyedCollection = tag === "Map" || tag === "Set";
    if (isKeyedCollection) {
        return Array.from(o);
    }

    var isIndexedCollection = tag === "Arguments" || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(tag);
    if (isIndexedCollection) {
        return _arrayLikeToArray(o, minLen);
    }
}
/**
 * Copies the leading elements of an indexable value into a new array.
 *
 * @param {ArrayLike<*>} arr - Source with a `length` and indexed elements.
 * @param {number} [len] - Number of elements to copy; when nullish or larger
 *   than `arr.length`, the whole source is copied.
 * @returns {Array} A new array holding the copied elements.
 */
function _arrayLikeToArray(arr, len) {
    var count = len;
    if (count == null || count > arr.length) {
        count = arr.length;
    }
    var copy = new Array(count);
    var idx = 0;
    while (idx < count) {
        copy[idx] = arr[idx];
        idx += 1;
    }
    return copy;
}
/**
 * Produces a `next()`-style stepping function for a `for...of`-like loop.
 *
 * If the value exposes an iterator method (`Symbol.iterator`, or the legacy
 * "@@iterator" key), the bound `next` of that iterator is returned. Otherwise
 * arrays, values convertible by `_unsupportedIterableToArray`, and - when
 * `allowArrayLike` is set - objects with a numeric `length` are walked by index.
 *
 * @param {*} o - Value to iterate.
 * @param {boolean} [allowArrayLike] - Accept generic array-likes as a last resort.
 * @returns {function(): {done: boolean, value?: *}} Stepping function.
 * @throws {TypeError} When the value cannot be iterated in any supported way.
 */
function _createForOfIteratorHelperLoose(o, allowArrayLike) {
    var iteratorMethod = typeof Symbol !== "undefined" && o[Symbol.iterator] || o["@@iterator"];
    if (iteratorMethod) {
        var iterator = iteratorMethod.call(o);
        return iterator.next.bind(iterator);
    }

    var converted;
    var indexable = Array.isArray(o) || (converted = _unsupportedIterableToArray(o)) || allowArrayLike && o && typeof o.length === "number";
    if (!indexable) {
        throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.");
    }

    // Prefer the converted array when the conversion produced one.
    var items = converted ? converted : o;
    var cursor = 0;
    return function () {
        if (cursor < items.length) {
            return {
                done: false,
                value: items[cursor++]
            };
        }
        return {
            done: true
        };
    };
}
// The browser build relies on the global TextEncoder / TextDecoder classes
// rather than importing them from 'util'; fail at load time if either is absent.
{
    const encoderMissing = typeof TextEncoder === 'undefined';
    if (encoderMissing) {
        throw new Error('TextEncoder is required for this module to work in the browser');
    }
    const decoderMissing = typeof TextDecoder === 'undefined';
    if (decoderMissing) {
        throw new Error('TextDecoder is required for this module to work in the browser');
    }
}
// This is grabbed from source code of https://beta.openai.com/tokenizer?view=bpe
var bpeVocab="#version: 0.2\n\u0120 t\n\u0120 a\nh e\ni n\nr e\no n\n\u0120t he\ne r\n\u0120 s\na t\n\u0120 w\n\u0120 o\ne n\n\u0120 c\ni t\ni s\na n\no r\ne s\n\u0120 b\ne d\n\u0120 f\nin g\n\u0120 p\no u\n\u0120a n\na l\na r\n\u0120t o\n\u0120 m\n\u0120o f\n\u0120 in\n\u0120 d\n\u0120 h\n\u0120an d\ni c\na s\nl e\n\u0120t h\ni on\no m\nl l\nen t\n\u0120 n\n\u0120 l\ns t\n\u0120 re\nv e\n\u0120 e\nr o\nl y\n\u0120b e\n\u0120 g\n\u0120 T\nc t\n\u0120 S\ni d\no t\n\u0120 I\nu t\ne t\n\u0120 A\n\u0120 is\n\u0120 on\ni m\na m\no w\na y\na d\ns e\n\u0120th at\n\u0120 C\ni g\n\u0120f or\na c\n\u0120 y\nv er\nu r\n\u0120 u\nl d\n\u0120s t\n\u0120 M\n' s\n\u0120 he\n\u0120 it\nat ion\nit h\ni r\nc e\n\u0120y ou\ni l\n\u0120 B\n\u0120w h\no l\n\u0120 P\n\u0120w ith\n\u0120 1\nt er\nc h\n\u0120a s\n\u0120w e\n\u0120 (\nn d\ni ll\n\u0120 D\ni f\n\u0120 2\na g\ner s\nk e\n\u0120 \"\n\u0120 H\ne m\n\u0120c on\n\u0120 W\n\u0120 R\nhe r\n\u0120w as\n\u0120 r\no d\n\u0120 F\nu l\nat e\n\u0120a t\nr i\np p\no re\n\u0120T he\n\u0120s e\nu s\n\u0120p ro\n\u0120h a\nu m\n\u0120a re\n\u0120d e\na in\nan d\n\u0120o r\nig h\nes t\nis t\na b\nr om\n\u0120 N\nt h\n\u0120c om\n\u0120 G\nu n\no p\n0 0\n\u0120 L\n\u0120n ot\nes s\n\u0120e x\n\u0120 v\nre s\n\u0120 E\ne w\nit y\nan t\n\u0120b y\ne l\no s\nor t\no c\nq u\n\u0120f rom\n\u0120ha ve\n\u0120s u\ni ve\nou ld\n\u0120s h\n\u0120th is\nn t\nr a\np e\nigh t\nar t\nm ent\n\u0120a l\nu st\nen d\n- -\nal l\n\u0120 O\nac k\n\u0120c h\n\u0120 le\ni es\nre d\nar d\n\xE2 \u0122\nou t\n\u0120 J\n\u0120a b\ne ar\ni v\nal ly\nou r\no st\ng h\np t\n\u0120p l\nas t\n\u0120c an\na k\nom e\nu d\nT he\n\u0120h is\n\u0120d o\n\u0120g o\n\u0120h as\ng e\n' t\n\u0120 U\nr ou\n\u0120s a\n\u0120 j\n\u0120b ut\n\u0120w or\n\u0120a ll\ne ct\n\u0120 k\nam e\n\u0120w ill\no k\n\u0120w he\n\u0120the y\nid e\n0 1\nf f\nic h\np l\nt her\n\u0120t r\n. 
.\n\u0120in t\ni e\nu re\nag e\n\u0120n e\ni al\na p\nin e\nic e\n\u0120m e\n\u0120o ut\nan s\non e\non g\nion s\n\u0120wh o\n\u0120 K\n\u0120u p\n\u0120the ir\n\u0120a d\n\u0120 3\n\u0120u s\nat ed\nou s\n\u0120m ore\nu e\no g\n\u0120S t\nin d\ni ke\n\u0120s o\nim e\np er\n. \"\nb er\ni z\na ct\n\u0120on e\n\u0120sa id\n\u0120 -\na re\n\u0120you r\nc c\n\u0120T h\n\u0120c l\ne p\na ke\nab le\ni p\n\u0120con t\n\u0120wh ich\ni a\n\u0120 im\n\u0120ab out\n\u0120we re\nver y\nu b\n\u0120h ad\n\u0120 en\n\u0120com p\n, \"\n\u0120I n\n\u0120u n\n\u0120a g\ni re\nac e\na u\nar y\n\u0120w ould\nas s\nr y\n\u0120 \xE2\u0122\nc l\no ok\ne re\ns o\n\u0120 V\nig n\ni b\n\u0120of f\n\u0120t e\nv en\n\u0120 Y\ni le\no se\nit e\nor m\n\u01202 01\n\u0120re s\n\u0120m an\n\u0120p er\n\u0120o ther\nor d\nul t\n\u0120be en\n\u0120l ike\nas e\nan ce\nk s\nay s\now n\nen ce\n\u0120d is\nct ion\n\u0120an y\n\u0120a pp\n\u0120s p\nin t\nres s\nation s\na il\n\u0120 4\nic al\n\u0120the m\n\u0120he r\nou nt\n\u0120C h\n\u0120a r\n\u0120 if\n\u0120the re\n\u0120p e\n\u0120y ear\na v\n\u0120m y\n\u0120s ome\n\u0120whe n\nou gh\nac h\n\u0120th an\nr u\non d\nic k\n\u0120o ver\nve l\n\u0120 qu\n\u010A \u010A\n\u0120s c\nre at\nre e\n\u0120I t\nou nd\np ort\n\u0120al so\n\u0120p art\nf ter\n\u0120k n\n\u0120be c\n\u0120t ime\nen s\n\u0120 5\nop le\n\u0120wh at\n\u0120n o\nd u\nm er\nan g\n\u0120n ew\n-- --\n\u0120g et\nor y\nit ion\ning s\n\u0120j ust\n\u0120int o\n\u0120 0\nent s\no ve\nt e\n\u0120pe ople\n\u0120p re\n\u0120it s\n\u0120re c\n\u0120t w\ni an\nir st\nar k\nor s\n\u0120wor k\nad e\no b\n\u0120s he\n\u0120o ur\nw n\nin k\nl ic\n\u01201 9\n\u0120H e\nis h\nnd er\nau se\n\u0120h im\non s\n\u0120 [\n\u0120 ro\nf orm\ni ld\nat es\nver s\n\u0120on ly\no ll\n\u0120s pe\nc k\ne ll\nam p\n\u0120a cc\n\u0120b l\ni ous\nur n\nf t\no od\n\u0120h ow\nhe d\n\u0120 '\n\u0120a fter\na w\n\u0120at t\no v\nn e\n\u0120pl ay\ner v\nic t\n\u0120c ould\nit t\n\u0120a m\n\u0120f irst\n\u0120 
6\n\u0120a ct\n\u0120 $\ne c\nh ing\nu al\nu ll\n\u0120com m\no y\no ld\nc es\nat er\n\u0120f e\n\u0120be t\nw e\nif f\n\u0120tw o\noc k\n\u0120b ack\n) .\nid ent\n\u0120u nder\nrou gh\nse l\nx t\n\u0120m ay\nrou nd\n\
// This is grabbed from source code of https://beta.openai.com/tokenizer?view=bpe
var bpeRegex = /'s|'t|'re|'ve|'m|'ll|'d| ?(?:[A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0560-\u0588\u05D0-\u05EA\u05EF-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u0860-\u086A\u08A0-\u08B4\u08B6-\u08C7\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u09FC\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D04-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E86-\u0E8A\u0E8C-\u0EA3\u0EA5\u0EA7-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\
u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1878\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1C80-\u1C88\u1C90-\u1CBA\u1CBD-\u1CBF\u1CE9-\u1CEC\u1CEE-\u1CF3\u1CF5\u1CF6\u1CFA\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312F\u3131-\u318E\u31A0-\u31BF\u31F0-\u31FF\u3400-\u4DBF\u4E00-\u9FFC\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA7BF\uA7C2-\uA7CA\uA7F5-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA8FE\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uA9E0-\uA9E4\uA9E6-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB69\uAB70-\uABE2\uA
C00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\
// This is grabbed from source code of https://beta.openai.com/tokenizer?view=bpe
var encodings=/*#__PURE__*/JSON.parse("{\"0\":15,\"1\":16,\"2\":17,\"3\":18,\"4\":19,\"5\":20,\"6\":21,\"7\":22,\"8\":23,\"9\":24,\"10\":940,\"11\":1157,\"12\":1065,\"13\":1485,\"14\":1415,\"15\":1314,\"16\":1433,\"17\":1558,\"18\":1507,\"19\":1129,\"20\":1238,\"21\":2481,\"22\":1828,\"23\":1954,\"24\":1731,\"25\":1495,\"26\":2075,\"27\":1983,\"28\":2078,\"29\":1959,\"30\":1270,\"31\":3132,\"32\":2624,\"33\":2091,\"34\":2682,\"35\":2327,\"36\":2623,\"37\":2718,\"38\":2548,\"39\":2670,\"40\":1821,\"41\":3901,\"42\":3682,\"43\":3559,\"44\":2598,\"45\":2231,\"46\":3510,\"47\":2857,\"48\":2780,\"49\":2920,\"50\":1120,\"51\":4349,\"52\":4309,\"53\":4310,\"54\":4051,\"55\":2816,\"56\":3980,\"57\":3553,\"58\":3365,\"59\":3270,\"60\":1899,\"61\":5333,\"62\":5237,\"63\":5066,\"64\":2414,\"65\":2996,\"66\":2791,\"67\":3134,\"68\":3104,\"69\":3388,\"70\":2154,\"71\":4869,\"72\":4761,\"73\":4790,\"74\":4524,\"75\":2425,\"76\":4304,\"77\":3324,\"78\":3695,\"79\":3720,\"80\":1795,\"81\":6659,\"82\":6469,\"83\":5999,\"84\":5705,\"85\":5332,\"86\":4521,\"87\":5774,\"88\":3459,\"89\":4531,\"90\":3829,\"91\":6420,\"92\":5892,\"93\":6052,\"94\":5824,\"95\":3865,\"96\":4846,\"97\":5607,\"98\":4089,\"99\":2079,\"100\":3064,\"101\":8784,\"102\":15377,\"103\":15197,\"104\":13464,\"105\":13348,\"106\":15801,\"107\":15982,\"108\":15711,\"109\":14454,\"110\":11442,\"111\":16243,\"112\":14686,\"113\":16616,\"114\":16562,\"115\":15363,\"116\":18298,\"117\":17657,\"118\":16817,\"119\":16315,\"120\":10232,\"121\":19244,\"122\":18376,\"123\":10163,\"124\":17464,\"125\":11623,\"126\":19420,\"127\":16799,\"128\":12762,\"129\":18741,\"130\":12952,\"131\":22042,\"132\":19924,\"133\":16945,\"134\":19880,\"135\":17059,\"136\":20809,\"137\":19708,\"138\":20107,\"139\":20219,\"140\":15187,\"141\":23756,\"142\":23726,\"143\":21139,\"144\":18444,\"145\":18781,\"146\":20964,\"147\":20198,\"148\":18294,\"149\":19442,\"150\":8628,\"151\":24309,\"152\":17827,\"153\":21395,\"154\":21526,\"155\":18742,\"156\":21
599,\"157\":18458,\"158\":21273,\"159\":19707,\"160\":14198,\"161\":25948,\"162\":25061,\"163\":24136,\"164\":23237,\"165\":20986,\"166\":23055,\"167\":21940,\"168\":14656,\"169\":22172,\"170\":17279,\"171\":27192,\"172\":23628,\"173\":25399,\"174\":22985,\"175\":17430,\"176\":24096,\"177\":22413,\"178\":23188,\"179\":21738,\"180\":15259,\"181\":27057,\"182\":24294,\"183\":24839,\"184\":22883,\"185\":21652,\"186\":25096,\"187\":23451,\"188\":20356,\"189\":23362,\"190\":19782,\"191\":26492,\"192\":17477,\"193\":24943,\"194\":22913,\"195\":22186,\"196\":25272,\"197\":24991,\"198\":22337,\"199\":19104,\"200\":2167,\"201\":1264,\"202\":19004,\"203\":22416,\"204\":18638,\"205\":21261,\"206\":22136,\"207\":22745,\"208\":21315,\"209\":22567,\"210\":21536,\"211\":21895,\"212\":21777,\"213\":26427,\"214\":22291,\"215\":23349,\"216\":20666,\"217\":24591,\"218\":28727,\"219\":28896,\"220\":17572,\"221\":26115,\"222\":23148,\"223\":22047,\"224\":24137,\"225\":18182,\"226\":24909,\"227\":24403,\"228\":23815,\"229\":23539,\"230\":19214,\"231\":25667,\"232\":24339,\"233\":25429,\"234\":24409,\"235\":22370,\"236\":24940,\"237\":24693,\"238\":23721,\"239\":23516,\"240\":16102,\"241\":28872,\"242\":27877,\"243\":26660,\"244\":25707,\"245\":22995,\"246\":26912,\"247\":23753,\"248\":23045,\"249\":21626,\"250\":9031,\"251\":28072,\"252\":22800,\"253\":28592,\"254\":24970,\"255\":13381,\"256\":11645,\"257\":28676,\"258\":25600,\"259\":25191,\"260\":21719,\"261\":30057,\"262\":29119,\"263\":29558,\"264\":18897,\"265\":22980,\"266\":25540,\"267\":25674,\"268\":25022,\"269\":26276,\"270\":20233,\"271\":28977,\"272\":29807,\"273\":27367,\"274\":28857,\"275\":23195,\"276\":27988,\"277\":27019,\"278\":25870,\"279\":26050,\"280\":21033,\"281\":30368,\"282\":32568,\"283\":30290,\"284\":30336,\"285\":26279,\"286\":27033,\"287\":27800,\"288\":25270,\"289\":27693,\"290\":24369,\"291\":33551,\"292\":32759,\"293\":31675,\"294\":27696,\"295\":25710,\"296\":27137,\"297\":26561,\"298\":27728,\"299\":225
79,\"300\":6200,\"301\":18938,\"302\":22709,\"303\":22572,\"304\":21288,\"305\":22515,\"306\":2
/**
 * Builds the list of integers 0..y-1 and returns the part starting at index x,
 * i.e. [x, x + 1, ..., y - 1] for 0 <= x <= y.
 *
 * @param {number} x - Start index passed to `slice` on the 0..y-1 list.
 * @param {number} y - Exclusive upper bound (length of the underlying list).
 * @returns {number[]} The resulting integers.
 */
var range = function range(x, y) {
    var indices = Array(y).keys();
    return [...indices].slice(x);
};
/**
 * Returns the UTF-16 code unit of the first character of a string.
 *
 * @param {string} x - Input string.
 * @returns {number} Code unit at index 0 (`NaN` for an empty string).
 */
var ord = function ord(x) {
    var codeUnit = x.charCodeAt(0);
    return codeUnit;
};
/**
 * Returns the one-character string for a UTF-16 code unit.
 *
 * @param {number} n - Code unit value.
 * @returns {string} The corresponding single-character string.
 */
var chr = function chr(n) {
    var character = String.fromCharCode(n);
    return character;
};
var GPT3Tokenizer = /*#__PURE__*/function () {
    /**
     * Finds the pair with the lowest merge rank among the candidate pairs.
     *
     * @param {{get: function(Array<string>): (number|undefined)}} bpeRanks - Rank lookup keyed by [first, second].
     * @param {Set<Array<string>>} pairs - Candidate adjacent pairs.
     * @returns {Array<string>|null} The best-ranked pair, or null when no pair has a rank.
     */
    function findLowestRankedPair(bpeRanks, pairs) {
        var bestPair = null;
        var bestRank = Infinity;
        pairs.forEach(function (pair) {
            var rank = bpeRanks.get(pair);
            if (typeof rank === 'number' && rank < bestRank) {
                bestPair = pair;
                bestRank = rank;
            }
        });
        return bestPair;
    }

    /**
     * Byte-pair-encoding tokenizer.
     *
     * This is deliberately a plain constructor function rather than an ES `class`:
     * the subclass in this file invokes it as `Base.call(this, options)`, which a
     * `class` constructor would reject.
     *
     * UTF-8 conversion is delegated to `this.encodeUtf8(text)` and
     * `this.decodeUtf8(bytes)`, which are not defined here and must be supplied
     * by a subclass (or assigned on the instance) before `encode`/`decode` are used.
     *
     * @param {{type?: string}} [options] - `type: 'codex'` enables the 24 extra
     *   merged-space tokens; any other value (or no options) uses the base vocabulary.
     */
    function GPT3Tokenizer(options) {
        // Tolerate a missing options object instead of throwing on `options.type`.
        var settings = options || {};
        var mergedSpaces = settings.type === 'codex' ? 24 : 0;
        // Codex instances add entries to their encodings table, so they work on a
        // private copy; the module-level table stays untouched for other instances.
        this.encodings = mergedSpaces > 0 ? Object.assign({}, encodings) : encodings;
        this.vocab = bpeVocab;
        this.nMergedSpaces = mergedSpaces;
        this.nVocab = 50257 + this.nMergedSpaces;
        this.decodings = {};
        this.bpeRanks = new ArrayKeyedMap();
        this.byteEncoder = new Map();
        this.byteDecoder = new Map();
        this.cache = {};
        this.initialize();
    }
    var _proto = GPT3Tokenizer.prototype;

    /**
     * Builds the lookup tables: merge ranks, token-id decodings and the
     * byte <-> printable-character maps.
     *
     * @throws {Error} If the vocabulary text is implausibly short.
     */
    _proto.initialize = function initialize() {
        var _this = this;
        if (this.vocab.length < 100) {
            throw new Error('Tokenizer vocab file did not load correctly');
        }
        var vocabLines = this.vocab.split('\n');
        // The first line (version header) and the last line are not merge rules.
        var bpeMerges = vocabLines.slice(1, vocabLines.length - 1).map(function (line) {
            return line.split(/(\s+)/).filter(function (part) {
                return part.trim().length > 0;
            });
        });
        // add merged spaces for codex tokenizer
        if (this.nMergedSpaces > 0) {
            for (var i = 1; i < this.nMergedSpaces; i++) {
                for (var j = 1; j < this.nMergedSpaces; j++) {
                    if (i + j <= this.nMergedSpaces) {
                        bpeMerges.push(["\u0120".repeat(i), "\u0120".repeat(j)]);
                    }
                }
            }
            for (var k = 0; k < this.nMergedSpaces; k++) {
                this.encodings["\u0120".repeat(k + 2)] = this.nVocab - this.nMergedSpaces + k;
            }
        }
        // Invert encodings (token text -> id) into decodings (id -> token text).
        var tokenTexts = Object.keys(this.encodings);
        for (var t = 0; t < tokenTexts.length; t++) {
            var tokenText = tokenTexts[t];
            this.decodings[this.encodings[tokenText]] = tokenText;
        }
        this.byteEncoder = this.bytesToUnicode();
        this.byteEncoder.forEach(function (value, key) {
            _this.byteDecoder.set(value, key);
        });
        // A merge rule's rank is its position in the merge list.
        this.zip(this.bpeRanks, bpeMerges, range(0, bpeMerges.length));
    };

    /**
     * Stores `x[i] -> y[i]` into `result` for every index of `x`.
     *
     * @param {{set: function(*, *): *}} result - Map-like target.
     * @param {Array} x - Keys.
     * @param {Array} y - Values, aligned with `x`.
     * @returns {*} The `result` argument.
     */
    _proto.zip = function zip(result, x, y) {
        x.forEach(function (_, idx) {
            result.set(x[idx], y[idx]);
        });
        return result;
    };

    /**
     * Builds the byte -> character table used to spell arbitrary bytes as
     * characters: bytes in '!'..'~', '\xa1'..'\xac' and '\xae'..'\xff' map to the
     * character with the same code; every other byte maps to code 256 + n, where
     * n counts those remaining bytes in ascending order.
     *
     * @returns {Map<number, string>} Map from byte value (0-255) to a single character.
     */
    _proto.bytesToUnicode = function bytesToUnicode() {
        var bs = range(ord('!'), ord('~') + 1).concat(range(ord('\xa1'), ord('\xac') + 1), range(ord('\xae'), ord('\xff') + 1));
        var cs = bs.slice();
        var n = 0;
        for (var b = 0; b < Math.pow(2, 8); b++) {
            if (!bs.includes(b)) {
                bs.push(b);
                cs.push(Math.pow(2, 8) + n);
                n = n + 1;
            }
        }
        cs = cs.map(function (c) {
            return chr(c);
        });
        var result = new Map();
        this.zip(result, bs, cs);
        return result;
    };

    /**
     * Collects every adjacent pair of symbols in a word.
     *
     * @param {string[]} word - Sequence of symbols.
     * @returns {Set<Array<string>>} Set of [left, right] arrays, one per adjacent
     *   position (arrays are distinct objects, so repeated pairs are not collapsed).
     */
    _proto.getPairs = function getPairs(word) {
        var pairs = new Set();
        var prevChar = word[0];
        for (var i = 1; i < word.length; i++) {
            var _char = word[i];
            pairs.add([prevChar, _char]);
            prevChar = _char;
        }
        return pairs;
    };

    /**
     * Applies the merge rules to one byte-encoded token.
     *
     * @param {string} token - Token already mapped through `byteEncoder`.
     * @returns {string} The resulting symbols joined by single spaces; a token
     *   with fewer than two characters is returned unchanged.
     */
    _proto.bpe = function bpe(token) {
        if (Object.prototype.hasOwnProperty.call(this.cache, token)) {
            return this.cache[token];
        }
        var word = token.split('');
        var pairs = this.getPairs(word);
        if (!pairs || pairs.size === 0) {
            return token;
        }
        while (true) {
            var bigram = findLowestRankedPair(this.bpeRanks, pairs);
            if (bigram === null) {
                // No remaining pair corresponds to a merge rule.
                break;
            }
            var first = bigram[0];
            var second = bigram[1];
            var merged = [];
            var i = 0;
            while (i < word.length) {
                var j = word.indexOf(first, i);
                if (j === -1) {
                    Array.prototype.push.apply(merged, word.slice(i));
                    break;
                }
                Array.prototype.push.apply(merged, word.slice(i, j));
                i = j;
                if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
                    merged.push(first + second);
                    i = i + 2;
                } else {
                    merged.push(word[i]);
                    i = i + 1;
                }
            }
            word = merged;
            if (word.length === 1) {
                break;
            }
            pairs = this.getPairs(word);
        }
        var joined = word.join(' ');
        this.cache[token] = joined;
        return joined;
    };

    /**
     * Tokenizes text.
     *
     * @param {string} text - Input text.
     * @returns {{bpe: number[], text: string[]}} `bpe` holds the token ids and
     *   `text` the decoded string of each individual token, index-aligned.
     */
    _proto.encode = function encode(text) {
        var _this2 = this;
        var bpeTokens = [];
        var texts = [];
        var matches = text.match(bpeRegex) || [];
        matches.forEach(function (match) {
            var token = Array.from(_this2.encodeUtf8(match)).map(function (byte) {
                return _this2.byteEncoder.get(byte);
            }).join('');
            // Append in place: repeated concat() re-copies the accumulated result
            // for every match, which is quadratic on long inputs.
            _this2.bpe(token).split(' ').forEach(function (piece) {
                var id = _this2.encodings[piece];
                bpeTokens.push(id);
                texts.push(_this2.decode([id]));
            });
        });
        return {
            bpe: bpeTokens,
            text: texts
        };
    };

    /**
     * Converts token ids back to text.
     *
     * @param {number[]} tokens - Token ids.
     * @returns {string} Result of `decodeUtf8` over the bytes spelled by the tokens.
     */
    _proto.decode = function decode(tokens) {
        var _this3 = this;
        var text = tokens.map(function (x) {
            return _this3.decodings[x];
        }).join('');
        return this.decodeUtf8(new Uint8Array(text.split('').map(function (x) {
            return _this3.byteDecoder.get(x);
        })));
    };
    return GPT3Tokenizer;
}();
/**
 * Tokenizer variant that supplies UTF-8 conversion through the global
 * TextEncoder / TextDecoder classes.
 *
 * Construction runs the base tokenizer constructor first and then attaches a
 * `textEncoder` and a `textDecoder` to the instance.
 */
var GPT3BrowserTokenizer = /*#__PURE__*/function (BaseTokenizer) {
    _inheritsLoose(GPT3BrowserTokenizer, BaseTokenizer);

    /**
     * @param {object} options - Forwarded unchanged to the base constructor.
     */
    function GPT3BrowserTokenizer(options) {
        var instance = BaseTokenizer.call(this, options) || this;
        instance.textEncoder = new TextEncoder();
        instance.textDecoder = new TextDecoder();
        return instance;
    }

    /**
     * Encodes a string with the instance's TextEncoder.
     *
     * @param {string} text - Text to encode.
     * @returns {Uint8Array} Encoded bytes.
     */
    GPT3BrowserTokenizer.prototype.encodeUtf8 = function encodeUtf8(text) {
        var encoder = this.textEncoder;
        return encoder.encode(text);
    };

    /**
     * Decodes bytes with the instance's TextDecoder.
     *
     * @param {Uint8Array} bytes - Bytes to decode.
     * @returns {string} Decoded text.
     */
    GPT3BrowserTokenizer.prototype.decodeUtf8 = function decodeUtf8(bytes) {
        var decoder = this.textDecoder;
        return decoder.decode(bytes);
    };

    return GPT3BrowserTokenizer;
}(GPT3Tokenizer);
export default GPT3BrowserTokenizer;
//# sourceMappingURL=gpt3-tokenizer.js.map
export {
GPT3BrowserTokenizer,
GPT3Tokenizer,
}