From 6b172306f623d37c94fa0add67c1ee1798e25d3f Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 21 Jun 2022 13:15:46 -0400 Subject: [PATCH 1/6] move_model_to_devices no longer crashes if you don't have accelerate --- aiserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index e3e9f758..de6be946 100644 --- a/aiserver.py +++ b/aiserver.py @@ -633,8 +633,9 @@ def move_model_to_devices(model): generator = model.generate return + import breakmodel + if(utils.HAS_ACCELERATE): - import breakmodel disk_blocks = breakmodel.disk_blocks gpu_blocks = breakmodel.gpu_blocks ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks) From ea7d278ff4360e11bc249c76184ba2e8e646b9dc Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 21 Jun 2022 13:16:45 -0400 Subject: [PATCH 2/6] Fix 20B TPU model --- tpu_mtj_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index bc228998..db31b902 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -1119,6 +1119,7 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo return old_encode(s).ids return encode tokenizer.encode = new_encode(tokenizer.encode) + tokenizer._koboldai_header = [] elif not hf_checkpoint: if not isinstance(params["tokenizer_class"], str) or not any(params["tokenizer_class"].endswith(s) for s in ("Tokenizer", "TokenizerFast")): raise ValueError("`tokenizer_class` must be a string ending in 'Tokenizer' or 'TokenizerFast'") From 0ea4fa9c87a0ec3c6faa40c8214ca97c3c605579 Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 21 Jun 2022 14:35:52 -0400 Subject: [PATCH 3/6] Automatically calculate badwords and pad_token_id --- aiserver.py | 12 +++++++----- tpu_mtj_backend.py | 7 ++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index de6be946..dc16660a 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1604,9 +1604,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") vars.model_type = "gpt_neo" - if(vars.model_type == "opt"): - vars.badwordsids = vars.badwordsids_opt - if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): loadmodelsettings() loadsettings() @@ -1998,6 +1995,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename)) shutil.rmtree("cache/") + if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj", "xglm")): + vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in k for c in "<>[]")] + patch_causallm(model.__class__) if(vars.hascuda): @@ -2148,8 +2148,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and (not vars.custmodpth or not os.path.isdir(vars.custmodpth)): raise FileNotFoundError(f"The specified model path {repr(vars.custmodpth)} is not the path to a valid folder") import tpu_mtj_backend - if(vars.model == "TPUMeshTransformerGPTNeoX" or vars.model_type == "opt"): - tpu_mtj_backend.pad_token_id = 1 + if(vars.model == "TPUMeshTransformerGPTNeoX"): + tpu_mtj_backend.pad_token_id = 2 tpu_mtj_backend.vars = vars tpu_mtj_backend.warper_callback = tpumtjgenerate_warper_callback tpu_mtj_backend.stopping_callback = tpumtjgenerate_stopping_callback @@ -2162,6 +2162,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tpu_mtj_backend.load_model(vars.custmodpth, hf_checkpoint=vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and vars.use_colab_tpu, **vars.modelconfig) vars.modeldim = int(tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])) tokenizer = tpu_mtj_backend.tokenizer + if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj", "xglm")): + vars.badwordsids = [[str(v)] for k, v in tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]")] else: loadsettings() diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index db31b902..a0e017d3 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -1018,7 +1018,12 @@ def read_neox_checkpoint(state, path, config, checkpoint_shards=2): def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpoint=False, **kwargs) -> None: - global thread_resources_env, seq, tokenizer, network, params + global thread_resources_env, seq, tokenizer, network, params, pad_token_id + + if "pad_token_id" in kwargs: + pad_token_id = kwargs["pad_token_id"] + elif "eos_token_id" in kwargs: + pad_token_id = kwargs["eos_token_id"] if not hasattr(vars, "sampler_order") or not vars.sampler_order: vars.sampler_order = utils.default_sampler_order.copy() From 91643be10a49b5b9362dfcef86576f57e99ba383 Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 21 Jun 2022 15:03:43 -0400 Subject: [PATCH 4/6] Change soft prompt implementation to a more universal one --- aiserver.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/aiserver.py b/aiserver.py index dc16660a..8f543865 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1247,18 +1247,20 @@ def get_oai_models(key): # Function to patch transformers to use our soft prompt -def patch_causallm(cls): - if(getattr(cls, "_koboldai_patch_causallm_patched", False)): - return - old_forward = cls.forward - def new_causallm_forward(self, *args, **kwargs): - input_ids = kwargs.get('input_ids').to(self.device) +def patch_causallm(model): + from torch.nn import Embedding + if(getattr(Embedding, "_koboldai_patch_causallm_model", None)): + Embedding._koboldai_patch_causallm_model = model + return model + old_embedding_call = Embedding.__call__ + def new_embedding_call(self, input_ids, *args, **kwargs): + if(Embedding._koboldai_patch_causallm_model.get_input_embeddings() is not self): + return old_embedding_call(self, input_ids, *args, **kwargs) assert input_ids is not None - kwargs['input_ids'] = None if(vars.sp is not None): - shifted_input_ids = input_ids - self.config.vocab_size - input_ids.clamp_(max=self.config.vocab_size-1) - inputs_embeds = self.get_input_embeddings()(input_ids) + shifted_input_ids = input_ids - model.config.vocab_size + input_ids.clamp_(max=model.config.vocab_size-1) + inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs) if(vars.sp is not None): vars.sp = vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) inputs_embeds = torch.where( @@ -1266,13 +1268,10 @@ def patch_causallm(cls): vars.sp[shifted_input_ids.clamp(min=0)], inputs_embeds, ) - if(hasattr(self, "model") and hasattr(self.model, "embed_scale")): - inputs_embeds *= self.model.embed_scale - kwargs['inputs_embeds'] = inputs_embeds - return old_forward(self, *args, **kwargs) - cls.forward = new_causallm_forward - cls._koboldai_patch_causallm_patched = True - return cls + return inputs_embeds + Embedding.__call__ = new_embedding_call + Embedding._koboldai_patch_causallm_model = model + return model def patch_transformers(): @@ -1864,7 +1863,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal else: model = model.to('cpu').float() generator = model.generate - patch_causallm(model.__class__) + patch_causallm(model) # Use the Generic implementation else: lowmem = maybe_low_cpu_mem_usage() @@ -1998,7 +1997,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj", "xglm")): vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in k for c in "<>[]")] - patch_causallm(model.__class__) + patch_causallm(model) if(vars.hascuda): if(vars.usegpu): From 7e0ded6b476404f407b55be644c1cf13f3154785 Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 21 Jun 2022 15:12:55 -0400 Subject: [PATCH 5/6] Typo fix --- aiserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 8f543865..a0d4d688 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1994,7 +1994,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename)) shutil.rmtree("cache/") - if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj", "xglm")): + if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj")): vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in k for c in "<>[]")] patch_causallm(model) @@ -2161,7 +2161,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tpu_mtj_backend.load_model(vars.custmodpth, hf_checkpoint=vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and vars.use_colab_tpu, **vars.modelconfig) vars.modeldim = int(tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])) tokenizer = tpu_mtj_backend.tokenizer - if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj", "xglm")): + if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj")): vars.badwordsids = [[str(v)] for k, v in tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]")] else: loadsettings() From 8593bf339b9d387ac77daceeee4fdcc234cf5f8f Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 21 Jun 2022 15:36:25 -0400 Subject: [PATCH 6/6] Another typo fix --- aiserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index a0d4d688..4bbd89b0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1995,7 +1995,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal shutil.rmtree("cache/") if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj")): - vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in k for c in "<>[]")] + vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]")] patch_causallm(model) @@ -2162,7 +2162,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal vars.modeldim = int(tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])) tokenizer = tpu_mtj_backend.tokenizer if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj")): - vars.badwordsids = [[str(v)] for k, v in tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]")] + vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]")] else: loadsettings()