From 070cfd339a27ee1ef62fd74a495fb34b80a4920c Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sat, 19 Aug 2023 17:40:23 -0700
Subject: [PATCH 1/7] Strip the eos token from exllama generations.

The end-of-sequence (eos) token indicates the end of a generation.
When a token sequence containing the eos token is decoded, an extra
(incorrect) space is inserted at the beginning of the generation.
To avoid this, strip the eos token out of the result before returning
it. The eos token was already being stripped later, so this doesn't
change the output except to avoid the spurious leading space.
---
 modeling/inference_models/exllama/class.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index e3c7a874..508c6a79 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -282,6 +282,7 @@ class model_backend(InferenceModel):
 
         self.generator.gen_begin_reuse(gen_in)
 
+        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             logits[:, :, self.tokenizer.bos_token_id] = -10000.0
@@ -300,15 +301,19 @@ class model_backend(InferenceModel):
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if token.item() == self.tokenizer.eos_token_id: break
+            if token.item() == self.tokenizer.eos_token_id:
+                trim_count = 1
+                break
 
-        utils.koboldai_vars.generated_tkns = max_new
+        utils.koboldai_vars.generated_tkns = max_new - trim_count
+        if trim_count > 0:
+            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
+        else:
+            seq = self.generator.sequence[:, gen_in.size(1):]
 
         return GenerationResult(
             model=self,
-            out_batches=np.array(
-                self.generator.sequence[:, gen_in.size(1):],
-            ),
+            out_batches=np.array(seq,),
             prompt=prompt_tokens,
             is_whole_generation=True,
             single_line=single_line,

From b96d5d8646b320096f06fc65a291469d8ca9a5dd Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Tue, 22 Aug 2023 23:06:16 -0700
Subject: [PATCH 2/7] Add stopper hooks support to exllama

---
 modeling/inference_models/exllama/class.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 508c6a79..2540d3f4 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -91,7 +91,7 @@ class model_backend(InferenceModel):
         self.capabilties = ModelCapabilities(
             embedding_manipulation=False,
             post_token_hooks=True,
-            stopper_hooks=False,
+            stopper_hooks=True,
             post_token_probs=False,
         )
 
@@ -305,6 +305,15 @@ class model_backend(InferenceModel):
                 trim_count = 1
                 break
 
+            # Apply stoppers
+            do_stop = False
+            for stopper in self.stopper_hooks:
+                do_stop = stopper(self, self.generator.sequence)
+                if do_stop:
+                    break
+            if do_stop:
+                break
+
         utils.koboldai_vars.generated_tkns = max_new - trim_count
         if trim_count > 0:
             seq = self.generator.sequence[:, gen_in.size(1):-trim_count]

From b7e38b47570cb910d4b5b9c853985e6d3fba9107 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sat, 26 Aug 2023 22:26:26 -0700
Subject: [PATCH 3/7] Resample to work around a bug in torch.multinomial

There is a bug in PyTorch 2.0.1 that allows torch.multinomial to
sometimes choose elements that have zero probability. Since this is
uncommon we can continue to use torch.multinomial as long as we verify
that the results are valid. If they aren't, try again until the
probability of each selected token is positive.
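As an illustration outside the patch, the resampling workaround looks roughly like this as a standalone sketch (the tiny probability tensor and the zero-weight indices are invented for the example; the retry bound of 100 mirrors the hunk below):

```python
import torch

# Softmaxed scores for a toy vocabulary; indices 2 and 3 have exactly zero probability.
scores = torch.tensor([[0.6, 0.4, 0.0, 0.0]])

# torch.multinomial in PyTorch 2.0.1 can, rarely, return an index whose weight is
# zero (https://github.com/pytorch/pytorch/issues/48841), so resample until every
# selected index has a positive probability.
for _ in range(100):
    token = torch.multinomial(scores, 1)
    if (scores.gather(1, token) > 0).all():
        break

print(token.item())  # always 0 or 1, never a zero-probability index
```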
---
 modeling/inference_models/exllama/class.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 2540d3f4..3fb8d252 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -293,7 +293,14 @@ class model_backend(InferenceModel):
 
             scores = torch.softmax(scores, dim=-1)
 
-            token = torch.multinomial(scores, 1)
+            # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841)
+            # With low probability, multinomial can return an element with zero weight. Since this
+            # happens infrequently, just sample repeatedly until all tokens have non-zero probability.
+            for _ in range(100):
+                token = torch.multinomial(scores, 1)
+                # Verify that all selected tokens correspond to positive probabilities.
+                if (scores.gather(1, token) > 0).all():
+                    break
 
             self.generator.gen_accept_token(token)
 
@@ -301,7 +308,7 @@ class model_backend(InferenceModel):
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if token.item() == self.tokenizer.eos_token_id:
+            if (token == self.tokenizer.eos_token_id).any():
                 trim_count = 1
                 break
 

From 08ff7c138c35e344819acbd82fa18e88732e08a4 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sun, 27 Aug 2023 16:34:52 -0700
Subject: [PATCH 4/7] Add the eos token to exllama bad words.

The bos token was already hardcoded as a bad word id. Store badwords
in a list and iterate over them during generation. Add the Llama eos
token to the list of bad words. Also support "single line mode", which
adds the newline token (13) to badwords.
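A rough standalone sketch of the masking described above (the vocabulary size and the bad-word ids are invented for the example; the -10000.0 sentinel and the newline id 13 come from the patch):

```python
import torch

# Toy logits with shape (batch, seq, vocab), as returned by one forward step.
logits = torch.randn(1, 1, 32)

bad_words_ids = [1, 2]   # stand-ins for the bos and eos token ids
single_line = True
newline_tokens = [13]    # Llama's newline token id
if single_line:
    bad_words_ids = list(bad_words_ids) + newline_tokens

# Push every banned token far below the rest so softmax gives it ~zero probability.
for bad_word_id in bad_words_ids:
    logits[:, :, bad_word_id] = -10000.0

probs = torch.softmax(logits[0, -1, :], dim=-1)
print(probs[bad_words_ids])  # effectively zero for all banned ids
```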
---
 modeling/inference_models/exllama/class.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 3fb8d252..737afa88 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -95,6 +95,9 @@ class model_backend(InferenceModel):
             post_token_probs=False,
         )
 
+        # We need to wait until the tokenizer is available to fill this in.
+        self.badwordsids = []
+
     def is_valid(self, model_name, model_path, menu_path):
         gptq_model, _ = load_model_gptq_settings(model_path)
         try:
@@ -119,6 +122,7 @@ class model_backend(InferenceModel):
         self.model = self._get_model(self.get_local_model_path(), {})
         self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        self.badwordsids = [self.tokenizer.bos_token_id, self.tokenizer.eos_token_id]
 
         self.cache = ExLlamaCache(self.model)
 
         self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache)
@@ -207,6 +211,10 @@ class model_backend(InferenceModel):
                 return result
             object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
 
+        # Cache the newline token (for single line mode)
+        # Since there is only one Llama token containing newline, just encode \n
+        self.newline_tokens = self.tokenizer.encode("\n")
+
     def unload(self):
         self.model_config = None
 
@@ -275,6 +283,10 @@ class model_backend(InferenceModel):
         if seed:
             torch.manual_seed(seed)
 
+        bad_words_ids = self.badwordsids
+        if single_line:
+            bad_words_ids = list(bad_words_ids) + self.newline_tokens
+
         if not isinstance(prompt_tokens, torch.Tensor):
             gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
         else:
@@ -285,7 +297,8 @@ class model_backend(InferenceModel):
         trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
-            logits[:, :, self.tokenizer.bos_token_id] = -10000.0
+            for bad_word_id in bad_words_ids:
+                logits[:, :, bad_word_id] = -10000.0
 
             logits = torch.unsqueeze(logits[0, -1, :], 0)
 

From 554af7b1754fa2e574fbbcfa2a612b13969bda63 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sun, 27 Aug 2023 23:56:02 -0700
Subject: [PATCH 5/7] Modify exllama to load unrenamed gptq quantized models

Read config.json and enable exllama loading if the model has a
`quantization_config` with a `quant_method` of `gptq`. Note that this
implementation is limited and only supports model.safetensors. That
said, this supports loading popular gptq quantized models without
renaming or symlinking the model file.
---
 modeling/inference_models/exllama/class.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 737afa88..67f54073 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -49,9 +49,16 @@ def load_model_gptq_settings(path):
 
     gptq_model = False
     gptq_file = False
+    gptq_in_config = False
+
+    try:
+        if js['quantization_config']['quant_method'] == "gptq":
+            gptq_in_config = True
+    except:
+        pass
 
     gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors"))
-    if "gptq_bits" in js:
+    if "gptq_bits" in js or gptq_in_config:
         gptq_model = True
         gptq_file = os.path.join(path, "model.safetensors")
     elif gptq_legacy_files:
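Outside the backend, the detection above can be sketched as a small helper (the function name is illustrative, not the patch's; it assumes a model directory containing config.json, and what the legacy branch returns is a guess since that code is outside the hunk):

```python
import glob
import json
import os

def detect_gptq(path):
    """Return (is_gptq, weights_file) for a model directory, mirroring the logic above."""
    with open(os.path.join(path, "config.json")) as f:
        js = json.load(f)

    # Newer gptq checkpoints advertise themselves via quantization_config in config.json.
    gptq_in_config = False
    try:
        if js["quantization_config"]["quant_method"] == "gptq":
            gptq_in_config = True
    except (KeyError, TypeError):
        pass

    # Older checkpoints were detected by a renamed *4bit*.safetensors file.
    gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors"))

    if "gptq_bits" in js or gptq_in_config:
        return True, os.path.join(path, "model.safetensors")
    if gptq_legacy_files:
        return True, gptq_legacy_files[0]
    return False, None
```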
From 2c48e05f7c2eb053e7a2d196f3b24fd5335492ab Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Mon, 28 Aug 2023 09:52:31 -0700
Subject: [PATCH 6/7] Add exllama dependency back to requirements.

---
 environments/huggingface.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index 004c7ecc..a7e43892 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -54,3 +54,5 @@ dependencies:
     - einops
     - peft==0.3.0
     - scipy
+    - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
+    - exllama==0.0.6

From d6ed75f9938e81072362856ad2649a20aa5f59a5 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Tue, 29 Aug 2023 23:08:51 -0700
Subject: [PATCH 7/7] Hook up use_default_badwordids in exllama

Use the value of the use_default_badwordids setting to configure
bad_words_ids. Also add the square bracket tokens to bad_words_ids if
the use_default_badwordids setting is True. Fix an issue with
attempting to use the tokenizer too early, and fix an exception
populating Lua bridge data when zero tokens are generated, which can
now happen if use_default_badwordids is False and the first token
generated is EOS.
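The square-bracket filtering mentioned above amounts to scanning the tokenizer vocabulary for any token whose string form contains `[` or `]`; a self-contained sketch with a toy vocabulary (a real backend would enumerate the Llama tokenizer's vocab instead):

```python
# Toy stand-in for a tokenizer vocabulary: list index == token id.
vocab = ["<s>", "</s>", "\n", "Hello", "[", "]", "[INST", "world"]

# Collect every token id whose surface form contains a square bracket.
bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
print(bracket_tokens)  # [4, 5, 6]

# When use_default_badwordids is on, these ids get appended to bad_words_ids.
```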
---
 aiserver.py                                |  3 ++-
 modeling/inference_models/exllama/class.py | 28 +++++++++-------------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 40ff9c5a..429d66c1 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -3918,7 +3918,8 @@ def generate(txt, minimum, maximum, found_entries=None, gen_mode=GenerationMode.
             return
 
     for i in range(koboldai_vars.numseqs):
-        koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(genout[i, -1].item())
+        if len(genout[i]) > 0:
+            koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(genout[i, -1].item())
         koboldai_vars.lua_koboldbridge.outputs[i+1] = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:]))
 
     execute_outmod()
diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 67f54073..a93d3dee 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -102,9 +102,6 @@ class model_backend(InferenceModel):
             post_token_probs=False,
         )
 
-        # We need to wait until the tokenizer is available to fill this in.
-        self.badwordsids = []
-
     def is_valid(self, model_name, model_path, menu_path):
         gptq_model, _ = load_model_gptq_settings(model_path)
         try:
@@ -129,7 +126,6 @@ class model_backend(InferenceModel):
         self.model = self._get_model(self.get_local_model_path(), {})
         self.tokenizer = self._get_tokenizer(self.get_local_model_path())
-        self.badwordsids = [self.tokenizer.bos_token_id, self.tokenizer.eos_token_id]
 
         self.cache = ExLlamaCache(self.model)
 
         self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache)
@@ -221,6 +217,8 @@ class model_backend(InferenceModel):
         # Cache the newline token (for single line mode)
         # Since there is only one Llama token containing newline, just encode \n
         self.newline_tokens = self.tokenizer.encode("\n")
+        self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
+        self.tokenizer._koboldai_header = self.tokenizer.encode("")
 
     def unload(self):
         self.model_config = None
@@ -290,9 +288,12 @@ class model_backend(InferenceModel):
         if seed:
             torch.manual_seed(seed)
 
-        bad_words_ids = self.badwordsids
+        bad_words_ids = [self.tokenizer.bos_token_id]
+        if utils.koboldai_vars.use_default_badwordids:
+            bad_words_ids.append(self.tokenizer.eos_token_id)
+            bad_words_ids.extend(self.bracket_tokens)
         if single_line:
-            bad_words_ids = list(bad_words_ids) + self.newline_tokens
+            bad_words_ids.extend(self.newline_tokens)
 
         if not isinstance(prompt_tokens, torch.Tensor):
             gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
@@ -301,7 +302,6 @@ class model_backend(InferenceModel):
 
         self.generator.gen_begin_reuse(gen_in)
 
-        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             for bad_word_id in bad_words_ids:
@@ -322,16 +322,15 @@ class model_backend(InferenceModel):
                 if (scores.gather(1, token) > 0).all():
                     break
 
+            if (token == self.tokenizer.eos_token_id).any():
+                break
+
             self.generator.gen_accept_token(token)
 
             self._post_token_gen(self.generator.sequence)
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if (token == self.tokenizer.eos_token_id).any():
-                trim_count = 1
-                break
-
             # Apply stoppers
             do_stop = False
             for stopper in self.stopper_hooks:
@@ -341,11 +340,7 @@ class model_backend(InferenceModel):
             if do_stop:
                 break
 
-        utils.koboldai_vars.generated_tkns = max_new - trim_count
-        if trim_count > 0:
-            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
-        else:
-            seq = self.generator.sequence[:, gen_in.size(1):]
+        seq = self.generator.sequence[:, gen_in.size(1):]
 
         return GenerationResult(
             model=self,
@@ -365,7 +360,6 @@ class model_backend(InferenceModel):
 
     def _get_tokenizer(self, location: str):
         tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location))
-        tokenizer._koboldai_header = tokenizer.encode("")
         return tokenizer
 
     def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
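For completeness, the stopper-hook contract used by this backend is just a callable invoked each step as stopper(model, sequence) that returns a truthy value to halt generation; a hypothetical hook (the name and limit are invented for the example) could look like this:

```python
import torch

def stop_after_n_tokens(model, sequence, limit=32):
    # `sequence` is the generator's running token tensor with shape (batch, seq_len);
    # returning True asks the generation loop to break out early.
    return sequence.shape[-1] >= limit

# The backend iterates model.stopper_hooks every step, so registering the hook is
# just an append (sketch; assumes `model` is a loaded exllama backend):
#     model.stopper_hooks.append(stop_after_n_tokens)
print(stop_after_n_tokens(None, torch.zeros(1, 40, dtype=torch.long)))  # True
```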