From 070cfd339a27ee1ef62fd74a495fb34b80a4920c Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sat, 19 Aug 2023 17:40:23 -0700
Subject: [PATCH 1/7] Strip the eos token from exllama generations.

The end-of-sequence (eos) token indicates the end of a generation.
When a token sequence containing the eos token is decoded, an extra
(incorrect) space is inserted at the beginning of the generation.
To avoid this, strip the eos token out of the result before returning
it. The eos token was already being stripped later, so this doesn't
change the output except to avoid the spurious leading space.
---
 modeling/inference_models/exllama/class.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index e3c7a874..508c6a79 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -282,6 +282,7 @@ class model_backend(InferenceModel):
 
         self.generator.gen_begin_reuse(gen_in)
 
+        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             logits[:, :, self.tokenizer.bos_token_id] = -10000.0
@@ -300,15 +301,19 @@ class model_backend(InferenceModel):
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if token.item() == self.tokenizer.eos_token_id: break
+            if token.item() == self.tokenizer.eos_token_id:
+                trim_count = 1
+                break
 
-        utils.koboldai_vars.generated_tkns = max_new
+        utils.koboldai_vars.generated_tkns = max_new - trim_count
+        if trim_count > 0:
+            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
+        else:
+            seq = self.generator.sequence[:, gen_in.size(1):]
 
         return GenerationResult(
             model=self,
-            out_batches=np.array(
-                self.generator.sequence[:, gen_in.size(1):],
-            ),
+            out_batches=np.array(seq,),
             prompt=prompt_tokens,
             is_whole_generation=True,
             single_line=single_line,

From b96d5d8646b320096f06fc65a291469d8ca9a5dd Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Tue, 22 Aug 2023 23:06:16 -0700
Subject: [PATCH 2/7] Add stopper hooks support to exllama

---
 modeling/inference_models/exllama/class.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 508c6a79..2540d3f4 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -91,7 +91,7 @@ class model_backend(InferenceModel):
         self.capabilties = ModelCapabilities(
             embedding_manipulation=False,
             post_token_hooks=True,
-            stopper_hooks=False,
+            stopper_hooks=True,
             post_token_probs=False,
         )
 
@@ -305,6 +305,15 @@ class model_backend(InferenceModel):
                 trim_count = 1
                 break
 
+            # Apply stoppers
+            do_stop = False
+            for stopper in self.stopper_hooks:
+                do_stop = stopper(self, self.generator.sequence)
+                if do_stop:
+                    break
+            if do_stop:
+                break
+
         utils.koboldai_vars.generated_tkns = max_new - trim_count
         if trim_count > 0:
             seq = self.generator.sequence[:, gen_in.size(1):-trim_count]

From b7e38b47570cb910d4b5b9c853985e6d3fba9107 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sat, 26 Aug 2023 22:26:26 -0700
Subject: [PATCH 3/7] Resample to work around a bug in torch.multinomial

There is a bug in PyTorch 2.0.1 that allows torch.multinomial to
sometimes choose elements that have zero probability. Since this is
uncommon we can continue to use torch.multinomial as long as we verify
that the results are valid. If they aren't, try again until the
probability of each selected token is positive.
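As an illustration outside the patch, the resampling workaround looks roughly like this as a standalone sketch (the tiny probability tensor and the zero-weight indices are invented for the example; the retry bound of 100 mirrors the hunk below):

```python
import torch

# Softmaxed scores for a toy vocabulary; indices 2 and 3 have exactly zero probability.
scores = torch.tensor([[0.6, 0.4, 0.0, 0.0]])

# torch.multinomial in PyTorch 2.0.1 can, rarely, return an index whose weight is
# zero (https://github.com/pytorch/pytorch/issues/48841), so resample until every
# selected index has a positive probability.
for _ in range(100):
    token = torch.multinomial(scores, 1)
    if (scores.gather(1, token) > 0).all():
        break

print(token.item())  # always 0 or 1, never a zero-probability index
```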
---
 modeling/inference_models/exllama/class.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 2540d3f4..3fb8d252 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -293,7 +293,14 @@ class model_backend(InferenceModel):
 
             scores = torch.softmax(scores, dim=-1)
 
-            token = torch.multinomial(scores, 1)
+            # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841)
+            # With low probability, multinomial can return an element with zero weight. Since this
+            # happens infrequently, just sample repeatedly until all tokens have non-zero probability.
+            for _ in range(100):
+                token = torch.multinomial(scores, 1)
+                # Verify that all selected tokens correspond to positive probabilities.
+                if (scores.gather(1, token) > 0).all():
+                    break
 
             self.generator.gen_accept_token(token)
 
@@ -301,7 +308,7 @@ class model_backend(InferenceModel):
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if token.item() == self.tokenizer.eos_token_id:
+            if (token == self.tokenizer.eos_token_id).any():
                 trim_count = 1
                 break
 

From 08ff7c138c35e344819acbd82fa18e88732e08a4 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sun, 27 Aug 2023 16:34:52 -0700
Subject: [PATCH 4/7] Add the eos token to exllama bad words.

The bos token was already hardcoded as a bad word id. Store badwords
in a list and iterate over them during generation. Add the Llama eos
token to the list of bad words. Also support "single line mode", which
adds the newline token (13) to badwords.
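A rough standalone sketch of the masking described above (the vocabulary size and the bad-word ids are invented for the example; the -10000.0 sentinel and the newline id 13 come from the patch):

```python
import torch

# Toy logits with shape (batch, seq, vocab), as returned by one forward step.
logits = torch.randn(1, 1, 32)

bad_words_ids = [1, 2]   # stand-ins for the bos and eos token ids
single_line = True
newline_tokens = [13]    # Llama's newline token id
if single_line:
    bad_words_ids = list(bad_words_ids) + newline_tokens

# Push every banned token far below the rest so softmax gives it ~zero probability.
for bad_word_id in bad_words_ids:
    logits[:, :, bad_word_id] = -10000.0

probs = torch.softmax(logits[0, -1, :], dim=-1)
print(probs[bad_words_ids])  # effectively zero for all banned ids
```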
---
 modeling/inference_models/exllama/class.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 3fb8d252..737afa88 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -95,6 +95,9 @@ class model_backend(InferenceModel):
             post_token_probs=False,
         )
 
+        # We need to wait until the tokenizer is available to fill this in.
+        self.badwordsids = []
+
     def is_valid(self, model_name, model_path, menu_path):
         gptq_model, _ = load_model_gptq_settings(model_path)
         try:
@@ -119,6 +122,7 @@ class model_backend(InferenceModel):
         self.model = self._get_model(self.get_local_model_path(), {})
         self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        self.badwordsids = [self.tokenizer.bos_token_id, self.tokenizer.eos_token_id]
 
         self.cache = ExLlamaCache(self.model)
 
         self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache)
@@ -207,6 +211,10 @@ class model_backend(InferenceModel):
                 return result
             object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
 
+        # Cache the newline token (for single line mode)
+        # Since there is only one Llama token containing newline, just encode \n
+        self.newline_tokens = self.tokenizer.encode("\n")
+
     def unload(self):
         self.model_config = None
 
@@ -275,6 +283,10 @@ class model_backend(InferenceModel):
         if seed:
             torch.manual_seed(seed)
 
+        bad_words_ids = self.badwordsids
+        if single_line:
+            bad_words_ids = list(bad_words_ids) + self.newline_tokens
+
         if not isinstance(prompt_tokens, torch.Tensor):
             gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
         else:
@@ -285,7 +297,8 @@ class model_backend(InferenceModel):
         trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
-            logits[:, :, self.tokenizer.bos_token_id] = -10000.0
+            for bad_word_id in bad_words_ids:
+                logits[:, :, bad_word_id] = -10000.0
 
             logits = torch.unsqueeze(logits[0, -1, :], 0)
 

From 554af7b1754fa2e574fbbcfa2a612b13969bda63 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sun, 27 Aug 2023 23:56:02 -0700
Subject: [PATCH 5/7] Modify exllama to load unrenamed gptq quantized models

Read config.json and enable exllama loading if the model has a
`quantization_config` with a `quant_method` of `gptq`. Note that this
implementation is limited and only supports model.safetensors. That
said, this supports loading popular gptq quantized models without
renaming or symlinking the model file.
---
 modeling/inference_models/exllama/class.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 737afa88..67f54073 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -49,9 +49,16 @@ def load_model_gptq_settings(path):
 
     gptq_model = False
     gptq_file = False
+    gptq_in_config = False
+
+    try:
+        if js['quantization_config']['quant_method'] == "gptq":
+            gptq_in_config = True
+    except:
+        pass
 
     gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors"))
-    if "gptq_bits" in js:
+    if "gptq_bits" in js or gptq_in_config:
         gptq_model = True
         gptq_file = os.path.join(path, "model.safetensors")
     elif gptq_legacy_files:
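Outside the backend, the detection above can be sketched as a small helper (the function name is illustrative, not the patch's; it assumes a model directory containing config.json, and what the legacy branch returns is a guess since that code is outside the hunk):

```python
import glob
import json
import os

def detect_gptq(path):
    """Return (is_gptq, weights_file) for a model directory, mirroring the logic above."""
    with open(os.path.join(path, "config.json")) as f:
        js = json.load(f)

    # Newer gptq checkpoints advertise themselves via quantization_config in config.json.
    gptq_in_config = False
    try:
        if js["quantization_config"]["quant_method"] == "gptq":
            gptq_in_config = True
    except (KeyError, TypeError):
        pass

    # Older checkpoints were detected by a renamed *4bit*.safetensors file.
    gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors"))

    if "gptq_bits" in js or gptq_in_config:
        return True, os.path.join(path, "model.safetensors")
    if gptq_legacy_files:
        return True, gptq_legacy_files[0]
    return False, None
```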
From 2c48e05f7c2eb053e7a2d196f3b24fd5335492ab Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Mon, 28 Aug 2023 09:52:31 -0700
Subject: [PATCH 6/7] Add exllama dependency back to requirements.

---
 environments/huggingface.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index 004c7ecc..a7e43892 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -54,3 +54,5 @@ dependencies:
     - einops
     - peft==0.3.0
     - scipy
+    - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
+    - exllama==0.0.6

From d6ed75f9938e81072362856ad2649a20aa5f59a5 Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Tue, 29 Aug 2023 23:08:51 -0700
Subject: [PATCH 7/7] Hook up use_default_badwordids in exllama

Use the value of the use_default_badwordids setting to configure
bad_words_ids. Also add the square bracket tokens to bad_words_ids if
the use_default_badwordids setting is True. Fix an issue with
attempting to use the tokenizer too early, and fix an exception
populating Lua bridge data when zero tokens are generated, which can
now happen if use_default_badwordids is False and the first token
generated is EOS.
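The square-bracket filtering mentioned above amounts to scanning the tokenizer vocabulary for any token whose string form contains `[` or `]`; a self-contained sketch with a toy vocabulary (a real backend would enumerate the Llama tokenizer's vocab instead):

```python
# Toy stand-in for a tokenizer vocabulary: list index == token id.
vocab = ["<s>", "</s>", "\n", "Hello", "[", "]", "[INST", "world"]

# Collect every token id whose surface form contains a square bracket.
bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
print(bracket_tokens)  # [4, 5, 6]

# When use_default_badwordids is on, these ids get appended to bad_words_ids.
```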
---
 aiserver.py                                |  3 ++-
 modeling/inference_models/exllama/class.py | 28 +++++++++-------------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 40ff9c5a..429d66c1 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -3918,7 +3918,8 @@ def generate(txt, minimum, maximum, found_entries=None, gen_mode=GenerationMode.
             return
 
     for i in range(koboldai_vars.numseqs):
-        koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(genout[i, -1].item())
+        if len(genout[i]) > 0:
+            koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(genout[i, -1].item())
         koboldai_vars.lua_koboldbridge.outputs[i+1] = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:]))
 
     execute_outmod()
diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 67f54073..a93d3dee 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -102,9 +102,6 @@ class model_backend(InferenceModel):
             post_token_probs=False,
         )
 
-        # We need to wait until the tokenizer is available to fill this in.
-        self.badwordsids = []
-
     def is_valid(self, model_name, model_path, menu_path):
         gptq_model, _ = load_model_gptq_settings(model_path)
         try:
@@ -129,7 +126,6 @@ class model_backend(InferenceModel):
         self.model = self._get_model(self.get_local_model_path(), {})
         self.tokenizer = self._get_tokenizer(self.get_local_model_path())
-        self.badwordsids = [self.tokenizer.bos_token_id, self.tokenizer.eos_token_id]
 
         self.cache = ExLlamaCache(self.model)
 
         self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache)
@@ -221,6 +217,8 @@ class model_backend(InferenceModel):
         # Cache the newline token (for single line mode)
         # Since there is only one Llama token containing newline, just encode \n
         self.newline_tokens = self.tokenizer.encode("\n")
+        self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
+        self.tokenizer._koboldai_header = self.tokenizer.encode("")
 
     def unload(self):
         self.model_config = None
@@ -290,9 +288,12 @@ class model_backend(InferenceModel):
         if seed:
             torch.manual_seed(seed)
 
-        bad_words_ids = self.badwordsids
+        bad_words_ids = [self.tokenizer.bos_token_id]
+        if utils.koboldai_vars.use_default_badwordids:
+            bad_words_ids.append(self.tokenizer.eos_token_id)
+            bad_words_ids.extend(self.bracket_tokens)
         if single_line:
-            bad_words_ids = list(bad_words_ids) + self.newline_tokens
+            bad_words_ids.extend(self.newline_tokens)
 
         if not isinstance(prompt_tokens, torch.Tensor):
             gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
@@ -301,7 +302,6 @@ class model_backend(InferenceModel):
 
         self.generator.gen_begin_reuse(gen_in)
 
-        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             for bad_word_id in bad_words_ids:
@@ -322,16 +322,15 @@ class model_backend(InferenceModel):
                 if (scores.gather(1, token) > 0).all():
                     break
 
+            if (token == self.tokenizer.eos_token_id).any():
+                break
+
             self.generator.gen_accept_token(token)
 
             self._post_token_gen(self.generator.sequence)
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if (token == self.tokenizer.eos_token_id).any():
-                trim_count = 1
-                break
-
             # Apply stoppers
             do_stop = False
             for stopper in self.stopper_hooks:
@@ -341,11 +340,7 @@ class model_backend(InferenceModel):
             if do_stop:
                 break
 
-        utils.koboldai_vars.generated_tkns = max_new - trim_count
-        if trim_count > 0:
-            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
-        else:
-            seq = self.generator.sequence[:, gen_in.size(1):]
+        seq = self.generator.sequence[:, gen_in.size(1):]
 
         return GenerationResult(
             model=self,
@@ -365,7 +360,6 @@ class model_backend(InferenceModel):
 
     def _get_tokenizer(self, location: str):
         tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location))
-        tokenizer._koboldai_header = tokenizer.encode("")
         return tokenizer
 
     def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
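For completeness, the stopper-hook contract used by this backend is just a callable invoked each step as stopper(model, sequence) that returns a truthy value to halt generation; a hypothetical hook (the name and limit are invented for the example) could look like this:

```python
import torch

def stop_after_n_tokens(model, sequence, limit=32):
    # `sequence` is the generator's running token tensor with shape (batch, seq_len);
    # returning True asks the generation loop to break out early.
    return sequence.shape[-1] >= limit

# The backend iterates model.stopper_hooks every step, so registering the hook is
# just an append (sketch; assumes `model` is a loaded exllama backend):
#     model.stopper_hooks.append(stop_after_n_tokens)
print(stop_after_n_tokens(None, torch.zeros(1, 40, dtype=torch.long)))  # True
```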