From 9e51a50bcd6c4e287d4cd27a9c3a8303f8416cf4 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 01:08:53 +0200 Subject: [PATCH 01/10] Llama fixes for Mistral --- modeling/inference_models/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 7e291b93..8cb52d69 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -232,7 +232,7 @@ class HFInferenceModel(InferenceModel): self.model_type = str(self.model_config.model_type) # These are model specific tokenizer overrides if a model has bad defaults - if self.model_type == "llama": + if self.model_type == "llama" or self.model_type == "mistral": # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer self.tokenizer.add_bos_token = False self.tokenizer.legacy = False From 6fdf83aad5f93559f8c332c7c2521e6058f09883 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 01:48:38 +0200 Subject: [PATCH 02/10] Basic Exllama2 --- modeling/inference_models/exllamav2/class.py | 417 +++++++++++++++++++ 1 file changed, 417 insertions(+) create mode 100644 modeling/inference_models/exllamav2/class.py diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py new file mode 100644 index 00000000..95795506 --- /dev/null +++ b/modeling/inference_models/exllamav2/class.py @@ -0,0 +1,417 @@ +from __future__ import annotations +try: + import time, json + import torch + import requests + import numpy as np + from typing import List, Optional, Union + import os + import glob + from pathlib import Path + import re + import warnings + import gc + + import utils + from logger import logger + + from modeling import warpers + from modeling.warpers import Warper + from modeling.stoppers import Stoppers + from modeling.post_token_hooks import PostTokenHooks + from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, + ) + + from modeling.tokenizer import GenericTokenizer + + + from exllamav2.model import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config + from transformers import LlamaTokenizer + from exllamav2.generator import ExLlamaV2StreamingGenerator + load_failed = False +except: + load_failed = True + +model_backend_type = "Exl2" +model_backend_name = "ExLlama V2" + +# When set to true, messages will appear in the console if samplers are not +# changing the scores. Keep in mind some samplers don't always change the +# scores for each token. 
+LOG_SAMPLER_NO_EFFECT = False + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + self.model_config = None + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + self.post_token_hooks = [ + PostTokenHooks.stream_tokens, + ] + + self.stopper_hooks = [ + Stoppers.core_stopper, + Stoppers.dynamic_wi_scanner, + Stoppers.singleline_stopper, + Stoppers.chat_mode_stopper, + Stoppers.stop_sequence_stopper, + ] + + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=True, + stopper_hooks=True, + post_token_probs=False, + ) + self.disable = load_failed + + def is_valid(self, model_name, model_path, menu_path): + try: + self.model_config = self._load_config(model_name, model_path) + #TODO check if model is valid + return True + except: + return False + + def get_local_model_path(self): + return self.path or os.path.join("models", self.model_name.replace("/", "_")) + + def _load_config(self, model_name, model_path): + config = ExLlamaV2Config() + if model_path is not None and os.path.exists(model_path): + config.model_dir = model_path + elif os.path.exists("models/{}".format(model_name.replace('/', '_'))): + config.model_dir = "models/{}".format(model_name.replace('/', '_')) + config.prepare() + + return config + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.model = self._get_model(self.get_local_model_path(), {}) + #TODO support GPU split + self.model.load(None) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + + self.cache = ExLlamaV2Cache(self.model) + + self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer.tokenizer) + + def _post_load(self) -> None: + # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer + self.tokenizer.add_bos_token = False + + # HF transformers no longer supports decode_with_prefix_space + # We work around this by wrapping decode, encode, and __call__ + # with versions that work around the 'prefix space' misfeature + # of sentencepiece. + vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) + has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} + + # Wrap 'decode' with a method that always returns text starting with a space + # when the head token starts with a space. This is what 'decode_with_prefix_space' + # used to do, and we implement it using the same technique (building a cache of + # tokens that should have a prefix space, and then prepending a space if the first + # token is in this set.) We also work around a bizarre behavior in which decoding + # a single token 13 behaves differently than decoding a squence containing only [13]. + original_decode = type(self.tokenizer.tokenizer).decode + def decode_wrapper(self, token_ids, *args, **kwargs): + first = None + # Note, the code below that wraps single-value token_ids in a list + # is to work around this wonky behavior: + # >>> t.decode(13) + # '<0x0A>' + # >>> t.decode([13]) + # '\n' + # Not doing this causes token streaming to receive <0x0A> characters + # instead of newlines. + if isinstance(token_ids, int): + first = token_ids + token_ids = [first] + elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor + # Tensors don't support the Python standard of 'empty is False' + # and the special case of dimension 0 tensors also needs to be + # handled separately. 
+ if token_ids.dim() == 0: + first = int(token_ids.item()) + token_ids = [first] + elif len(token_ids) > 0: + first = int(token_ids[0]) + elif token_ids is not None and len(token_ids) > 0: + first = token_ids[0] + result = original_decode(self, token_ids, *args, **kwargs) + if first is not None and first in has_prefix_space: + result = " " + result + return result + # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it + object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) + + # Wrap encode and __call__ to work around the 'prefix space' misfeature also. + # The problem is that "Bob" at the start of text is encoded as if it is + # " Bob". This creates a problem because it means you can't split text, encode + # the pieces, concatenate the tokens, decode them, and get the original text back. + # The workaround is to prepend a known token that (1) starts with a space; and + # (2) is not the prefix of any other token. After searching through the vocab + # " ," (space comma) is the only token containing only printable ascii characters + # that fits this bill. By prepending ',' to the text, the original encode + # method always returns [1919, ...], where the tail of the sequence is the + # actual encoded result we want without the prefix space behavior. + original_encode = type(self.tokenizer.tokenizer).encode + def encode_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_encode(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_encode(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) + + # Since 'encode' is documented as being deprecated, also override __call__. + # This doesn't appear to currently be used by KoboldAI, but doing so + # in case someone uses it in the future. 
+ original_call = type(self.tokenizer.tokenizer).__call__ + def call_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_call(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_call(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + + # Cache the newline token (for single line mode) + # Since there is only one Llama token containing newline, just encode \n + self.newline_tokens = self.tokenizer.encode("\n") + self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok] + self.tokenizer._koboldai_header = self.tokenizer.encode("") + + def unload(self): + #self.model_config = None # This breaks more than it fixes - Henk + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass + + def _apply_warpers( + self, scores: torch.Tensor, input_ids: torch.Tensor + ) -> torch.Tensor: + warpers.update_settings() + + if LOG_SAMPLER_NO_EFFECT: + pre = torch.Tensor(scores) + + for sid in utils.koboldai_vars.sampler_order: + warper = Warper.from_id(sid) + + if not warper.value_is_valid(): + continue + + if warper == warpers.RepetitionPenalty: + # Rep pen needs more data than other samplers + scores = warper.torch(scores, input_ids=input_ids.cuda()) + else: + scores = warper.torch(scores) + + assert scores is not None, f"Scores are None; warper '{warper}' is to blame" + + if LOG_SAMPLER_NO_EFFECT: + if torch.equal(pre, scores): + logger.info(warper, "had no effect on the scores.") + pre = torch.Tensor(scores) + return scores + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + if seed: + torch.manual_seed(seed) + + bad_words_ids = [self.tokenizer.bos_token_id] + if utils.koboldai_vars.use_default_badwordsids: + bad_words_ids.append(self.tokenizer.eos_token_id) + bad_words_ids.extend(self.bracket_tokens) + if single_line: + bad_words_ids.extend(self.newline_tokens) + + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + self.generator._gen_begin_reuse(gen_in, None) + + for i in range(max_new): + logits = self.model.forward(self.generator.sequence_ids[:, -1:], self.generator.cache) + for bad_word_id in bad_words_ids: + logits[:, :, bad_word_id] = -10000.0 + + logits = torch.unsqueeze(logits[0, -1, :], 0) + + scores = self._apply_warpers(logits, gen_in) + + scores = torch.softmax(scores, dim=-1) + + # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841) + # With low probability, multinomial can return an element with zero weight. Since this + # happens infrequently, just sample repeatedly until all tokens have non-zero probability. 
+ for _ in range(100): + token = torch.multinomial(scores, 1) + # Verify that all selected tokens correspond to positive probabilities. + if (scores.gather(1, token) > 0).all(): + break + + if (token == self.tokenizer.eos_token_id).any(): + break + + if self.generator.sequence_ids is None: + self.generator.sequence_ids = token + else: + self.generator.sequence_ids = torch.cat([self.generator.sequence_ids, token.cpu()], dim=1) + + self._post_token_gen(self.generator.sequence_ids) + + utils.koboldai_vars.generated_tkns += 1 + + # Apply stoppers + do_stop = False + for stopper in self.stopper_hooks: + do_stop = stopper(self, self.generator.sequence_ids) + if do_stop: + break + if do_stop: + break + + seq = self.generator.sequence_ids[:, gen_in.size(1):] + + return GenerationResult( + model=self, + out_batches=np.array(seq,), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + def _get_model(self, location: str, tf_kwargs: Dict): + if not self.model_config: + self.model_config = ExLlamaV2Config() + self.model_config.model_dir = location + self.model_config.prepare() + + # self.model_config.gpu_peer_fix = True + return ExLlamaV2(self.model_config) + + def _get_tokenizer(self, location: str): + tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) + return tokenizer + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = [] + gpu_count = torch.cuda.device_count() + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None + + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "Maximum Context", + "id": "max_ctx", + "min": 2048, + "max": 16384, + "step": 512, + "default": 2048, + "tooltip": "The maximum context size the model supports", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "Embedding Compression", + "id": "compress_emb", + "min": 1, + "max": 8, + "step": 0.25, + "default": 1, + "tooltip": "If the model requires compressed embeddings, set them here", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "NTK alpha", + "id": "ntk_alpha", + "min": 1, + "max": 32, + "step": 0.25, + "default": 1, + "tooltip": "NTK alpha value", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + return requested_parameters + + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + + self.model_config.max_seq_len = parameters["max_ctx"] + self.model_config.compress_pos_emb = parameters["compress_emb"] + self.model_config.alpha_value = parameters["ntk_alpha"] + + # Disable half2 for HIP + self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) + self.model_config.rope_no_half2 = bool(torch.version.hip) + self.model_config.matmul_no_half2 = bool(torch.version.hip) + self.model_config.silu_no_half2 = bool(torch.version.hip) + + # Disable scaled_dot_product_attention if torch version < 2 + if torch.__version__.startswith("1."): + self.model_config.sdp_thd = 0 + + 
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None \ No newline at end of file From e99789a51b01887e2ed5ac300afc33f9b38de7f2 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 03:19:03 +0200 Subject: [PATCH 03/10] Flash Attention (Linux) --- environments/huggingface.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 7f834906..682e8051 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -1,7 +1,7 @@ name: koboldai channels: - pytorch - - nvidia + - nvidia/label/cuda-11.8.0 - conda-forge - defaults dependencies: @@ -13,6 +13,7 @@ dependencies: - pytorch=2.0.* - python=3.8.* - pytorch-cuda=11.8 + - cuda-nvcc=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -59,3 +60,4 @@ dependencies: - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiTRUE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' From 04cc322d7cdc066f12287e2faa550b9d54b7cc9a Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 03:24:01 +0200 Subject: [PATCH 04/10] New dependencies --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 65e68b07..c76044ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,3 +47,6 @@ https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml +flash_attn==2.3.0 +xformers==0.0.21 +exllamav2==0.0.4 \ No newline at end of file From e238a1c9f6775756e7eec5d4bc0a057364a806ab Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 09:09:04 +0200 Subject: [PATCH 05/10] Ship exllamav2 --- environments/huggingface.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 682e8051..265e813f 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -14,6 +14,7 @@ dependencies: - python=3.8.* - pytorch-cuda=11.8 - cuda-nvcc=11.8 + - cuda-libraries-dev=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -57,6 +58,8 @@ dependencies: - scipy - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 From fa8d9e65ffea258e5723f098023826d7f481f5bb Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 09:51:09 +0200 Subject: [PATCH 06/10] Fix flash-attn --- environments/huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 265e813f..3d031194 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -63,4 +63,4 @@ dependencies: 
- windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 - - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiTRUE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' From b141ee015566ad1206e93d995acba9dbc213ad64 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 18:01:34 +0200 Subject: [PATCH 07/10] Universal downloader for more backends --- modeling/inference_models/basic_hf/class.py | 7 +++++++ modeling/inference_models/exllama/class.py | 6 ++++++ modeling/inference_models/exllamav2/class.py | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/modeling/inference_models/basic_hf/class.py b/modeling/inference_models/basic_hf/class.py index afca13ee..5ae2aa0d 100644 --- a/modeling/inference_models/basic_hf/class.py +++ b/modeling/inference_models/basic_hf/class.py @@ -148,6 +148,13 @@ class model_backend(InferenceModel): self.get_local_model_path(ignore_existance=True), ) + if not self.get_local_model_path(): + print(self.get_local_model_path()) + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.init_model_config() self.model = AutoModelForCausalLM.from_pretrained( diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 4539b7a3..f688d611 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -128,6 +128,12 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.model = self._get_model(self.get_local_model_path(), {}) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py index 95795506..15b91c8d 100644 --- a/modeling/inference_models/exllamav2/class.py +++ b/modeling/inference_models/exllamav2/class.py @@ -36,7 +36,7 @@ try: except: load_failed = True -model_backend_type = "Exl2" +model_backend_type = "GPTQ" model_backend_name = "ExLlama V2" # When set to true, messages will appear in the console if samplers are not @@ -99,6 +99,11 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) self.model = self._get_model(self.get_local_model_path(), {}) #TODO support GPU split self.model.load(None) From 02d3b00ff120d7dc8b1bd61e2e3e629c78c69184 Mon Sep 17 00:00:00 2001 From: Henk Date: Wed, 4 Oct 2023 00:18:21 +0200 Subject: 
[PATCH 08/10] HF 4.34 --- environments/huggingface.yml | 8 ++++---- environments/ipex.yml | 6 +++--- environments/rocm.yml | 7 ++++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 3d031194..8bba67be 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -33,9 +33,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -51,8 +51,8 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy diff --git a/environments/ipex.yml b/environments/ipex.yml index 73b6025d..2dd0bb3e 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -33,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - optimum[openvino,nncf,neural-compressor]==1.12.0 - safetensors==0.3.3 @@ -49,8 +49,8 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy diff --git a/environments/rocm.yml b/environments/rocm.yml index 5b56bef7..814a682f 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,9 +29,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -45,4 +45,5 @@ dependencies: - einops - peft==0.3.0 - windows-curses; sys_platform == 'win32' - - 
pynvml \ No newline at end of file + - pynvml + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+rocm5.4.2-cp38-cp38-linux_x86_64.whl \ No newline at end of file From caa040b416c3d3467d99de2a055e707e4af36398 Mon Sep 17 00:00:00 2001 From: Henk Date: Wed, 4 Oct 2023 00:21:32 +0200 Subject: [PATCH 09/10] HF 4.34 part 2 --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index c76044ed..5a19a292 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -transformers[sentencepiece]==4.33.1 +transformers[sentencepiece]==4.34.0 huggingface_hub==0.16.4 -optimum[onnxruntime]==1.12.0 +optimum[onnxruntime]==1.13.2 safetensors==0.3.3 Flask==2.3.3 Flask-SocketIO==5.3.2 @@ -41,10 +41,10 @@ git+https://github.com/0cc4m/hf_bleeding_edge/ einops peft==0.3.0 scipy -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml flash_attn==2.3.0 From 5a3986fb06266b4a84ac7a35e30782c709da6bab Mon Sep 17 00:00:00 2001 From: Disty0 Date: Wed, 4 Oct 2023 01:32:36 +0300 Subject: [PATCH 10/10] IPEX Optimizations --- environments/ipex.yml | 4 ++-- modeling/ipex/__init__.py | 9 +++++++-- modeling/ipex/attention.py | 33 +++++++++++++++++++-------------- modeling/ipex/diffusers.py | 9 +++++---- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 2dd0bb3e..88c1d140 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -23,7 +23,7 @@ dependencies: - Pillow - psutil - pip: - - -f https://developer.intel.com/ipex-whl-stable-xpu + - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.0.1a0; sys_platform == 'linux' - torch==2.0.0a0; sys_platform == 'win32' - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux' @@ -35,7 +35,7 @@ dependencies: - lupa==1.10 - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[openvino,nncf,neural-compressor]==1.12.0 + - optimum[onnxruntime,openvino,nncf,neural-compressor]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra diff --git a/modeling/ipex/__init__.py b/modeling/ipex/__init__.py index 
9ec69012..43accd9f 100644 --- a/modeling/ipex/__init__.py +++ b/modeling/ipex/__init__.py @@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.device = torch.xpu.device torch.cuda.device_count = torch.xpu.device_count torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard torch.cuda.get_device_name = torch.xpu.get_device_name torch.cuda.get_device_properties = torch.xpu.get_device_properties torch.cuda.init = torch.xpu.init @@ -145,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements ipex._C._DeviceProperties.minor = 2 #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -157,6 +156,12 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 + if hasattr(torch.xpu, 'getDeviceIdListForCard'): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card ipex_hijacks() attention_init() diff --git a/modeling/ipex/attention.py b/modeling/ipex/attention.py index e38689f2..84848b6a 100644 --- a/modeling/ipex/attention.py +++ b/modeling/ipex/attention.py @@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] - block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the input_tokens - while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None): else: do_split = False - split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB split_2_slice_size = input_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the input_tokens - while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 
1 @@ -71,13 +73,16 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: shape_one, batch_size_attention, query_tokens, shape_four = query.shape no_shape_one = False - block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + + block_multiply = query.element_size() + slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the shape_one - while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -85,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: do_split = False - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB split_2_slice_size = query_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 diff --git a/modeling/ipex/diffusers.py b/modeling/ipex/diffusers.py index 4c39896e..005ee49f 100644 --- a/modeling/ipex/diffusers.py +++ b/modeling/ipex/diffusers.py @@ -55,13 +55,14 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods ) #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + block_multiply = query.element_size() + slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply + block_size = query_tokens * slice_block_size split_2_slice_size = query_tokens - if block_size >= 4000: + if block_size > 4: do_split_2 = True #Find something divisible with the query_tokens - while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1
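
Note on the IPEX attention changes in PATCH 10/10: the patch replaces the hard-coded dtype multipliers (2.4/1.2, 3.6/1.8) with `input.element_size()` and keeps halving the batch split until a single slice fits under the same numeric threshold it compares against, so that no single allocation exceeds what Intel ARC GPUs can handle in one block. The snippet below is a minimal standalone sketch of that slicing idea applied to a plain batched matmul; the helper name `sliced_bmm` and its structure are illustrative assumptions and not code from the patch, which instead wires this logic directly into its `torch.bmm` and `scaled_dot_product_attention` hijacks.

    import torch

    def sliced_bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # a: (batch, tokens, k), b: (batch, k, cols)
        batch, tokens, _ = a.shape
        cols = b.shape[2]
        # Per-batch-item block size, scaled by the dtype's byte width,
        # mirroring block_multiply = input.element_size() in the patch.
        slice_block_size = tokens * cols / 1024 / 1024 * a.element_size()
        split = batch
        # Halve the batch split until one slice fits the threshold value
        # the patch compares against (here: 4, taken from the patch).
        while split * slice_block_size > 4 and split > 1:
            split //= 2
        if split == batch:
            return torch.bmm(a, b)
        out = torch.empty((batch, tokens, cols), device=a.device, dtype=a.dtype)
        for start in range(0, batch, split):
            end = min(start + split, batch)
            out[start:end] = torch.bmm(a[start:end], b[start:end])
        return out

For example, `sliced_bmm(q, k.transpose(1, 2))` would compute attention scores in batch slices rather than in one oversized allocation, which is the situation the patch's torch_bmm hijack is guarding against.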