Merge ebolam's model-plugins branch

0cc4m
2023-05-28 09:26:13 +02:00
33 changed files with 3503 additions and 1631 deletions


@@ -1,25 +1,230 @@
-import os
+import os, sys
from typing import Optional
# Merge note: HEAD imported AutoConfig from hf_bleeding_edge, while
# ebolam/Model_Plugins imported it from transformers and added warnings
# (used in unload() below); prefer the hf_bleeding_edge import and fall
# back to transformers if it is not installed.
try:
    from hf_bleeding_edge import AutoConfig
except ImportError:
    from transformers import AutoConfig
import warnings
import utils
import json
import koboldai_settings
from logger import logger
from modeling.inference_model import InferenceModel
import torch
import gc
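# Base Hugging Face backend for the model-plugin system: it holds the model,
# tokenizer and config, builds the layer-allocation parameters the web UI
# renders, and applies per-architecture tokenizer fixes after loading.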
class HFInferenceModel(InferenceModel):
-def __init__(self, model_name: str) -> None:
+def __init__(self) -> None:
super().__init__()
self.model_config = None
-self.model_name = model_name
+#self.model_name = model_name
self.model = None
self.tokenizer = None
self.badwordsids = koboldai_settings.badwordsids_default
self.usegpu = False
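# Capability probe: if an AutoConfig can be loaded from the supplied path,
# the local models/ folder, or the Hugging Face hub, this backend can serve
# the model; any exception means it cannot.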
def is_valid(self, model_name, model_path, menu_path):
try:
if model_path is not None and os.path.exists(model_path):
self.model_config = AutoConfig.from_pretrained(model_path)
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
else:
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
return True
except:
return False
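# Returns the list of UI control descriptors the frontend renders for this
# backend: a text box for a custom Hugging Face model name and, when the
# config exposes a layer count, one layer slider per CUDA device plus CPU
# (and optionally Disk).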
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
requested_parameters = []
if not self.hf_torch:
return []
if model_name == 'customhuggingface':
requested_parameters.append({
"uitype": "text",
"unit": "text",
"label": "Huggingface Model Name",
"id": "custom_model_name",
"default": parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else "",
"check": {"value": "", 'check': "!="},
"tooltip": "Model name from https://huggingface.co/",
"menu_path": "",
"refresh_model_inputs": True,
"extra_classes": ""
})
if model_name != 'customhuggingface' or "custom_model_name" in parameters:
model_name = parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else model_name
if model_path is not None and os.path.exists(model_path):
self.model_config = AutoConfig.from_pretrained(model_path)
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
else:
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
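# Hugging Face config classes expose the layer count under different
# attribute names (n_layer, num_layers, num_hidden_layers), so probe them
# in order; GPT-2 is skipped because it has no breakmodel support.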
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
layer_count = None if hasattr(self, "get_model_type") and self.get_model_type() == "gpt2" else layer_count #Skip layers if we're a GPT2 model as it doesn't support breakmodel
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
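# Reuse a previously saved layer split for this model if a settings file
# exists; otherwise default to putting every layer on the first GPU with
# nothing on disk.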
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
break_values = temp['layers'] if 'layers' in temp else [layer_count]
disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
else:
break_values = [layer_count]
disk_blocks = 0
break_values = [int(x) for x in break_values if x != '' and x is not None]
gpu_count = torch.cuda.device_count()
break_values += [0] * (gpu_count - len(break_values))
if disk_blocks is not None:
break_values += [int(disk_blocks)]
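# Each descriptor below becomes one UI control; the shared "check"
# constraint requires the GPU/CPU/Disk sliders to sum to layer_count.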
requested_parameters.append({
"uitype": "Valid Display",
"unit": "text",
"label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value
"id": "valid_layers",
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
for i in range(gpu_count):
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "{} Layers".format(torch.cuda.get_device_name(i)),
"id": "{}_Layers".format(i),
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": break_values[i],
"tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "CPU Layers",
"id": "CPU_Layers",
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": layer_count - sum(break_values),
"tooltip": "The number of layers to put on the CPU. This will use your system RAM. It will also do inference partially on CPU. Use if you must.",
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
if disk_blocks is not None:
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "Disk Layers",
"id": "Disk_Layers",
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": disk_blocks,
"tooltip": "The number of layers to put on the disk. This will use your hard drive. The is VERY slow in comparison to GPU or CPU. Use as a last resort.",
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
else:
requested_parameters.append({
"uitype": "toggle",
"unit": "bool",
"label": "Use GPU",
"id": "use_gpu",
"default": True,
"tooltip": "Whether or not to use the GPU",
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
return requested_parameters
def set_input_parameters(self, parameters):
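# `parameters` is the dict the UI sends back, keyed by the ids declared in
# get_requested_parameters ("0_Layers", "CPU_Layers", "Disk_Layers",
# "use_gpu", ...) plus menu fields such as "id" and "path"; slider values
# may arrive as strings. Illustrative shape only (hypothetical values):
#   {"0_Layers": "28", "CPU_Layers": "4", "Disk_Layers": 0,
#    "id": "customhuggingface", "custom_model_name": "gpt2", "path": None}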
if self.hf_torch and hasattr(self, "get_model_type") and self.get_model_type() != "gpt2":
import breakmodel
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
gpu_count = torch.cuda.device_count()
layers = []
for i in range(gpu_count):
if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
layers.append(int(parameters["{}_Layers".format(i)]))
elif isinstance(parameters["{}_Layers".format(i)], str):
layers.append(None)
else:
layers.append(parameters["{}_Layers".format(i)])
self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
if isinstance(self.cpu_layers, str):
self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
self.layers = layers
self.disk_layers = parameters['Disk_Layers'] if 'Disk_Layers' in parameters else 0
if isinstance(self.disk_layers, str):
self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
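# Hand the parsed split to the breakmodel module; usegpu is only set when
# every layer lands on GPU 0 with nothing on CPU or disk.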
breakmodel.gpu_blocks = layers
breakmodel.disk_blocks = self.disk_layers
self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
self.model_type = self.get_model_type()
self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
self.lazy_load = True
logger.debug("Model type: {}".format(self.model_type))
else:
logger.debug("Disabling breakmodel and lazyload")
self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
self.breakmodel = False
self.lazy_load = False
logger.info(parameters)
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
self.path = parameters['path'] if 'path' in parameters else None
def unload(self):
if hasattr(self, 'model'):
self.model = None
if hasattr(self, 'tokenizer'):
self.tokenizer = None
if hasattr(self, 'model_config'):
self.model_config = None
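# Aggressively release GPU memory: walk every object the garbage collector
# tracks and resize any torch tensor to empty so its storage can actually be
# freed before torch.cuda.empty_cache() runs; the warnings filter hides the
# torch.distributed.reduce_op deprecation notice this sweep can trigger.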
with torch.no_grad():
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
for tensor in gc.get_objects():
try:
if torch.is_tensor(tensor):
tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
except:
pass
gc.collect()
try:
with torch.no_grad():
torch.cuda.empty_cache()
except:
pass
def _post_load(self) -> None:
self.badwordsids = koboldai_settings.badwordsids_default
self.model_type = str(self.model_config.model_type)
# These are model specific tokenizer overrides if a model has bad defaults
if utils.koboldai_vars.model_type == "llama":
if self.model_type == "llama":
# Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
self.tokenizer.add_bos_token = False
@@ -59,7 +264,7 @@ class HFInferenceModel(InferenceModel):
token_ids = [first]
elif len(token_ids) > 0:
first = int(token_ids[0])
-elif token_ids:
+elif token_ids is not None and len(token_ids) > 0:
first = token_ids[0]
result = original_decode(self, token_ids, *args, **kwargs)
if first is not None and first in has_prefix_space:
@@ -103,32 +308,32 @@ class HFInferenceModel(InferenceModel):
return result
object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
elif utils.koboldai_vars.model_type == "opt":
elif self.model_type == "opt":
self.tokenizer._koboldai_header = self.tokenizer.encode("")
self.tokenizer.add_bos_token = False
self.tokenizer.add_prefix_space = False
# Change newline behavior to match model quirks
if utils.koboldai_vars.model_type == "xglm":
if self.model_type == "xglm":
# Default to </s> newline mode if using XGLM
utils.koboldai_vars.newlinemode = "s"
elif utils.koboldai_vars.model_type in ["opt", "bloom"]:
elif self.model_type in ["opt", "bloom"]:
# Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
utils.koboldai_vars.newlinemode = "ns"
# Clean up tokens that cause issues
if (
-utils.koboldai_vars.badwordsids == koboldai_settings.badwordsids_default
-and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
+self.badwordsids == koboldai_settings.badwordsids_default
+and self.model_type not in ("gpt2", "gpt_neo", "gptj")
):
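# Rebuild the ban list from the vocabulary so models whose tokenizers differ
# from GPT-2/Neo/J still block every token containing a square bracket.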
-utils.koboldai_vars.badwordsids = [
+self.badwordsids = [
[v]
for k, v in self.tokenizer.get_vocab().items()
if any(c in str(k) for c in "[]")
]
if utils.koboldai_vars.newlinemode == "n":
-utils.koboldai_vars.badwordsids.append([self.tokenizer.eos_token_id])
+self.badwordsids.append([self.tokenizer.eos_token_id])
return super()._post_load()
@@ -139,9 +344,12 @@ class HFInferenceModel(InferenceModel):
Returns a string of the model's path locally, or None if it is not downloaded.
If ignore_existance is true, it will always return a path.
"""
+if self.path is not None:
+    if os.path.exists(self.path):
+        return self.path
if self.model_name in ["NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]:
-model_path = utils.koboldai_vars.custmodpth
+model_path = self.path
assert model_path
# Path can be absolute or relative to models directory
@@ -158,7 +366,7 @@ class HFInferenceModel(InferenceModel):
return model_path
basename = utils.koboldai_vars.model.replace("/", "_")
basename = self.model_name.replace("/", "_")
if legacy:
ret = basename
else:
@@ -176,24 +384,25 @@ class HFInferenceModel(InferenceModel):
revision=utils.koboldai_vars.revision,
cache_dir="cache",
)
-utils.koboldai_vars.model_type = self.model_config.model_type
+self.model_type = self.model_config.model_type
if "gptq_bits" in dir(self.model_config):
-utils.koboldai_vars.gptq_model = True
-utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
-utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
-utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
-utils.koboldai_vars.gptq_file = None
+self.gptq_model = True
+self.gptq_bits = self.model_config.gptq_bits
+self.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
+self.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
+self.gptq_file = None
else:
-utils.koboldai_vars.gptq_model = False
+self.gptq_model = False
except ValueError:
-utils.koboldai_vars.model_type = {
+self.model_type = {
"NeoCustom": "gpt_neo",
"GPT2Custom": "gpt2",
-}.get(utils.koboldai_vars.model)
+}.get(self.model)
-if not utils.koboldai_vars.model_type:
+if not self.model_type:
logger.warning(
"No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
)
utils.koboldai_vars.model_type = "gpt_neo"
self.model_type = "gpt_neo"