Pull upstream changes, fix conflicts
@@ -1,15 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass

import gc
import os
import time
import bisect
import zipfile
import functools
import itertools
import traceback
import contextlib
from tqdm.auto import tqdm
from torch import nn
from typing import Dict, List, Optional, Union

import torch
@@ -39,19 +37,52 @@ from modeling.inference_model import (
    use_core_manipulations,
)

try:
    import breakmodel
    import accelerate.utils
except ModuleNotFoundError as e:
    if not utils.koboldai_vars.use_colab_tpu:
        raise e

# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False


class BreakmodelConfig:
    def __init__(self) -> None:
        self.disk_blocks = 0
        self.gpu_blocks = []

    @property
    def primary_device(self):
        if utils.args.cpu:
            return "cpu"
        elif not sum(self.gpu_blocks):
            # No blocks are on GPU
            return "cpu"
        elif torch.cuda.device_count() <= 0:
            return "cpu"

        for device_index, blocks in enumerate(self.gpu_blocks):
            if blocks:
                return device_index
        return 0

    def get_device_map(self, model: nn.Module) -> dict:
        ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
        device_map = {}

        for name in utils.layers_module_names:
            layer = int(name.rsplit(".", 1)[1])
            device = (
                ("disk" if layer < self.disk_blocks else "cpu")
                if layer < ram_blocks
                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
            )
            device_map[name] = device

        for name in utils.get_missing_module_names(model, list(device_map.keys())):
            device_map[name] = self.primary_device

        return device_map

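
# Editorial note (not part of the upstream file): "breakmodel" spreads a
# model's transformer layers across devices, so a checkpoint that does not fit
# on one GPU can be split over several GPUs, CPU RAM and an on-disk cache. As
# an illustrative sketch, for a 9-layer model with disk_blocks=2 and
# gpu_blocks=[4, 2], get_device_map() assigns roughly:
#   layers 0-1 -> "disk", layer 2 -> "cpu",
#   layers 3-6 -> GPU 0,  layers 7-8 -> GPU 1,
# and every module not in utils.layers_module_names (embeddings, final norm,
# ...) falls back to primary_device. Exact module names vary by architecture.
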
class HFTorchInferenceModel(HFInferenceModel):
    def __init__(self) -> None:
        super().__init__()
@@ -79,6 +110,29 @@ class HFTorchInferenceModel(HFInferenceModel):
            post_token_probs=True,
        )
        self._old_stopping_criteria = None
        self.breakmodel_config = BreakmodelConfig()

    def set_input_parameters(self, parameters):
        ret = super().set_input_parameters(parameters)

        # Hook onto input param setting for setting breakmodel stuff
        if self.breakmodel:
            self.breakmodel_config.gpu_blocks = self.layers
            self.breakmodel_config.disk_blocks = self.disk_layers

        return ret

    def get_auxilary_device(self) -> Union[str, int, torch.device]:
        return self.breakmodel_config.primary_device

    def _get_target_dtype(self) -> Union[torch.float16, torch.float32]:
        if self.breakmodel_config.primary_device == "cpu":
            return torch.float32
        elif utils.args.cpu:
            return torch.float32
        elif not self.usegpu and not self.breakmodel:
            return torch.float32
        return torch.float16
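
    # Editorial note: half precision is only used when the weights will live on
    # a GPU; pure CPU runs stay in float32, since fp16 matrix ops on CPU are
    # poorly supported and would save no VRAM anyway.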

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
@@ -125,19 +179,7 @@ class HFTorchInferenceModel(HFInferenceModel):
        else:
            return "Unknown"

    def get_auxilary_device(self):
        """Get the device that auxiliary tensors (like inputs) should be stored on."""

        # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
        if utils.koboldai_vars.hascuda and self.usegpu:
            return utils.koboldai_vars.gpu_device
        elif utils.koboldai_vars.hascuda and self.breakmodel:
            import breakmodel
            return breakmodel.primary_device
        return "cpu"

    def _post_load(m_self) -> None:

        if not utils.koboldai_vars.model_type:
            utils.koboldai_vars.model_type = m_self.get_model_type()

@@ -220,6 +262,40 @@ class HFTorchInferenceModel(HFInferenceModel):
        new_sample.old_sample = transformers.GenerationMixin.sample
        use_core_manipulations.sample = new_sample

        # PEFT Loading. This MUST be done after all save_pretrained calls are
        # finished on the main model.
        if utils.args.peft:
            from peft import PeftModel, PeftConfig
            local_peft_dir = os.path.join(m_self.get_local_model_path(), "peft")

            # Make PEFT dir if it doesn't exist
            try:
                os.makedirs(local_peft_dir)
            except FileExistsError:
                pass

            peft_local_path = os.path.join(local_peft_dir, utils.args.peft.replace("/", "_"))
            logger.debug(f"Loading PEFT '{utils.args.peft}', possible local path is '{peft_local_path}'.")

            peft_installed_locally = True
            possible_peft_locations = [peft_local_path, utils.args.peft]

            for i, location in enumerate(possible_peft_locations):
                try:
                    m_self.model = PeftModel.from_pretrained(m_self.model, location)
                    logger.debug(f"Loaded PEFT at '{location}'")
                    break
                except ValueError:
                    peft_installed_locally = False
                    if i == len(possible_peft_locations) - 1:
                        raise RuntimeError(f"Unable to load PeftModel for given name '{utils.args.peft}'. Does it exist?")
                except RuntimeError:
                    raise RuntimeError("Error while loading PeftModel. Are you using the correct model?")

            if not peft_installed_locally:
                logger.debug(f"PEFT not saved to models folder; saving to '{peft_local_path}'")
                m_self.model.save_pretrained(peft_local_path)
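
            # Editorial note: the adapter is looked up first under the model's
            # local "peft" folder (with "/" in its name replaced by "_") and
            # then as a Hugging Face Hub ID; a Hub-loaded adapter is saved back
            # to that folder so later runs can load it locally.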

        return super()._post_load()

    def _raw_generate(
@@ -236,9 +312,11 @@ class HFTorchInferenceModel(HFInferenceModel):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        else:
            gen_in = prompt_tokens

        if not self.usegpu and not self.breakmodel:
            gen_in = gen_in.to("cpu")
        else:
            device = self.get_auxilary_device()
            gen_in = gen_in.to(device)

        additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []

@@ -254,8 +332,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                    len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
                ),
                repetition_penalty=1.0,
                bad_words_ids=self.badwordsids + additional_bad_words_ids,
                use_cache=True,
                num_return_sequences=batch_count,
            )
@@ -275,6 +352,9 @@ class HFTorchInferenceModel(HFInferenceModel):
        tf_kwargs["revision"] = utils.koboldai_vars.revision
        tf_kwargs["cache_dir"] = "cache"

        if self.lazy_load:
            tf_kwargs.pop("low_cpu_mem_usage", None)
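            # Editorial note: presumably dropped because the lazy loader (and
            # the device_map set up below) already controls how tensors are
            # materialized, so low_cpu_mem_usage would be redundant here.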

        # If we have model hints for legacy model, use them rather than fall back.
        try:
            if self.model_name == "GPT2Custom":
@@ -283,10 +363,63 @@ class HFTorchInferenceModel(HFInferenceModel):
                return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)
        except Exception as e:
            logger.warning(f"{self.model_name} is a no-go; {e} - Falling back to auto.")
            if utils.args.panic:
                raise

        # Try to determine model type from either AutoModel or falling back to legacy
        try:
            if self.lazy_load:
                with lazy_loader.use_lazy_load(dematerialized_modules=True):
                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
                    if utils.args.cpu:
                        cpu_map = {name: "cpu" for name in utils.layers_module_names}
                        for name in utils.get_missing_module_names(
                            metamodel, list(cpu_map.keys())
                        ):
                            cpu_map[name] = "cpu"
                        tf_kwargs["device_map"] = cpu_map
                    else:
                        tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(
                            metamodel
                        )

            try:
                # Try to load with the lazyloader first...
                with lazy_loader.use_lazy_load(
                    enable=self.lazy_load,
                    # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
                    dematerialized_modules=False,
                ):
                    model = AutoModelForCausalLM.from_pretrained(
                        location,
                        offload_folder="accelerate-disk-cache",
                        torch_dtype=self._get_target_dtype(),
                        **tf_kwargs,
                    )
            except Exception as e:
                # ...but fall back to stock HF if lazyloader fails.
                if utils.args.panic:
                    raise
                logger.error("Lazyloader failed, falling back to stock HF load. You may run out of RAM here. Details:")
                logger.error(e)
                logger.error(traceback.format_exc())
                logger.info("Falling back to stock HF load...")

                model = AutoModelForCausalLM.from_pretrained(
                    location,
                    offload_folder="accelerate-disk-cache",
                    torch_dtype=self._get_target_dtype(),
                    **tf_kwargs,
                )

            if not self.lazy_load and not self.breakmodel:
                # We need to move the model to the desired device
                if (not self.usegpu) or torch.cuda.device_count() <= 0:
                    model = model.to("cpu")
                else:
                    model = model.to("cuda")

            return model
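
            # Editorial note: the overall fallback chain for loading is legacy
            # model class -> lazy-loaded AutoModel -> stock AutoModel ->
            # GPT2LMHeadModel (handled further below).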
        except Exception as e:
            traceback_string = traceback.format_exc().lower()

@@ -300,6 +433,9 @@ class HFTorchInferenceModel(HFInferenceModel):
                logger.error("Invalid load key! Aborting.")
                raise

            if utils.args.panic:
                raise

            logger.warning(f"Fell back to GPT2LMHeadModel due to {e}")
            logger.debug(traceback.format_exc())

@@ -325,49 +461,6 @@ class HFTorchInferenceModel(HFInferenceModel):

        return True

    def _move_to_devices(self) -> None:
        for key, value in self.model.state_dict().items():
            target_dtype = (
                torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
            )
            if value.dtype is not target_dtype:
                accelerate.utils.set_module_tensor_to_device(
                    self.model,
                    tensor_name=key,
                    device=torch.device(value.device),
                    value=value,
                    dtype=target_dtype,
                )

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
        device_map = {}

        for name in utils.layers_module_names:
            layer = int(name.rsplit(".", 1)[1])
            device = (
                ("disk" if layer < disk_blocks else "cpu")
                if layer < ram_blocks
                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
            )
            device_map[name] = device

        for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
            device_map[name] = breakmodel.primary_device

        breakmodel.dispatch_model_ex(
            self.model,
            device_map,
            main_device=breakmodel.primary_device,
            offload_buffers=True,
            offload_dir="accelerate-disk-cache",
        )

        gc.collect()
        return

    # Function to patch transformers to use our soft prompt
    def patch_embedding(self) -> None:
        if getattr(Embedding, "_koboldai_patch_causallm_model", None):
@@ -409,404 +502,20 @@ class HFTorchInferenceModel(HFInferenceModel):
        Embedding.__call__ = new_embedding_call
        Embedding._koboldai_patch_causallm_model = self.model

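    # Legacy lazy-load path: the callback below materializes checkpoint tensors
    # one at a time (zip-based torch checkpoints or safetensors) and places each
    # on a GPU, in pinned "shared" CPU memory, or in the accelerate disk cache
    # according to the breakmodel block counts.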
    def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
        if not self.lazy_load:
            return

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

        def lazy_load_callback(
            model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]],
            f,
            is_safetensors: bool = False,
            **_,
        ):
            if lazy_load_callback.nested:
                return
            lazy_load_callback.nested = True

            device_map: Dict[str, Union[str, int]] = {}

            @functools.lru_cache(maxsize=None)
            def get_original_key(key) -> Optional[str]:
                try:
                    key_candidates = [
                        original_key
                        for original_key in utils.module_names
                        if original_key.endswith(key)
                    ]
                except ValueError:
                    return key

                if not key_candidates:
                    logger.debug(f"!!! No key candidates for {key}")
                    return None

                return max(key_candidates, key=len)

            for key, value in model_dict.items():
                original_key = get_original_key(key)

                if not original_key:
                    continue

                if isinstance(value, lazy_loader.LazyTensor) and not any(
                    original_key.startswith(n) for n in utils.layers_module_names
                ):
                    device_map[key] = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and self.usegpu
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not self.breakmodel
                        else breakmodel.primary_device
                    )
                else:
                    layer = int(
                        max(
                            (
                                n
                                for n in utils.layers_module_names
                                if original_key.startswith(n)
                            ),
                            key=len,
                        ).rsplit(".", 1)[1]
                    )
                    device = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and self.usegpu
                        else "disk"
                        if layer < disk_blocks and layer < ram_blocks
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not self.breakmodel
                        else "shared"
                        if layer < ram_blocks
                        else bisect.bisect_right(
                            cumulative_gpu_blocks, layer - ram_blocks
                        )
                    )
                    device_map[key] = device

            if utils.num_shards is None or utils.current_shard == 0:
                utils.offload_index = {}
                if os.path.isdir("accelerate-disk-cache"):
                    # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
                    # (the folder doesn't contain any subfolders so os.remove will do just fine)
                    for filename in os.listdir("accelerate-disk-cache"):
                        try:
                            os.remove(os.path.join("accelerate-disk-cache", filename))
                        except OSError:
                            pass
                os.makedirs("accelerate-disk-cache", exist_ok=True)
                if utils.num_shards is not None:
                    num_tensors = len(
                        utils.get_sharded_checkpoint_num_tensors(
                            utils.from_pretrained_model_name,
                            utils.from_pretrained_index_filename,
                            is_safetensors=is_safetensors,
                            **utils.from_pretrained_kwargs,
                        )
                    )
                else:
                    num_tensors = len(device_map)
                print(flush=True)
                utils.koboldai_vars.status_message = "Loading model"
                utils.koboldai_vars.total_layers = num_tensors
                utils.koboldai_vars.loaded_layers = 0
                utils.bar = tqdm(
                    total=num_tensors,
                    desc="Loading model tensors",
                    file=utils.UIProgressBarFile(),
                    position=1
                )

            if not is_safetensors:
                # Torch lazyload
                with zipfile.ZipFile(f, "r") as z:
                    try:
                        last_storage_key = None
                        zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
                        f = None
                        current_offset = 0
                        able_to_pin_layers = True
                        if utils.num_shards is not None:
                            utils.current_shard += 1
                        for key in sorted(
                            device_map.keys(),
                            key=lambda k: (
                                model_dict[k].key,
                                model_dict[k].seek_offset,
                            ),
                        ):
                            storage_key = model_dict[key].key
                            if (
                                storage_key != last_storage_key
                                or model_dict[key].seek_offset < current_offset
                            ):
                                last_storage_key = storage_key
                                if isinstance(f, zipfile.ZipExtFile):
                                    f.close()
                                ziproot = z.namelist()[0].split("/")[0]
                                f = z.open(f"{ziproot}/data/{storage_key}")

                                current_offset = 0
                            if current_offset != model_dict[key].seek_offset:
                                f.read(model_dict[key].seek_offset - current_offset)
                                current_offset = model_dict[key].seek_offset
                            device = device_map[key]
                            size = functools.reduce(
                                lambda x, y: x * y, model_dict[key].shape, 1
                            )
                            dtype = model_dict[key].dtype
                            nbytes = (
                                size
                                if dtype is torch.bool
                                else size
                                * (
                                    (
                                        torch.finfo
                                        if dtype.is_floating_point
                                        else torch.iinfo
                                    )(dtype).bits
                                    >> 3
                                )
                            )
                            # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                            #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
                            model_dict[key] = model_dict[key].materialize(
                                f, map_location="cpu"
                            )
                            if model_dict[key].dtype is torch.float32:
                                utils.koboldai_vars.fp32_model = True
                            if (
                                convert_to_float16
                                and breakmodel.primary_device != "cpu"
                                and utils.koboldai_vars.hascuda
                                and (
                                    self.breakmodel
                                    or self.usegpu
                                )
                                and model_dict[key].dtype is torch.float32
                            ):
                                model_dict[key] = model_dict[key].to(torch.float16)
                            if breakmodel.primary_device == "cpu" or (
                                not self.usegpu
                                and not self.breakmodel
                                and model_dict[key].dtype is torch.float16
                            ):
                                model_dict[key] = model_dict[key].to(torch.float32)
                            if device == "shared":
                                model_dict[key] = model_dict[key].to("cpu").detach_()
                                if able_to_pin_layers:
                                    try:
                                        model_dict[key] = model_dict[key].pin_memory()
                                    except:
                                        able_to_pin_layers = False
                            elif device == "disk":
                                accelerate.utils.offload_weight(
                                    model_dict[key],
                                    get_original_key(key),
                                    "accelerate-disk-cache",
                                    index=utils.offload_index,
                                )
                                model_dict[key] = model_dict[key].to("meta")
                            else:
                                model_dict[key] = model_dict[key].to(device)
                            # print("OK", flush=True)
                            current_offset += nbytes
                            utils.bar.update(1)
                            utils.koboldai_vars.loaded_layers += 1
                    finally:
                        if (
                            utils.num_shards is None
                            or utils.current_shard >= utils.num_shards
                        ):
                            if utils.offload_index:
                                for name, tensor in utils.named_buffers:
                                    dtype = tensor.dtype
                                    if (
                                        convert_to_float16
                                        and breakmodel.primary_device != "cpu"
                                        and utils.koboldai_vars.hascuda
                                        and (
                                            self.breakmodel
                                            or self.usegpu
                                        )
                                    ):
                                        dtype = torch.float16
                                    if breakmodel.primary_device == "cpu" or (
                                        not self.usegpu
                                        and not self.breakmodel
                                    ):
                                        dtype = torch.float32
                                    if (
                                        name in model_dict
                                        and model_dict[name].dtype is not dtype
                                    ):
                                        model_dict[name] = model_dict[name].to(dtype)
                                    if tensor.dtype is not dtype:
                                        tensor = tensor.to(dtype)
                                    if name not in utils.offload_index:
                                        accelerate.utils.offload_weight(
                                            tensor,
                                            name,
                                            "accelerate-disk-cache",
                                            index=utils.offload_index,
                                        )
                                accelerate.utils.save_offload_index(
                                    utils.offload_index, "accelerate-disk-cache"
                                )
                            utils.bar.close()
                            utils.bar = None
                            utils.koboldai_vars.status_message = ""
                        lazy_load_callback.nested = False
                        if isinstance(f, zipfile.ZipExtFile):
                            f.close()
            else:
                # Loading with safetensors
                try:
                    able_to_pin_layers = True

                    if utils.num_shards is not None:
                        utils.current_shard += 1

                    for key in sorted(
                        device_map.keys(),
                        key=lambda k: model_dict[k].key,
                    ):
                        storage_key = model_dict[key].key

                        device = device_map[key]

                        # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)

                        model_dict[key] = model_dict[key].materialize(
                            f, map_location="cpu"
                        )

                        if model_dict[key].dtype is torch.float32:
                            utils.koboldai_vars.fp32_model = True

                        if (
                            convert_to_float16
                            and breakmodel.primary_device != "cpu"
                            and utils.koboldai_vars.hascuda
                            and (
                                self.breakmodel
                                or self.usegpu
                            )
                            and model_dict[key].dtype is torch.float32
                        ):
                            model_dict[key] = model_dict[key].to(torch.float16)

                        if breakmodel.primary_device == "cpu" or (
                            not self.usegpu
                            and not self.breakmodel
                            and model_dict[key].dtype is torch.float16
                        ):
                            model_dict[key] = model_dict[key].to(torch.float32)

                        if device == "shared":
                            model_dict[key] = model_dict[key].to("cpu").detach_()
                            if able_to_pin_layers:
                                try:
                                    model_dict[key] = model_dict[key].pin_memory()
                                except:
                                    able_to_pin_layers = False
                        elif device == "disk":
                            accelerate.utils.offload_weight(
                                model_dict[key],
                                get_original_key(key),
                                "accelerate-disk-cache",
                                index=utils.offload_index,
                            )
                            model_dict[key] = model_dict[key].to("meta")
                        else:
                            model_dict[key] = model_dict[key].to(device)

                        utils.bar.update(1)
                        utils.koboldai_vars.loaded_layers += 1

                finally:
                    if (
                        utils.num_shards is None
                        or utils.current_shard >= utils.num_shards
                    ):
                        if utils.offload_index:
                            for name, tensor in utils.named_buffers:
                                dtype = tensor.dtype
                                if (
                                    convert_to_float16
                                    and breakmodel.primary_device != "cpu"
                                    and utils.koboldai_vars.hascuda
                                    and (
                                        self.breakmodel
                                        or self.usegpu
                                    )
                                ):
                                    dtype = torch.float16
                                if breakmodel.primary_device == "cpu" or (
                                    not self.usegpu
                                    and not self.breakmodel
                                ):
                                    dtype = torch.float32
                                if (
                                    name in model_dict
                                    and model_dict[name].dtype is not dtype
                                ):
                                    model_dict[name] = model_dict[name].to(dtype)
                                if tensor.dtype is not dtype:
                                    tensor = tensor.to(dtype)
                                if name not in utils.offload_index:
                                    accelerate.utils.offload_weight(
                                        tensor,
                                        name,
                                        "accelerate-disk-cache",
                                        index=utils.offload_index,
                                    )
                            accelerate.utils.save_offload_index(
                                utils.offload_index, "accelerate-disk-cache"
                            )
                        utils.bar.close()
                        utils.bar = None
                        utils.koboldai_vars.status_message = ""

                    lazy_load_callback.nested = False

        lazy_load_callback.nested = False
        return lazy_load_callback

    @contextlib.contextmanager
    def _maybe_use_float16(self, always_use: bool = False):
        if always_use or (
            utils.koboldai_vars.hascuda
            and self.low_mem
            and (self.usegpu or self.breakmodel)
        ):
            original_dtype = torch.get_default_dtype()
            torch.set_default_dtype(torch.float16)
            yield True
            torch.set_default_dtype(original_dtype)
        else:
            yield False
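
    # Usage sketch (editorial, not from the upstream file): wrap weight
    # creation so tensors default to fp16 when low_mem is set and a GPU will
    # hold them, e.g.:
    #     with self._maybe_use_float16():
    #         model = AutoModelForCausalLM.from_pretrained(location, **tf_kwargs)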

    def breakmodel_device_list(self, n_layers, primary=None, selected=None):
        # TODO: Find a better place for this or rework this

        device_count = torch.cuda.device_count()
        if device_count < 2:
            primary = None

        logger.debug("n_layers: {}".format(n_layers))
        logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks))

        gpu_blocks = self.breakmodel_config.gpu_blocks + (
            device_count - len(self.breakmodel_config.gpu_blocks)
        ) * [0]

        print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")

        for i in range(device_count):
            name = torch.cuda.get_device_name(i)
            if len(name) > 47:
@@ -816,72 +525,83 @@ class HFTorchInferenceModel(HFInferenceModel):
            print(
                f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
            )

        row_color = Colors.END
        sep_color = Colors.YELLOW
        print(
            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
        )
        print(
            f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
        )

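    # Example output (editorial sketch; GPU names and block counts will vary):
    #        DEVICE ID  |  LAYERS  |  DEVICE NAME
    #  -> (primary)   0 |  24 |  NVIDIA GeForce RTX 3090
    #                 1 |   8 |  NVIDIA GeForce GTX 1080 Ti
    #               N/A |   0 |  (Disk cache)
    #               N/A |   0 |  (CPU)
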
    def breakmodel_device_config(self, config):
        # TODO: Find a better place for this or rework this

        global breakmodel, generator
        import breakmodel

        n_layers = utils.num_layers(config)

        logger.debug(
            "gpu blocks before modification: {}".format(
                self.breakmodel_config.gpu_blocks
            )
        )

        if utils.args.cpu:
            self.breakmodel_config.gpu_blocks = [0] * n_layers
            return

        elif self.breakmodel_config.gpu_blocks == []:
            logger.info("Breakmodel not specified, assuming GPU 0")
            self.breakmodel_config.gpu_blocks = [n_layers]
            n_layers = 0

        else:
            s = n_layers
            for i in range(len(self.breakmodel_config.gpu_blocks)):
                if self.breakmodel_config.gpu_blocks[i] <= -1:
                    self.breakmodel_config.gpu_blocks[i] = s
                    break
                else:
                    s -= self.breakmodel_config.gpu_blocks[i]
            assert sum(self.breakmodel_config.gpu_blocks) <= n_layers
            n_layers -= sum(self.breakmodel_config.gpu_blocks)
            if self.breakmodel_config.disk_blocks is not None:
                assert self.breakmodel_config.disk_blocks <= n_layers
                n_layers -= self.breakmodel_config.disk_blocks

        logger.init_ok("Final device configuration:", status="Info")
        self.breakmodel_device_list(
            n_layers, primary=self.breakmodel_config.primary_device
        )
        with open(
            "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
        ) as file:
            file.write(
                "{}\n{}".format(
                    ",".join(map(str, self.breakmodel_config.gpu_blocks)),
                    self.breakmodel_config.disk_blocks,
                )
            )

        # If all layers are on the same device, use the old GPU generation mode
        while (
            len(self.breakmodel_config.gpu_blocks)
            and self.breakmodel_config.gpu_blocks[-1] == 0
        ):
            self.breakmodel_config.gpu_blocks.pop()
        self.breakmodel = True
        if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[
            -1
        ] in (
            -1,
            utils.num_layers(config),
        ):
            logger.debug("All layers on same GPU. Breakmodel disabled")
            self.breakmodel = False
            self.usegpu = True
            utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1
            return

        if not self.breakmodel_config.gpu_blocks:
            logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
            import breakmodel

            breakmodel.primary_device = "cpu"
            self.breakmodel = False
            self.usegpu = False
            return