from __future__ import annotations

import gc
import os
import time
import bisect
import zipfile
import functools
import itertools
import traceback
import contextlib
from tqdm.auto import tqdm
from typing import Dict, List, Optional, Union

import torch
from torch.nn import Embedding
import transformers
from transformers import (
    StoppingCriteria,
    GPTNeoForCausalLM,
    GPT2LMHeadModel,
    AutoModelForCausalLM,
    LogitsProcessorList,
)

import utils
import modeling.lazy_loader as lazy_loader
from logger import logger, Colors

from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    ModelCapabilities,
    use_core_manipulations,
)

try:
    import breakmodel
    import accelerate.utils
except ModuleNotFoundError as e:
    if not utils.koboldai_vars.use_colab_tpu:
        raise e

# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False


class HFTorchInferenceModel(HFInferenceModel):
    def __init__(
        self,
        model_name: str,
        lazy_load: bool,
        low_mem: bool,
    ) -> None:
        super().__init__(model_name)
        self.lazy_load = lazy_load
        self.low_mem = low_mem

        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
        ]

        self.stopper_hooks = [
            Stoppers.core_stopper,
            Stoppers.dynamic_wi_scanner,
            Stoppers.singleline_stopper,
            Stoppers.chat_mode_stopper,
            Stoppers.stop_sequence_stopper,
        ]

        self.capabilties = ModelCapabilities(
            embedding_manipulation=True,
            post_token_hooks=True,
            stopper_hooks=True,
            post_token_probs=True,
        )
        self._old_stopping_criteria = None

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
    ) -> torch.Tensor:
        warpers.update_settings()

        if LOG_SAMPLER_NO_EFFECT:
            pre = torch.Tensor(scores)

        for sid in utils.koboldai_vars.sampler_order:
            warper = Warper.from_id(sid)

            if not warper.value_is_valid():
                continue

            if warper == warpers.RepetitionPenalty:
                # Rep pen needs more data than other samplers
                scores = warper.torch(scores, input_ids=input_ids)
            else:
                scores = warper.torch(scores)

            assert scores is not None, f"Scores are None; warper '{warper}' is to blame"

            if LOG_SAMPLER_NO_EFFECT:
                if torch.equal(pre, scores):
                    logger.info(warper, "had no effect on the scores.")
                pre = torch.Tensor(scores)
        return scores

    def get_model_type(self) -> str:
        if not self.model_config:
            return "Read Only"

        if not isinstance(self.model_config, dict):
            return str(self.model_config.model_type)

        model_type = self.model_config.get("model_type")

        if model_type:
            return model_type

        if utils.koboldai_vars.mode.endswith("gpt2"):
            return "gpt2"
        else:
            return "Unknown"

    def _post_load(m_self) -> None:
        if not utils.koboldai_vars.model_type:
            utils.koboldai_vars.model_type = m_self.get_model_type()

        # Patch stopping_criteria
        class PTHStopper(StoppingCriteria):
            def __call__(
                hf_self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
            ) -> None:
                m_self._post_token_gen(input_ids)

                for stopper in m_self.stopper_hooks:
                    do_stop = stopper(m_self, input_ids)
                    if do_stop:
                        return True
                return False

        old_gsc = transformers.GenerationMixin._get_stopping_criteria

        def _get_stopping_criteria(
            hf_self,
            *args,
            **kwargs,
        ):
            stopping_criteria = old_gsc(hf_self, *args, **kwargs)
            stopping_criteria.insert(0, PTHStopper())
            return stopping_criteria

        use_core_manipulations.get_stopping_criteria = _get_stopping_criteria

        # Patch logitswarpers

        def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList:
            processors = new_get_logits_processor.old_get_logits_processor(
                *args, **kwargs
            )
            return processors

        use_core_manipulations.get_logits_processor = new_get_logits_processor
        new_get_logits_processor.old_get_logits_processor = (
            transformers.GenerationMixin._get_logits_processor
        )

        class KoboldLogitsWarperList(LogitsProcessorList):
            def __init__(self):
                pass

            def __call__(
                lw_self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
                *args,
                **kwargs,
            ):
                scores = m_self._apply_warpers(scores=scores, input_ids=input_ids)

                for processor in m_self.logits_processors:
                    scores = processor(m_self, scores=scores, input_ids=input_ids)
                    assert (
                        scores is not None
                    ), f"Scores are None; processor '{processor}' is to blame"
                return scores

        def new_get_logits_warper(
            beams: int = 1,
        ) -> LogitsProcessorList:
            return KoboldLogitsWarperList()

        def new_sample(self, *args, **kwargs):
            assert kwargs.pop("logits_warper", None) is not None
            kwargs["logits_warper"] = new_get_logits_warper(
                beams=1,
            )
            if utils.koboldai_vars.newlinemode in ["s", "ns"]:
                kwargs["eos_token_id"] = -1
                kwargs.setdefault("pad_token_id", 2)
            return new_sample.old_sample(self, *args, **kwargs)

        new_sample.old_sample = transformers.GenerationMixin.sample
        use_core_manipulations.sample = new_sample

        # PEFT Loading. This MUST be done after all save_pretrained calls are
        # finished on the main model.
        if utils.args.peft:
            from peft import PeftModel, PeftConfig
            local_peft_dir = os.path.join(m_self.get_local_model_path(), "peft")

            # Make PEFT dir if it doesn't exist
            try:
                os.makedirs(local_peft_dir)
            except FileExistsError:
                pass

            peft_local_path = os.path.join(local_peft_dir, utils.args.peft.replace("/", "_"))
            logger.debug(f"Loading PEFT '{utils.args.peft}', possible local path is '{peft_local_path}'.")

            peft_installed_locally = True
            possible_peft_locations = [peft_local_path, utils.args.peft]

            for i, location in enumerate(possible_peft_locations):
                try:
                    m_self.model = PeftModel.from_pretrained(m_self.model, location)
                    logger.debug(f"Loaded PEFT at '{location}'")
                    break
                except ValueError:
                    peft_installed_locally = False
                    if i == len(possible_peft_locations) - 1:
                        raise RuntimeError(f"Unable to load PeftModel for given name '{utils.args.peft}'. Does it exist?")
                except RuntimeError:
                    raise RuntimeError("Error while loading PeftModel. Are you using the correct model?")

            if not peft_installed_locally:
                logger.debug(f"PEFT not saved to models folder; saving to '{peft_local_path}'")
                m_self.model.save_pretrained(peft_local_path)

        return super()._post_load()

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:
        if not isinstance(prompt_tokens, torch.Tensor):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        else:
            gen_in = prompt_tokens

        device = utils.get_auxilary_device()
        gen_in = gen_in.to(device)

        additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []

        if seed is not None:
            torch.manual_seed(seed)

        with torch.no_grad():
            start_time = time.time()

            # HEED & BEWARE: All arguments passed to self.model.generate MUST be
            # kwargs; see https://github.com/huggingface/peft/issues/232. If they
            # aren't, PeftModel will EXPLODE!!!! But nothing will happen without
            # a PEFT loaded so it's sneaky.
            genout = self.model.generate(
                input_ids=gen_in,
                do_sample=True,
                max_length=min(
                    len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
                ),
                repetition_penalty=1.0,
                bad_words_ids=utils.koboldai_vars.badwordsids
                + additional_bad_words_ids,
                use_cache=True,
                num_return_sequences=batch_count,
            )
        logger.debug(
            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
        )

        return GenerationResult(
            self,
            out_batches=genout,
            prompt=prompt_tokens,
            is_whole_generation=False,
            output_includes_prompt=True,
        )

    def _get_model(self, location: str, tf_kwargs: Dict):
        tf_kwargs["revision"] = utils.koboldai_vars.revision
        tf_kwargs["cache_dir"] = "cache"
        tf_kwargs["trust_remote_code"] = utils.koboldai_vars.trust_remote_code

        # If we have model hints for legacy model, use them rather than fall back.
        try:
            if self.model_name == "GPT2Custom":
                return GPT2LMHeadModel.from_pretrained(location, **tf_kwargs)
            elif self.model_name == "NeoCustom":
                return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)
        except Exception as e:
            logger.warning(f"{self.model_name} is a no-go; {e} - Falling back to auto.")

        # Try to determine model type from either AutoModel or falling back to legacy
        try:
            return AutoModelForCausalLM.from_pretrained(location, **tf_kwargs)
        except Exception as e:
            traceback_string = traceback.format_exc().lower()

            if "out of memory" in traceback_string:
                raise RuntimeError(
                    "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                )

            # Model corrupted or serious loading problem. Stop here.
            if "invalid load key" in traceback_string:
                logger.error("Invalid load key! Aborting.")
                raise

            logger.warning(f"Fell back to GPT2LMHeadModel due to {e}")
            try:
                return GPT2LMHeadModel.from_pretrained(location, **tf_kwargs)
            except Exception as e:
                logger.warning(f"Fell back to GPTNeoForCausalLM due to {e}")
                return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)

    def get_hidden_size(self) -> int:
        return self.model.get_input_embeddings().embedding_dim

    def _will_load_with_safetensors(self) -> bool:
        path = self.get_local_model_path()

        # TODO: This might mess up download to run
        if not path:
            return False

        if not os.path.exists(os.path.join(path, "model.safetensors")):
            return False

        return True

    def _move_to_devices(self) -> None:
        for key, value in self.model.state_dict().items():
            target_dtype = (
                torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
            )
            if value.dtype is not target_dtype:
                accelerate.utils.set_module_tensor_to_device(
                    self.model,
                    tensor_name=key,
                    device=torch.device(value.device),
                    value=value,
                    dtype=target_dtype,
                )

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
        device_map = {}

        for name in utils.layers_module_names:
            layer = int(name.rsplit(".", 1)[1])
            device = (
                ("disk" if layer < disk_blocks else "cpu")
                if layer < ram_blocks
                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
            )
            device_map[name] = device

        for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
            device_map[name] = breakmodel.primary_device

        breakmodel.dispatch_model_ex(
            self.model,
            device_map,
            main_device=breakmodel.primary_device,
            offload_buffers=True,
            offload_dir="accelerate-disk-cache",
        )

        gc.collect()
        return

    # Function to patch transformers to use our soft prompt
    def patch_embedding(self) -> None:
        if getattr(Embedding, "_koboldai_patch_causallm_model", None):
            Embedding._koboldai_patch_causallm_model = self.model
            return

        old_embedding_call = Embedding.__call__

        kai_model = self

        def new_embedding_call(self, input_ids, *args, **kwargs):
            # Don't touch embeddings for models other than the core inference model (that's us!)
            if (
                Embedding._koboldai_patch_causallm_model.get_input_embeddings()
                is not self
            ):
                return old_embedding_call(self, input_ids, *args, **kwargs)

            assert input_ids is not None

            if utils.koboldai_vars.sp is not None:
                shifted_input_ids = input_ids - kai_model.model.config.vocab_size

            input_ids.clamp_(max=kai_model.model.config.vocab_size - 1)
            inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs)

            if utils.koboldai_vars.sp is not None:
                utils.koboldai_vars.sp = utils.koboldai_vars.sp.to(
                    inputs_embeds.dtype
                ).to(inputs_embeds.device)
                inputs_embeds = torch.where(
                    (shifted_input_ids >= 0)[..., None],
                    utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)],
                    inputs_embeds,
                )

            return inputs_embeds

        Embedding.__call__ = new_embedding_call
        Embedding._koboldai_patch_causallm_model = self.model

    def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
        if not self.lazy_load:
            return

        if utils.args.breakmodel_disklayers is not None:
            breakmodel.disk_blocks = utils.args.breakmodel_disklayers

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

        def lazy_load_callback(
            model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]],
            f,
            is_safetensors: bool = False,
            **_,
        ):
            if lazy_load_callback.nested:
                return
            lazy_load_callback.nested = True

            device_map: Dict[str, Union[str, int]] = {}

            @functools.lru_cache(maxsize=None)
            def get_original_key(key):
                return max(
                    (
                        original_key
                        for original_key in utils.module_names
                        if original_key.endswith(key)
                    ),
                    key=len,
                )

            for key, value in model_dict.items():
                original_key = get_original_key(key)

                if isinstance(value, lazy_loader.LazyTensor) and not any(
                    original_key.startswith(n) for n in utils.layers_module_names
                ):
                    device_map[key] = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not utils.koboldai_vars.breakmodel
                        else breakmodel.primary_device
                    )
                else:
                    layer = int(
                        max(
                            (
                                n
                                for n in utils.layers_module_names
                                if original_key.startswith(n)
                            ),
                            key=len,
                        ).rsplit(".", 1)[1]
                    )
                    device = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
                        else "disk"
                        if layer < disk_blocks and layer < ram_blocks
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not utils.koboldai_vars.breakmodel
                        else "shared"
                        if layer < ram_blocks
                        else bisect.bisect_right(
                            cumulative_gpu_blocks, layer - ram_blocks
                        )
                    )
                    device_map[key] = device

            if utils.num_shards is None or utils.current_shard == 0:
                utils.offload_index = {}
                if os.path.isdir("accelerate-disk-cache"):
                    # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
                    # (the folder doesn't contain any subfolders so os.remove will do just fine)
                    for filename in os.listdir("accelerate-disk-cache"):
                        try:
                            os.remove(os.path.join("accelerate-disk-cache", filename))
                        except OSError:
                            pass
                os.makedirs("accelerate-disk-cache", exist_ok=True)
                if utils.num_shards is not None:
                    num_tensors = len(
                        utils.get_sharded_checkpoint_num_tensors(
                            utils.from_pretrained_model_name,
                            utils.from_pretrained_index_filename,
                            is_safetensors=is_safetensors,
                            **utils.from_pretrained_kwargs,
                        )
                    )
                else:
                    num_tensors = len(device_map)
                print(flush=True)
                utils.koboldai_vars.status_message = "Loading model"
                utils.koboldai_vars.total_layers = num_tensors
                utils.koboldai_vars.loaded_layers = 0
                utils.bar = tqdm(
                    total=num_tensors,
                    desc="Loading model tensors",
                    file=utils.UIProgressBarFile(),
                    position=1
                )

            if not is_safetensors:
                # Torch lazyload
                with zipfile.ZipFile(f, "r") as z:
                    try:
                        last_storage_key = None
                        zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
                        f = None
                        current_offset = 0
                        able_to_pin_layers = True
                        if utils.num_shards is not None:
                            utils.current_shard += 1
                        for key in sorted(
                            device_map.keys(),
                            key=lambda k: (
                                model_dict[k].key,
                                model_dict[k].seek_offset,
                            ),
                        ):
                            storage_key = model_dict[key].key
                            if (
                                storage_key != last_storage_key
                                or model_dict[key].seek_offset < current_offset
                            ):
                                last_storage_key = storage_key
                                if isinstance(f, zipfile.ZipExtFile):
                                    f.close()
                                try:
                                    f = z.open(f"archive/data/{storage_key}")
                                except:
                                    f = z.open(f"{zipfolder}/data/{storage_key}")
                                current_offset = 0
                            if current_offset != model_dict[key].seek_offset:
                                f.read(model_dict[key].seek_offset - current_offset)
                                current_offset = model_dict[key].seek_offset
                            device = device_map[key]
                            size = functools.reduce(
                                lambda x, y: x * y, model_dict[key].shape, 1
                            )
                            dtype = model_dict[key].dtype
                            nbytes = (
                                size
                                if dtype is torch.bool
                                else size
                                * (
                                    (
                                        torch.finfo
                                        if dtype.is_floating_point
                                        else torch.iinfo
                                    )(dtype).bits
                                    >> 3
                                )
                            )
                            # print(f"Transferring <{key}>  to  {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                            model_dict[key] = model_dict[key].materialize(
                                f, map_location="cpu"
                            )
                            if model_dict[key].dtype is torch.float32:
                                utils.koboldai_vars.fp32_model = True
                            if (
                                convert_to_float16
                                and breakmodel.primary_device != "cpu"
                                and utils.koboldai_vars.hascuda
                                and (
                                    utils.koboldai_vars.breakmodel
                                    or utils.koboldai_vars.usegpu
                                )
                                and model_dict[key].dtype is torch.float32
                            ):
                                model_dict[key] = model_dict[key].to(torch.float16)
                            if breakmodel.primary_device == "cpu" or (
                                not utils.koboldai_vars.usegpu
                                and not utils.koboldai_vars.breakmodel
                                and model_dict[key].dtype is torch.float16
                            ):
                                model_dict[key] = model_dict[key].to(torch.float32)
                            if device == "shared":
                                model_dict[key] = model_dict[key].to("cpu").detach_()
                                if able_to_pin_layers:
                                    try:
                                        model_dict[key] = model_dict[key].pin_memory()
                                    except:
                                        able_to_pin_layers = False
                            elif device == "disk":
                                accelerate.utils.offload_weight(
                                    model_dict[key],
                                    get_original_key(key),
                                    "accelerate-disk-cache",
                                    index=utils.offload_index,
                                )
                                model_dict[key] = model_dict[key].to("meta")
                            else:
                                model_dict[key] = model_dict[key].to(device)
                            # print("OK", flush=True)
                            current_offset += nbytes
                            utils.bar.update(1)
                            utils.koboldai_vars.loaded_layers += 1
                    finally:
                        if (
                            utils.num_shards is None
                            or utils.current_shard >= utils.num_shards
                        ):
                            if utils.offload_index:
                                for name, tensor in utils.named_buffers:
                                    dtype = tensor.dtype
                                    if (
                                        convert_to_float16
                                        and breakmodel.primary_device != "cpu"
                                        and utils.koboldai_vars.hascuda
                                        and (
                                            utils.koboldai_vars.breakmodel
                                            or utils.koboldai_vars.usegpu
                                        )
                                    ):
                                        dtype = torch.float16
                                    if breakmodel.primary_device == "cpu" or (
                                        not utils.koboldai_vars.usegpu
                                        and not utils.koboldai_vars.breakmodel
                                    ):
                                        dtype = torch.float32
                                    if (
                                        name in model_dict
                                        and model_dict[name].dtype is not dtype
                                    ):
                                        model_dict[name] = model_dict[name].to(dtype)
                                    if tensor.dtype is not dtype:
                                        tensor = tensor.to(dtype)
                                    if name not in utils.offload_index:
                                        accelerate.utils.offload_weight(
                                            tensor,
                                            name,
                                            "accelerate-disk-cache",
                                            index=utils.offload_index,
                                        )
                                accelerate.utils.save_offload_index(
                                    utils.offload_index, "accelerate-disk-cache"
                                )
                            utils.bar.close()
                            utils.bar = None
                            utils.koboldai_vars.status_message = ""
                        lazy_load_callback.nested = False
                        if isinstance(f, zipfile.ZipExtFile):
                            f.close()
            else:
                # Loading with safetensors
                try:
                    able_to_pin_layers = True

                    if utils.num_shards is not None:
                        utils.current_shard += 1

                    for key in sorted(
                        device_map.keys(),
                        key=lambda k: model_dict[k].key,
                    ):
                        storage_key = model_dict[key].key

                        device = device_map[key]

                        # print(f"Transferring <{key}>  to  {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)

                        model_dict[key] = model_dict[key].materialize(
                            f, map_location="cpu"
                        )

                        if model_dict[key].dtype is torch.float32:
                            utils.koboldai_vars.fp32_model = True

                        if (
                            convert_to_float16
                            and breakmodel.primary_device != "cpu"
                            and utils.koboldai_vars.hascuda
                            and (
                                utils.koboldai_vars.breakmodel
                                or utils.koboldai_vars.usegpu
                            )
                            and model_dict[key].dtype is torch.float32
                        ):
                            model_dict[key] = model_dict[key].to(torch.float16)

                        if breakmodel.primary_device == "cpu" or (
                            not utils.koboldai_vars.usegpu
                            and not utils.koboldai_vars.breakmodel
                            and model_dict[key].dtype is torch.float16
                        ):
                            model_dict[key] = model_dict[key].to(torch.float32)

                        if device == "shared":
                            model_dict[key] = model_dict[key].to("cpu").detach_()
                            if able_to_pin_layers:
                                try:
                                    model_dict[key] = model_dict[key].pin_memory()
                                except:
                                    able_to_pin_layers = False
                        elif device == "disk":
                            accelerate.utils.offload_weight(
                                model_dict[key],
                                get_original_key(key),
                                "accelerate-disk-cache",
                                index=utils.offload_index,
                            )
                            model_dict[key] = model_dict[key].to("meta")
                        else:
                            model_dict[key] = model_dict[key].to(device)

                        utils.bar.update(1)
                        utils.koboldai_vars.loaded_layers += 1

                finally:
                    if (
                        utils.num_shards is None
                        or utils.current_shard >= utils.num_shards
                    ):
                        if utils.offload_index:
                            for name, tensor in utils.named_buffers:
                                dtype = tensor.dtype
                                if (
                                    convert_to_float16
                                    and breakmodel.primary_device != "cpu"
                                    and utils.koboldai_vars.hascuda
                                    and (
                                        utils.koboldai_vars.breakmodel
                                        or utils.koboldai_vars.usegpu
                                    )
                                ):
                                    dtype = torch.float16
                                if breakmodel.primary_device == "cpu" or (
                                    not utils.koboldai_vars.usegpu
                                    and not utils.koboldai_vars.breakmodel
                                ):
                                    dtype = torch.float32
                                if (
                                    name in model_dict
                                    and model_dict[name].dtype is not dtype
                                ):
                                    model_dict[name] = model_dict[name].to(dtype)
                                if tensor.dtype is not dtype:
                                    tensor = tensor.to(dtype)
                                if name not in utils.offload_index:
                                    accelerate.utils.offload_weight(
                                        tensor,
                                        name,
                                        "accelerate-disk-cache",
                                        index=utils.offload_index,
                                    )
                            accelerate.utils.save_offload_index(
                                utils.offload_index, "accelerate-disk-cache"
                            )
                        utils.bar.close()
                        utils.bar = None
                        utils.koboldai_vars.status_message = ""

                    lazy_load_callback.nested = False

        lazy_load_callback.nested = False
        return lazy_load_callback

    @contextlib.contextmanager
    def _maybe_use_float16(self, always_use: bool = False):
        if always_use or (
            utils.koboldai_vars.hascuda
            and self.low_mem
            and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)
        ):
            original_dtype = torch.get_default_dtype()
            torch.set_default_dtype(torch.float16)
            yield True
            torch.set_default_dtype(original_dtype)
        else:
            yield False

    def breakmodel_device_list(self, n_layers, primary=None, selected=None):
        # TODO: Find a better place for this or rework this

        device_count = torch.cuda.device_count()
        if device_count < 2:
            primary = None
        gpu_blocks = breakmodel.gpu_blocks + (
            device_count - len(breakmodel.gpu_blocks)
        ) * [0]
        print(f"{Colors.YELLOW}       DEVICE ID  |  LAYERS  |  DEVICE NAME{Colors.END}")
        for i in range(device_count):
            name = torch.cuda.get_device_name(i)
            if len(name) > 47:
                name = "..." + name[-44:]
            row_color = Colors.END
            sep_color = Colors.YELLOW
            print(
                f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else '  '} {'(primary)' if i == primary else ' '*9} {i:3}  {sep_color}|{row_color}     {gpu_blocks[i]:3}  {sep_color}|{row_color}  {name}{Colors.END}"
            )
        row_color = Colors.END
        sep_color = Colors.YELLOW
        print(
            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else '  '} {' '*9} N/A  {sep_color}|{row_color}     {breakmodel.disk_blocks:3}  {sep_color}|{row_color}  (Disk cache){Colors.END}"
        )
        print(
            f"{row_color}   {' '*9} N/A  {sep_color}|{row_color}     {n_layers:3}  {sep_color}|{row_color}  (CPU){Colors.END}"
        )

    def breakmodel_device_config(self, config):
        # TODO: Find a better place for this or rework this

        global breakmodel, generator
        import breakmodel

        n_layers = utils.num_layers(config)

        if utils.args.cpu:
            breakmodel.gpu_blocks = [0] * n_layers
            return

        elif (
            utils.args.breakmodel_gpulayers is not None
            or utils.args.breakmodel_disklayers is not None
        ):
            try:
                if not utils.args.breakmodel_gpulayers:
                    breakmodel.gpu_blocks = []
                else:
                    breakmodel.gpu_blocks = list(
                        map(int, utils.args.breakmodel_gpulayers.split(","))
                    )
                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
                s = n_layers
                for i in range(len(breakmodel.gpu_blocks)):
                    if breakmodel.gpu_blocks[i] <= -1:
                        breakmodel.gpu_blocks[i] = s
                        break
                    else:
                        s -= breakmodel.gpu_blocks[i]
                assert sum(breakmodel.gpu_blocks) <= n_layers
                n_layers -= sum(breakmodel.gpu_blocks)
                if utils.args.breakmodel_disklayers is not None:
                    assert utils.args.breakmodel_disklayers <= n_layers
                    breakmodel.disk_blocks = utils.args.breakmodel_disklayers
                    n_layers -= utils.args.breakmodel_disklayers
            except:
                logger.warning(
                    "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
                )
                breakmodel.gpu_blocks = [n_layers]
                n_layers = 0
        elif utils.args.breakmodel_layers is not None:
            breakmodel.gpu_blocks = [
                n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
            ]
            n_layers -= sum(breakmodel.gpu_blocks)
        elif utils.args.model is not None:
            logger.info("Breakmodel not specified, assuming GPU 0")
            breakmodel.gpu_blocks = [n_layers]
            n_layers = 0
        else:
            device_count = torch.cuda.device_count()
            if device_count > 1:
                print(
                    Colors.CYAN
                    + "\nPlease select one of your GPUs to be your primary GPU."
                )
                print(
                    "VRAM usage in your primary GPU will be higher than for your other ones."
                )
                print("It is recommended you make your fastest GPU your primary GPU.")
                self.breakmodel_device_list(n_layers)
                while True:
                    primaryselect = input("device ID> ")
                    if (
                        primaryselect.isnumeric()
                        and 0 <= int(primaryselect) < device_count
                    ):
                        breakmodel.primary_device = int(primaryselect)
                        break
                    else:
                        print(
                            f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
                        )
            else:
                breakmodel.primary_device = 0

            print(
                Colors.PURPLE
                + "\nIf you don't have enough VRAM to run the model on a single GPU"
            )
            print(
                "you can split the model between your CPU and your GPU(s), or between"
            )
            print("multiple GPUs if you have more than one.")
            print("By putting more 'layers' on a GPU or CPU, more computations will be")
            print(
                "done on that device and more VRAM or RAM will be required on that device"
            )
            print("(roughly proportional to number of layers).")
            print(
                "It should be noted that GPUs are orders of magnitude faster than the CPU."
            )
            print(
                f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
            )

            for i in range(device_count):
                self.breakmodel_device_list(
                    n_layers, primary=breakmodel.primary_device, selected=i
                )
                print(
                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
                )
                while True:
                    layerselect = input("# of layers> ")
                    if (
                        layerselect.isnumeric() or layerselect.strip() == "-1"
                    ) and -1 <= int(layerselect) <= n_layers:
                        layerselect = int(layerselect)
                        layerselect = n_layers if layerselect == -1 else layerselect
                        breakmodel.gpu_blocks.append(layerselect)
                        n_layers -= layerselect
                        break
                    else:
                        print(
                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
                        )
                if n_layers == 0:
                    break

            if n_layers > 0:
                self.breakmodel_device_list(
                    n_layers, primary=breakmodel.primary_device, selected=-1
                )
                print(
                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
                )
                while True:
                    layerselect = input("# of layers> ")
                    if (
                        layerselect.isnumeric() or layerselect.strip() == "-1"
                    ) and -1 <= int(layerselect) <= n_layers:
                        layerselect = int(layerselect)
                        layerselect = n_layers if layerselect == -1 else layerselect
                        breakmodel.disk_blocks = layerselect
                        n_layers -= layerselect
                        break
                    else:
                        print(
                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
                        )

        logger.init_ok("Final device configuration:", status="Info")
        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)

        # If all layers are on the same device, use the old GPU generation mode
        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
            breakmodel.gpu_blocks.pop()
        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
            -1,
            utils.num_layers(config),
        ):
            utils.koboldai_vars.breakmodel = False
            utils.koboldai_vars.usegpu = True
            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
            return

        if not breakmodel.gpu_blocks:
            logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
            import breakmodel

            breakmodel.primary_device = "cpu"
            utils.koboldai_vars.breakmodel = False
            utils.koboldai_vars.usegpu = False
            return