Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Hello it's breaking breakmodel time
@@ -9,6 +9,7 @@ import functools
 import itertools
 import traceback
 import contextlib
+from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model
 from tqdm.auto import tqdm
 from typing import Dict, List, Optional, Union

@@ -40,7 +41,6 @@ from modeling.inference_model import (
 )

 try:
     import breakmodel
     import accelerate.utils
 except ModuleNotFoundError as e:
     if not utils.koboldai_vars.use_colab_tpu:

@@ -125,17 +125,6 @@ class HFTorchInferenceModel(HFInferenceModel):
         else:
             return "Unknown"
-
-    def get_auxilary_device(self):
-        """Get device auxilary tensors like inputs should be stored on."""
-
-        # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
-        if utils.koboldai_vars.hascuda and self.usegpu:
-            return utils.koboldai_vars.gpu_device
-        elif utils.koboldai_vars.hascuda and self.breakmodel:
-            import breakmodel
-            return breakmodel.primary_device
-        return "cpu"

     def _post_load(m_self) -> None:

         if not utils.koboldai_vars.model_type:

@@ -237,7 +226,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         else:
             gen_in = prompt_tokens

-        device = self.get_auxilary_device()
+        device = utils.get_auxilary_device()
         gen_in = gen_in.to(device)

         additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []

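The two hunks above delete the class-level `get_auxilary_device` helper and point the generation path at `utils.get_auxilary_device()` instead. The utils side of that move is not part of this diff; as a rough, self-contained sketch, the device-selection rule the deleted method implemented looks like the following (the parameter names below are assumptions standing in for the old `self.usegpu` / `self.breakmodel` attributes and the `koboldai_vars` flags):

```python
# Hypothetical, self-contained sketch of the device-selection rule from the
# deleted get_auxilary_device method; parameter names are assumptions.
def get_auxilary_device(hascuda: bool, usegpu: bool, breakmodel: bool,
                        gpu_device="cuda:0", primary_device=0):
    """Pick the device that auxiliary tensors such as inputs should live on."""
    # NOTE: TPU isn't a torch device, so TPU runs keep auxiliary tensors on CPU.
    if hascuda and usegpu:
        return gpu_device
    if hascuda and breakmodel:
        return primary_device
    return "cpu"


# Full-GPU mode -> "cuda:0"; breakmodel mode -> the primary breakmodel device; otherwise CPU.
print(get_auxilary_device(hascuda=True, usegpu=False, breakmodel=True))  # -> 0
```
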
@@ -254,8 +243,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                 len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
             ),
             repetition_penalty=1.0,
-            bad_words_ids=self.badwordsids
-            + additional_bad_words_ids,
+            bad_words_ids=self.badwordsids + additional_bad_words_ids,
             use_cache=True,
             num_return_sequences=batch_count,
         )

@@ -286,7 +274,27 @@ class HFTorchInferenceModel(HFInferenceModel):

         # Try to determine model type from either AutoModel or falling back to legacy
         try:
-            return AutoModelForCausalLM.from_pretrained(location, **tf_kwargs)
+            model = AutoModelForCausalLM.from_config(self.model_config)
+
+            # load_checkpoint_in_model(
+            #     model.model,
+            #     location,
+            #     device_map=device_map
+            #     offload_folder="accelerate-disk-cache",
+            #     dtype="float16",
+            #     offload_state_dict=True
+            # )
+            # model.tie_weights()
+
+            device_map = infer_auto_device_map(
+                model,
+                max_memory={0: "10GiB", 1: "7GiB", "cpu": "15GiB"},
+                no_split_module_classes=["GPTJBlock"],
+            )
+
+            return AutoModelForCausalLM.from_pretrained(
+                location, device_map=device_map
+            )  # , **tf_kwargs)
         except Exception as e:
             traceback_string = traceback.format_exc().lower()

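The replacement code above instantiates the model from its config, asks accelerate to infer a per-module device map under fixed memory budgets, and then reloads the checkpoint with that map. A minimal, self-contained sketch of the same accelerate pattern follows; the checkpoint name, memory budgets and dtype are placeholders rather than values from this commit, and the skeleton is built under `init_empty_weights` (an extra step this commit does not use) so no real weights are allocated while the map is computed:

```python
# Sketch of the infer_auto_device_map -> from_pretrained(device_map=...) pattern
# used in the hunk above. Checkpoint name and memory budgets are placeholders.
import torch
from accelerate import init_empty_weights
from accelerate.utils.modeling import infer_auto_device_map
from transformers import AutoConfig, AutoModelForCausalLM

location = "EleutherAI/gpt-j-6b"  # placeholder checkpoint
config = AutoConfig.from_pretrained(location)

# Build a weightless skeleton so module sizes can be measured cheaply.
with init_empty_weights():
    skeleton = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(
    skeleton,
    max_memory={0: "10GiB", "cpu": "15GiB"},   # per-device budgets (placeholders)
    no_split_module_classes=["GPTJBlock"],     # keep each transformer block on one device
)

# transformers + accelerate then load and place the real weights per the map.
model = AutoModelForCausalLM.from_pretrained(
    location, device_map=device_map, torch_dtype=torch.float16
)
```
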
@@ -325,49 +333,6 @@ class HFTorchInferenceModel(HFInferenceModel):

         return True

-    def _move_to_devices(self) -> None:
-        for key, value in self.model.state_dict().items():
-            target_dtype = (
-                torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
-            )
-            if value.dtype is not target_dtype:
-                accelerate.utils.set_module_tensor_to_device(
-                    self.model,
-                    tensor_name=key,
-                    device=torch.device(value.device),
-                    value=value,
-                    dtype=target_dtype,
-                )
-
-        disk_blocks = breakmodel.disk_blocks
-        gpu_blocks = breakmodel.gpu_blocks
-        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
-        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
-        device_map = {}
-
-        for name in utils.layers_module_names:
-            layer = int(name.rsplit(".", 1)[1])
-            device = (
-                ("disk" if layer < disk_blocks else "cpu")
-                if layer < ram_blocks
-                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
-            )
-            device_map[name] = device
-
-        for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
-            device_map[name] = breakmodel.primary_device
-
-        breakmodel.dispatch_model_ex(
-            self.model,
-            device_map,
-            main_device=breakmodel.primary_device,
-            offload_buffers=True,
-            offload_dir="accelerate-disk-cache",
-        )
-
-        gc.collect()
-        return
-
     # Function to patch transformers to use our soft prompt
     def patch_embedding(self) -> None:
         if getattr(Embedding, "_koboldai_patch_causallm_model", None):

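For readers following the removal above: `_move_to_devices` encoded breakmodel's layer-placement rule, in which the first `disk_blocks` CPU-side layers go to disk, the remaining CPU-side layers stay on the CPU, and the rest are split across GPUs according to `gpu_blocks`, located via a cumulative sum and `bisect`. A self-contained sketch of just that arithmetic (the layer names and block counts below are made-up examples, not values from the commit):

```python
# Standalone sketch of the layer-to-device arithmetic from the removed
# _move_to_devices method; the block counts below are made-up examples.
import bisect
import itertools

layer_names = [f"transformer.h.{i}" for i in range(12)]  # hypothetical module names
disk_blocks = 2          # first N CPU-side layers spill to disk
gpu_blocks = [4, 3]      # layers assigned to GPU 0 and GPU 1
ram_blocks = len(layer_names) - sum(gpu_blocks)          # layers kept off the GPUs
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

device_map = {}
for name in layer_names:
    layer = int(name.rsplit(".", 1)[1])
    if layer < ram_blocks:
        # CPU-side layers: the earliest ones can be offloaded to disk.
        device = "disk" if layer < disk_blocks else "cpu"
    else:
        # GPU layers: find which GPU's cumulative block range this layer falls in.
        device = bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
    device_map[name] = device

# e.g. {'transformer.h.0': 'disk', ..., 'transformer.h.5': 0, ..., 'transformer.h.11': 1}
```
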
@@ -413,11 +378,10 @@ class HFTorchInferenceModel(HFInferenceModel):
         if not self.lazy_load:
             return

-
-        disk_blocks = breakmodel.disk_blocks
-        gpu_blocks = breakmodel.gpu_blocks
-        ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
-        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
+        # disk_blocks = breakmodel.disk_blocks
+        # gpu_blocks = breakmodel.gpu_blocks
+        # ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
+        # cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

         def lazy_load_callback(
             model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]],

@@ -428,6 +392,7 @@ class HFTorchInferenceModel(HFInferenceModel):
             if lazy_load_callback.nested:
                 return
             lazy_load_callback.nested = True
+            return

             device_map: Dict[str, Union[str, int]] = {}

@@ -458,8 +423,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                     utils.koboldai_vars.gpu_device
                     if utils.koboldai_vars.hascuda and self.usegpu
                     else "cpu"
-                    if not utils.koboldai_vars.hascuda
-                    or not self.breakmodel
+                    if not utils.koboldai_vars.hascuda or not self.breakmodel
                     else breakmodel.primary_device
                 )
             else:

@@ -479,8 +443,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                         else "disk"
                         if layer < disk_blocks and layer < ram_blocks
                         else "cpu"
-                        if not utils.koboldai_vars.hascuda
-                        or not self.breakmodel
+                        if not utils.koboldai_vars.hascuda or not self.breakmodel
                         else "shared"
                         if layer < ram_blocks
                         else bisect.bisect_right(

@@ -519,7 +482,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                total=num_tensors,
                desc="Loading model tensors",
                file=utils.UIProgressBarFile(),
-               position=1
+               position=1,
            )

            if not is_safetensors:

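The `tqdm` call above reports progress through `utils.UIProgressBarFile()` rather than stderr; `tqdm` only requires the `file=` object to expose `write` and `flush`. A minimal stand-in illustrating that hook (the class below is for illustration and is not KoboldAI's UIProgressBarFile):

```python
# Minimal illustration of tqdm's file= hook used above; this sink simply
# forwards the rendered bar to print and stands in for utils.UIProgressBarFile.
from tqdm.auto import tqdm


class PrintProgressFile:
    """File-like object that redirects tqdm's output somewhere other than stderr."""

    def write(self, text):
        text = text.strip()
        if text:
            print(f"[progress] {text}")

    def flush(self):
        pass


for _ in tqdm(range(3), desc="Loading model tensors", file=PrintProgressFile()):
    pass
```
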
@@ -550,7 +513,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                         f.close()
                     ziproot = z.namelist()[0].split("/")[0]
                     f = z.open(f"{ziproot}/data/{storage_key}")

                     current_offset = 0
                 if current_offset != model_dict[key].seek_offset:
                     f.read(model_dict[key].seek_offset - current_offset)

@@ -574,7 +537,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                         )
                     )
                     # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
-                    #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
+                    # logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
                     model_dict[key] = model_dict[key].materialize(
                         f, map_location="cpu"
                     )

@@ -584,10 +547,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                     convert_to_float16
                     and breakmodel.primary_device != "cpu"
                     and utils.koboldai_vars.hascuda
-                    and (
-                        self.breakmodel
-                        or self.usegpu
-                    )
+                    and (self.breakmodel or self.usegpu)
                     and model_dict[key].dtype is torch.float32
                 ):
                     model_dict[key] = model_dict[key].to(torch.float16)

@@ -630,15 +590,11 @@ class HFTorchInferenceModel(HFInferenceModel):
                 convert_to_float16
                 and breakmodel.primary_device != "cpu"
                 and utils.koboldai_vars.hascuda
-                and (
-                    self.breakmodel
-                    or self.usegpu
-                )
+                and (self.breakmodel or self.usegpu)
             ):
                 dtype = torch.float16
             if breakmodel.primary_device == "cpu" or (
-                not self.usegpu
-                and not self.breakmodel
+                not self.usegpu and not self.breakmodel
             ):
                 dtype = torch.float32
             if (

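The float16-eligibility check that black reflows in these hunks is the same predicate each time: convert only when CUDA is present, the primary breakmodel device is not the CPU, and either GPU or breakmodel mode is active. Factored into one helper it reads roughly as follows (a sketch; the plain parameters stand in for `utils.koboldai_vars` and the model's own flags):

```python
# Sketch of the repeated float16-eligibility rule seen in the hunks above;
# hascuda/primary_device/breakmodel/usegpu stand in for the real flags.
import torch


def target_dtype(
    convert_to_float16: bool,
    hascuda: bool,
    primary_device,
    breakmodel: bool,
    usegpu: bool,
) -> torch.dtype:
    """Tensors are kept in float16 only when they will actually live on a GPU."""
    if (
        convert_to_float16
        and primary_device != "cpu"
        and hascuda
        and (breakmodel or usegpu)
    ):
        return torch.float16
    return torch.float32


assert target_dtype(True, True, 0, True, False) is torch.float16
assert target_dtype(True, True, "cpu", True, False) is torch.float32
```
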
@@ -693,10 +649,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                     convert_to_float16
                     and breakmodel.primary_device != "cpu"
                     and utils.koboldai_vars.hascuda
-                    and (
-                        self.breakmodel
-                        or self.usegpu
-                    )
+                    and (self.breakmodel or self.usegpu)
                     and model_dict[key].dtype is torch.float32
                 ):
                     model_dict[key] = model_dict[key].to(torch.float16)

@@ -741,15 +694,11 @@ class HFTorchInferenceModel(HFInferenceModel):
                 convert_to_float16
                 and breakmodel.primary_device != "cpu"
                 and utils.koboldai_vars.hascuda
-                and (
-                    self.breakmodel
-                    or self.usegpu
-                )
+                and (self.breakmodel or self.usegpu)
             ):
                 dtype = torch.float16
             if breakmodel.primary_device == "cpu" or (
-                not self.usegpu
-                and not self.breakmodel
+                not self.usegpu and not self.breakmodel
            ):
                 dtype = torch.float32
             if (

@@ -793,6 +742,7 @@ class HFTorchInferenceModel(HFInferenceModel):
             yield False

     def breakmodel_device_list(self, n_layers, primary=None, selected=None):
+        return
         # TODO: Find a better place for this or rework this

         device_count = torch.cuda.device_count()

@@ -824,6 +774,7 @@ class HFTorchInferenceModel(HFInferenceModel):

     def breakmodel_device_config(self, config):
         # TODO: Find a better place for this or rework this
+        return

         global breakmodel, generator
         import breakmodel

@@ -840,7 +791,7 @@ class HFTorchInferenceModel(HFInferenceModel):
             logger.info("Breakmodel not specified, assuming GPU 0")
             breakmodel.gpu_blocks = [n_layers]
             n_layers = 0

         else:
             s = n_layers
             for i in range(len(breakmodel.gpu_blocks)):

@@ -857,8 +808,14 @@ class HFTorchInferenceModel(HFInferenceModel):

         logger.init_ok("Final device configuration:", status="Info")
         self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
-        with open("settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w") as file:
-            file.write("{}\n{}".format(",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks))
+        with open(
+            "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
+        ) as file:
+            file.write(
+                "{}\n{}".format(
+                    ",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks
+                )
+            )

         # If all layers are on the same device, use the old GPU generation mode
         while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:

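The reformatted block above still writes the same two-line `settings/<model>.breakmodel` file: the first line is the comma-separated per-GPU block counts and the second is the disk block count. A hypothetical reader for that format, for illustration only (the loader that actually parses these files is not part of this diff):

```python
# Hypothetical reader for the two-line settings/<model>.breakmodel file written
# above: line 1 = comma-separated per-GPU block counts, line 2 = disk blocks.
def read_breakmodel_settings(model_name: str):
    path = "settings/{}.breakmodel".format(model_name.replace("/", "_"))
    with open(path) as file:
        gpu_line, disk_line = file.read().split("\n")[:2]
    gpu_blocks = [int(x) for x in gpu_line.split(",") if x.strip()]
    disk_blocks = int(disk_line)
    return gpu_blocks, disk_blocks


# e.g. a file containing "24,8\n0" parses to ([24, 8], 0).
```
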
@@ -876,9 +833,6 @@ class HFTorchInferenceModel(HFInferenceModel):

         if not breakmodel.gpu_blocks:
             logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
-            import breakmodel
-
-            breakmodel.primary_device = "cpu"
             self.breakmodel = False
             self.usegpu = False
             return