Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Maybe works now...?
@@ -71,6 +71,9 @@ class model_backend(HFTorchInferenceModel):
             )
 
+        if self.lazy_load:
+            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+            tf_kwargs.pop("low_cpu_mem_usage", None)
 
         # If we're using lazy loader, we need to figure out what the model's hidden layers are called
         with lazy_loader.use_lazy_load(dematerialized_modules=True):
             try:
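The `use_lazy_load(dematerialized_modules=True)` context above lets the backend instantiate the architecture without allocating real weights, so it can discover the names of the model's hidden-layer modules first. A rough standalone analogue of that idea, using PyTorch's meta device rather than KoboldAI's own loader ("gpt2" is only an illustrative model id):

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("gpt2")

    # Build the module tree on the meta device: module names and parameter
    # shapes exist, but no weight memory is allocated - similar in spirit
    # to dematerialized_modules=True in KoboldAI's lazy_loader.
    with torch.device("meta"):
        metamodel = AutoModelForCausalLM.from_config(config)

    # The backend can now inspect layer names without paying for the weights.
    layer_names = [name for name, _ in metamodel.named_modules() if name]
    print(layer_names[:5])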
@@ -83,145 +86,92 @@ class model_backend(HFTorchInferenceModel):
                 self.lazy_load = False
 
         # Download model from Huggingface if it does not exist, otherwise load locally
-        with self._maybe_use_float16(), lazy_loader.use_lazy_load(
-            enable=self.lazy_load,
-            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if self.lazy_load
-            else None,
-            dematerialized_modules=True,
-        ):
-            if self.lazy_load:
-                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
-                tf_kwargs.pop("low_cpu_mem_usage", None)
-            if self.get_local_model_path():
-                # Model is stored locally, load it.
-                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
-            else:
-                # Model not stored locally, we need to download it.
+        if self.get_local_model_path():
+            # Model is stored locally, load it.
+            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        else:
+            # Model not stored locally, we need to download it.
 
-                # _rebuild_tensor patch for casting dtype and supporting LazyTensors
-                old_rebuild_tensor = torch._utils._rebuild_tensor
-
-                def new_rebuild_tensor(
-                    storage: Union[lazy_loader.LazyTensor, torch.Storage],
-                    storage_offset,
-                    shape,
-                    stride,
-                ):
-                    if not isinstance(storage, lazy_loader.LazyTensor):
-                        dtype = storage.dtype
-                    else:
-                        dtype = storage.storage_type.dtype
-                        if not isinstance(dtype, torch.dtype):
-                            dtype = storage.storage_type(0).dtype
-                    if dtype is torch.float32 and len(shape) >= 2:
-                        utils.koboldai_vars.fp32_model = True
-                    return old_rebuild_tensor(storage, storage_offset, shape, stride)
-
-                torch._utils._rebuild_tensor = new_rebuild_tensor
-                self.model = self._get_model(self.model_name, tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.model_name)
-                torch._utils._rebuild_tensor = old_rebuild_tensor
+            # _rebuild_tensor patch for casting dtype and supporting LazyTensors
+            old_rebuild_tensor = torch._utils._rebuild_tensor
+
+            def new_rebuild_tensor(
+                storage: Union[lazy_loader.LazyTensor, torch.Storage],
+                storage_offset,
+                shape,
+                stride,
+            ):
+                if not isinstance(storage, lazy_loader.LazyTensor):
+                    dtype = storage.dtype
+                else:
+                    dtype = storage.storage_type.dtype
+                    if not isinstance(dtype, torch.dtype):
+                        dtype = storage.storage_type(0).dtype
+                if dtype is torch.float32 and len(shape) >= 2:
+                    utils.koboldai_vars.fp32_model = True
+                return old_rebuild_tensor(storage, storage_offset, shape, stride)
+
+            torch._utils._rebuild_tensor = new_rebuild_tensor
+            self.model = self._get_model(self.model_name, tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.model_name)
+            torch._utils._rebuild_tensor = old_rebuild_tensor
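The block above temporarily swaps out `torch._utils._rebuild_tensor`, the low-level helper that `torch.load` routes every deserialized tensor through, so the loader can notice whether the checkpoint contains fp32 weight matrices. A minimal standalone sketch of the same monkey-patch pattern; `DummyVars` is a hypothetical stand-in for `utils.koboldai_vars`, and on recent PyTorch the storage argument exposes `.dtype` directly (the diff additionally handles older storage types and LazyTensors):

    import torch

    # Hypothetical stand-in for utils.koboldai_vars; only the flag matters here.
    class DummyVars:
        fp32_model = False

    dummy_vars = DummyVars()

    old_rebuild_tensor = torch._utils._rebuild_tensor

    def new_rebuild_tensor(storage, storage_offset, shape, stride):
        # torch.load() passes every tensor it deserializes through this helper,
        # so dtypes can be observed without touching the checkpoint format.
        if storage.dtype is torch.float32 and len(shape) >= 2:
            dummy_vars.fp32_model = True  # a 2D+ fp32 tensor looks like a weight matrix
        return old_rebuild_tensor(storage, storage_offset, shape, stride)

    torch._utils._rebuild_tensor = new_rebuild_tensor
    try:
        # state_dict = torch.load("pytorch_model.bin")  # would trip the probe
        pass
    finally:
        # Always restore the original, as the diff does right after loading.
        torch._utils._rebuild_tensor = old_rebuild_tensor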
-                if save_model:
-                    self.tokenizer.save_pretrained(
-                        self.get_local_model_path(ignore_existance=True)
-                    )
+            if save_model:
+                self.tokenizer.save_pretrained(
+                    self.get_local_model_path(ignore_existance=True)
+                )
 
-                    if utils.koboldai_vars.fp32_model:
-                        # Use save_pretrained to convert fp32 models to fp16,
-                        # unless we are using disk cache because save_pretrained
-                        # is not supported in that case
-                        self.model = self.model.half()
-                        self.model.save_pretrained(
-                            self.get_local_model_path(ignore_existance=True),
-                            max_shard_size="500MiB",
-                        )
+                if utils.koboldai_vars.fp32_model:
+                    # Use save_pretrained to convert fp32 models to fp16,
+                    # unless we are using disk cache because save_pretrained
+                    # is not supported in that case
+                    self.model = self.model.half()
+                    self.model.save_pretrained(
+                        self.get_local_model_path(ignore_existance=True),
+                        max_shard_size="500MiB",
+                    )
 
-                    else:
-                        # For fp16 models, we can just copy the model files directly
-                        import transformers.configuration_utils
-                        import transformers.modeling_utils
-                        import transformers.file_utils
-                        import huggingface_hub
+                else:
+                    # For fp16 models, we can just copy the model files directly
+                    import transformers.configuration_utils
+                    import transformers.modeling_utils
+                    import transformers.file_utils
+                    import huggingface_hub
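When the probe above has flagged an fp32 checkpoint, the code converts the model in memory and lets `save_pretrained` rewrite it as fp16 shards of at most 500 MiB. The same idea in isolation (model id and output path are illustrative):

    from transformers import AutoModelForCausalLM

    # Illustrative names; the diff uses self.model and get_local_model_path().
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Cast all weights to fp16 in memory, then let save_pretrained write the
    # converted checkpoint back out as ~500 MiB shards.
    model = model.half()
    model.save_pretrained("./gpt2-fp16", max_shard_size="500MiB")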
 
-                        # Save the config.json
-                        shutil.move(
-                            os.path.realpath(
-                                huggingface_hub.hf_hub_download(
-                                    self.model_name,
-                                    transformers.configuration_utils.CONFIG_NAME,
-                                    revision=utils.koboldai_vars.revision,
-                                    cache_dir="cache",
-                                    local_files_only=True,
-                                    legacy_cache_layout=False,
-                                )
-                            ),
-                            os.path.join(
-                                self.get_local_model_path(ignore_existance=True),
-                                transformers.configuration_utils.CONFIG_NAME,
-                            ),
-                        )
+                    # Save the config.json
+                    shutil.move(
+                        os.path.realpath(
+                            huggingface_hub.hf_hub_download(
+                                self.model_name,
+                                transformers.configuration_utils.CONFIG_NAME,
+                                revision=utils.koboldai_vars.revision,
+                                cache_dir="cache",
+                                local_files_only=True,
+                                legacy_cache_layout=False,
+                            )
+                        ),
+                        os.path.join(
+                            self.get_local_model_path(ignore_existance=True),
+                            transformers.configuration_utils.CONFIG_NAME,
+                        ),
+                    )
 
-                        if utils.num_shards is None:
-                            # Save the pytorch_model.bin or model.safetensors of an unsharded model
-                            any_success = False
-                            possible_checkpoint_names = [
-                                transformers.modeling_utils.WEIGHTS_NAME,
-                                "model.safetensors",
-                            ]
-
-                            for possible_checkpoint_name in possible_checkpoint_names:
-                                try:
-                                    shutil.move(
-                                        os.path.realpath(
-                                            huggingface_hub.hf_hub_download(
-                                                self.model_name,
-                                                possible_checkpoint_name,
-                                                revision=utils.koboldai_vars.revision,
-                                                cache_dir="cache",
-                                                local_files_only=True,
-                                                legacy_cache_layout=False,
-                                            )
-                                        ),
-                                        os.path.join(
-                                            self.get_local_model_path(
-                                                ignore_existance=True
-                                            ),
-                                            possible_checkpoint_name,
-                                        ),
-                                    )
-                                    any_success = True
-                                except Exception:
-                                    pass
-
-                            if not any_success:
-                                raise RuntimeError(
-                                    f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
-                                )
-                        else:
-                            # Handle saving sharded models
-
-                            with open(utils.from_pretrained_index_filename) as f:
-                                map_data = json.load(f)
-                            filenames = set(map_data["weight_map"].values())
-                            # Save the pytorch_model.bin.index.json of a sharded model
-                            shutil.move(
-                                os.path.realpath(utils.from_pretrained_index_filename),
-                                os.path.join(
-                                    self.get_local_model_path(ignore_existance=True),
-                                    transformers.modeling_utils.WEIGHTS_INDEX_NAME,
-                                ),
-                            )
-                            # Then save the pytorch_model-#####-of-#####.bin files
-                            for filename in filenames:
-                                shutil.move(
-                                    os.path.realpath(
-                                        huggingface_hub.hf_hub_download(
-                                            self.model_name,
-                                            filename,
-                                            revision=utils.koboldai_vars.revision,
-                                            cache_dir="cache",
-                                            local_files_only=True,
-                                            legacy_cache_layout=False,
-                                        )
-                                    ),
-                                    os.path.join(
-                                        self.get_local_model_path(ignore_existance=True),
-                                        filename,
-                                    ),
-                                )
@@ -232,13 +182,53 @@ class model_backend(HFTorchInferenceModel):
+                if utils.num_shards is None:
+                    # Save the pytorch_model.bin or model.safetensors of an unsharded model
+                    any_success = False
+                    possible_checkpoint_names = [
+                        transformers.modeling_utils.WEIGHTS_NAME,
+                        "model.safetensors",
+                    ]
+
+                    for possible_checkpoint_name in possible_checkpoint_names:
+                        try:
+                            shutil.move(
+                                os.path.realpath(
+                                    huggingface_hub.hf_hub_download(
+                                        self.model_name,
+                                        possible_checkpoint_name,
+                                        revision=utils.koboldai_vars.revision,
+                                        cache_dir="cache",
+                                        local_files_only=True,
+                                        legacy_cache_layout=False,
+                                    )
+                                ),
+                                os.path.join(
+                                    self.get_local_model_path(
+                                        ignore_existance=True
+                                    ),
+                                    possible_checkpoint_name,
+                                ),
+                            )
+                            shutil.rmtree("cache/")
+                            any_success = True
+                        except Exception:
+                            pass
+
+                    if not any_success:
+                        raise RuntimeError(
+                            f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
+                        )
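The unsharded branch re-downloads nothing: `from_pretrained` already populated the `cache` directory, so `hf_hub_download(..., local_files_only=True)` merely resolves the cached file, which is then moved into the model folder. Each plausible checkpoint filename is tried in turn, since a repo ships either `pytorch_model.bin` or `model.safetensors`. A simplified sketch of that fallback loop (model id and target path are illustrative, and the `legacy_cache_layout` flag is omitted):

    import os
    import shutil

    import huggingface_hub

    # Illustrative stand-ins for self.model_name and the local model directory.
    model_name = "gpt2"
    target_dir = "./models/gpt2"
    os.makedirs(target_dir, exist_ok=True)

    any_success = False
    for checkpoint_name in ("pytorch_model.bin", "model.safetensors"):
        try:
            cached = huggingface_hub.hf_hub_download(
                model_name,
                checkpoint_name,
                cache_dir="cache",
                local_files_only=True,  # resolve only; the file was fetched earlier
            )
            # realpath resolves the cache's symlink so the real file gets moved.
            shutil.move(os.path.realpath(cached), os.path.join(target_dir, checkpoint_name))
            any_success = True
        except Exception:
            continue

    if not any_success:
        raise RuntimeError("no known checkpoint file found in the local cache")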
+                else:
+                    # Handle saving sharded models
+
+                    with open(utils.from_pretrained_index_filename) as f:
+                        map_data = json.load(f)
+                    filenames = set(map_data["weight_map"].values())
+                    # Save the pytorch_model.bin.index.json of a sharded model
+                    shutil.move(
+                        os.path.realpath(utils.from_pretrained_index_filename),
+                        os.path.join(
+                            self.get_local_model_path(ignore_existance=True),
+                            transformers.modeling_utils.WEIGHTS_INDEX_NAME,
+                        ),
+                    )
+                    # Then save the pytorch_model-#####-of-#####.bin files
+                    for filename in filenames:
+                        shutil.move(
+                            os.path.realpath(
+                                huggingface_hub.hf_hub_download(
+                                    self.model_name,
+                                    filename,
+                                    revision=utils.koboldai_vars.revision,
+                                    cache_dir="cache",
+                                    local_files_only=True,
+                                    legacy_cache_layout=False,
+                                )
+                            ),
+                            os.path.join(
+                                self.get_local_model_path(ignore_existance=True),
+                                filename,
+                            ),
+                        )
+                    shutil.rmtree("cache/")
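For sharded checkpoints, the index file's `weight_map` records which shard holds each tensor, so collecting its values into a set yields the exact list of shard files to relocate. A small sketch of that bookkeeping (`pytorch_model.bin.index.json` is the standard transformers index filename):

    import json

    # The index of a sharded checkpoint maps each weight tensor to the shard
    # that stores it, e.g. "pytorch_model-00001-of-00002.bin".
    with open("pytorch_model.bin.index.json") as f:
        map_data = json.load(f)

    # Deduplicate: many tensors live in the same shard file.
    filenames = set(map_data["weight_map"].values())
    for filename in sorted(filenames):
        print(filename)  # each of these would be downloaded and moved in turn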
 
         self.patch_embedding()
-        self.model.tie_weights()
+        # self.model.tie_weights()
 
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()