Patches: Make lazyload work on quantized models

I wanna watch YouTube while my model is loading without locking up my system >:(
2025-06-05 21:59:24 +02:00 · 2023-07-17 16:47:31 -05:00
parent e8d84bb787
commit 23b95343bd
2 changed files with 2 additions and 3 deletions
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -78,7 +78,6 @@ class model_backend(HFTorchInferenceModel):
        }
        
        if self.use_4_bit:
-            self.lazy_load = False
            tf_kwargs.update({
                "quantization_config":BitsAndBytesConfig(
                    load_in_4bit=True,
--- a/modeling/patches.py
+++ b/modeling/patches.py
@@ -181,7 +181,7 @@ class LazyloadPatches:
        is_quantized = is_quantized or load_in_8bit

        if is_quantized:
-            from .utils.bitsandbytes import set_module_8bit_tensor_to_device
+            from transformers.utils.bitsandbytes import set_module_quantized_tensor_to_device

        error_msgs = []

@@ -299,7 +299,7 @@ class LazyloadPatches:
                    fp16_statistics = None

                if "SCB" not in param_name:
-                    set_module_8bit_tensor_to_device(
+                    set_module_quantized_tensor_to_device(
                        model,
                        param_name,
                        param_device,