Fix for loading model multiple times losing the gpu/cpu splits

This commit is contained in:
ebolam
2023-05-22 20:34:01 -04:00
parent 9e53bcf676
commit 4c25d6fbbb
2 changed files with 3 additions and 6 deletions

View File

@@ -197,12 +197,6 @@ class HFInferenceModel(InferenceModel):
torch.cuda.empty_cache()
except:
pass
if self.hf_torch:
if 'breakmodel' in sys.modules:
import breakmodel
breakmodel.breakmodel = True
breakmodel.gpu_blocks = []
breakmodel.disk_blocks = 0
def _post_load(self) -> None:
# These are model specific tokenizer overrides if a model has bad defaults

View File

@@ -788,6 +788,7 @@ class HFTorchInferenceModel(HFInferenceModel):
if device_count < 2:
primary = None
logger.debug("n_layers: {}".format(n_layers))
logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
gpu_blocks = breakmodel.gpu_blocks + (
device_count - len(breakmodel.gpu_blocks)
) * [0]
@@ -818,6 +819,8 @@ class HFTorchInferenceModel(HFInferenceModel):
n_layers = utils.num_layers(config)
logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
if utils.args.cpu:
breakmodel.gpu_blocks = [0] * n_layers
return