From 4c25d6fbbbfad67176056a6f5af1826c2c2eb24c Mon Sep 17 00:00:00 2001
From: ebolam
Date: Mon, 22 May 2023 20:34:01 -0400
Subject: [PATCH] Fix for loading model multiple times losing the gpu/cpu splits

---
 modeling/inference_models/hf.py       | 6 ------
 modeling/inference_models/hf_torch.py | 3 +++
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index e801eab2..b50ebf56 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -197,12 +197,6 @@ class HFInferenceModel(InferenceModel):
             torch.cuda.empty_cache()
         except:
             pass
-        if self.hf_torch:
-            if 'breakmodel' in sys.modules:
-                import breakmodel
-                breakmodel.breakmodel = True
-                breakmodel.gpu_blocks = []
-                breakmodel.disk_blocks = 0
 
     def _post_load(self) -> None:
         # These are model specific tokenizer overrides if a model has bad defaults
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index c5560360..681d3ab1 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -788,6 +788,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         if device_count < 2:
             primary = None
         logger.debug("n_layers: {}".format(n_layers))
+        logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
         gpu_blocks = breakmodel.gpu_blocks + (
             device_count - len(breakmodel.gpu_blocks)
         ) * [0]
@@ -818,6 +819,8 @@ class HFTorchInferenceModel(HFInferenceModel):
 
         n_layers = utils.num_layers(config)
 
+        logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
+
         if utils.args.cpu:
             breakmodel.gpu_blocks = [0] * n_layers
             return