Fix for loading model multiple times losing the gpu/cpu splits

This commit is contained in:
ebolam
2023-05-22 20:34:01 -04:00
parent 9e53bcf676
commit 4c25d6fbbb
2 changed files with 3 additions and 6 deletions

View File

@@ -197,12 +197,6 @@ class HFInferenceModel(InferenceModel):
torch.cuda.empty_cache()
except:
pass
if self.hf_torch:
if 'breakmodel' in sys.modules:
import breakmodel
breakmodel.breakmodel = True
breakmodel.gpu_blocks = []
breakmodel.disk_blocks = 0
def _post_load(self) -> None:
# These are model specific tokenizer overrides if a model has bad defaults

View File

@@ -788,6 +788,7 @@ class HFTorchInferenceModel(HFInferenceModel):
if device_count < 2:
primary = None
logger.debug("n_layers: {}".format(n_layers))
logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
gpu_blocks = breakmodel.gpu_blocks + (
device_count - len(breakmodel.gpu_blocks)
) * [0]
@@ -818,6 +819,8 @@ class HFTorchInferenceModel(HFInferenceModel):
n_layers = utils.num_layers(config)
logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
if utils.args.cpu:
breakmodel.gpu_blocks = [0] * n_layers
return