diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index e801eab2..b50ebf56 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -197,12 +197,6 @@ class HFInferenceModel(InferenceModel):
                 torch.cuda.empty_cache()
             except:
                 pass
-        if self.hf_torch:
-            if 'breakmodel' in sys.modules:
-                import breakmodel
-                breakmodel.breakmodel = True
-                breakmodel.gpu_blocks = []
-                breakmodel.disk_blocks = 0
 
     def _post_load(self) -> None:
         # These are model specific tokenizer overrides if a model has bad defaults
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index c5560360..681d3ab1 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -788,6 +788,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         if device_count < 2:
             primary = None
         logger.debug("n_layers: {}".format(n_layers))
+        logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
         gpu_blocks = breakmodel.gpu_blocks + (
             device_count - len(breakmodel.gpu_blocks)
         ) * [0]
@@ -818,6 +819,8 @@ class HFTorchInferenceModel(HFInferenceModel):
         n_layers = utils.num_layers(config)
 
+        logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
+
         if utils.args.cpu:
             breakmodel.gpu_blocks = [0] * n_layers
             return
 
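
For context, the gpu_blocks expression logged by the new debug line in the first hf_torch.py hunk pads the per-GPU layer list out to the detected device count. A minimal sketch of that padding idiom, using hypothetical values in place of breakmodel.gpu_blocks and the real device count:

# Sketch of the list-padding idiom from the hf_torch.py hunk.
# Values are hypothetical; in the real code they come from
# breakmodel.gpu_blocks and torch.cuda.device_count().
gpu_blocks = [28, 4]  # layers assigned to GPU 0 and GPU 1
device_count = 4      # four visible CUDA devices

# Extend the list with zeros so every device has an entry;
# devices without an explicit allocation receive no layers.
gpu_blocks = gpu_blocks + (device_count - len(gpu_blocks)) * [0]
assert gpu_blocks == [28, 4, 0, 0]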