diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index a0ac9b55..83a5a318 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -124,7 +124,8 @@ class model_backend(HFTorchInferenceModel):
             # We must disable low_cpu_mem_usage and if using a GPT-2 model
             # because GPT-2 is not compatible with this feature yet.
             tf_kwargs.pop("low_cpu_mem_usage", None)
-
+            tf_kwargs.pop("quantization_config", None)
+
             # Also, lazy loader doesn't support GPT-2 models
             self.lazy_load = False
 
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 0b55e8dc..4ddf895b 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -126,8 +126,13 @@ class HFTorchInferenceModel(HFInferenceModel):
         return ret
 
     def get_auxilary_device(self) -> Union[str, int, torch.device]:
-        return self.breakmodel_config.primary_device
-
+        if self.breakmodel:
+            return self.breakmodel_config.primary_device
+        if self.usegpu:
+            return "cuda:0"
+        else:
+            return "cpu"
+
    def _get_target_dtype(self) -> Union[torch.float16, torch.float32]:
         if self.breakmodel_config.primary_device == "cpu":
             return torch.float32
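
A minimal standalone sketch of the device-selection fallback that the revised get_auxilary_device() expresses, assuming hypothetical breakmodel/usegpu flags and a primary_device value standing in for the real HFTorchInferenceModel attributes (pick_auxiliary_device is an illustrative name, not part of the codebase):

import torch

def pick_auxiliary_device(breakmodel: bool, usegpu: bool, primary_device):
    # Breakmodel (layer splitting) takes priority: use its configured primary device.
    if breakmodel:
        return primary_device
    # Otherwise fall back to the first GPU when one was requested, else the CPU.
    return "cuda:0" if usegpu else "cpu"

print(pick_auxiliary_device(False, True, torch.device("cpu")))  # -> "cuda:0"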