diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 3ff38d33..b17d04bf 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -295,7 +295,12 @@ class model_backend(InferenceModel):
         )
 
     def _get_model(self, location: str, tf_kwargs: Dict):
+        if not self.model_config:
+            # Keep the constructed config — discarding it would leave self.model_config unset.
+            self.model_config = ExLlamaConfig(os.path.join(location, "config.json"))
+        _, self.model_config.model_path = load_model_gptq_settings(location)
+
         # self.model_config.gpu_peer_fix = True
         return ExLlama(self.model_config)
 
     def _get_tokenizer(self, location: str):
@@ -351,6 +355,7 @@ class model_backend(InferenceModel):
             layers.append(parameters["{}_Layers".format(i)])
 
         self.layers = layers
+        self.model_config.device_map.layers = []
         for i, l in enumerate(layers):
             if l > 0:
                 self.model_config.device_map.layers.extend([f"cuda:{i}"] * l)