diff --git a/aiserver.py b/aiserver.py
index cbb15dc0..b2d22e56 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -3779,7 +3779,7 @@ def calcsubmit(txt):
                     bias[i] = b["multiplier"]
 
-            device = model.get_auxilary_device()
+            device = utils.get_auxilary_device()
             attention_bias.attention_bias = torch.Tensor(bias).to(device)
             logger.info(f"Bias by {koboldai_vars.memory_attn_bias} -- {attention_bias.attention_bias}")
             logger.debug("Submit: experimental_features time {}s".format(time.time()-start_time))
 
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 8d06ff6e..a10a48f3 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -46,9 +46,14 @@ class BreakmodelConfig:
 
     def __init__(self) -> None:
         self.disk_blocks = 0
         self.gpu_blocks = []
+        self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu"
 
     def get_device_map(self, model: nn.Module) -> dict:
+        # HACK
+        if utils.args.cpu:
+            self.primary_device = "cpu"
+
         ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
         cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
         device_map = {}
@@ -311,10 +316,21 @@ class HFTorchInferenceModel(HFInferenceModel):
 
         # Try to determine model type from either AutoModel or falling back to legacy
         try:
+            print(f"self.lazy_load {self.lazy_load}")
+            print(f"self.breakmodel {self.breakmodel}")
+            print(f"self.nobreakmodel {self.nobreakmodel}")
+            print(f"args.cpu {utils.args.cpu}")
+
             if self.lazy_load:
                 with lazy_loader.use_lazy_load(dematerialized_modules=True):
                     metamodel = AutoModelForCausalLM.from_config(self.model_config)
-                    tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel)
+                    if utils.args.cpu:
+                        cpu_map = {name: "cpu" for name in utils.layers_module_names}
+                        for name in utils.get_missing_module_names(metamodel, list(cpu_map.keys())):
+                            cpu_map[name] = "cpu"
+                        tf_kwargs["device_map"] = cpu_map
+                    else:
+                        tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel)
 
             with lazy_loader.use_lazy_load(
                 enable=self.lazy_load,
diff --git a/utils.py b/utils.py
index 863bda2f..0d30f194 100644
--- a/utils.py
+++ b/utils.py
@@ -655,10 +655,13 @@ def get_auxilary_device():
 
     # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
     if koboldai_vars.hascuda and koboldai_vars.usegpu:
+        print("GP")
         return koboldai_vars.gpu_device
     elif koboldai_vars.hascuda:
         # TODO: Primary device
+        print("CUDA")
         return "cuda"
+    print("CPU")
     return "cpu"
 
 #==================================================================#
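
For readers of this patch: the sketch below is not part of the diff and not KoboldAI's actual implementation. It is a minimal illustration of the two device-map strategies the hf_torch.py hunks switch between: the breakmodel-style split that get_device_map builds from gpu_blocks, and the new --cpu branch that pins every layer module to "cpu". The function sketch_device_map and the transformer.h.* layer names are hypothetical stand-ins; the real code reads names from utils.layers_module_names and backfills non-layer modules with utils.get_missing_module_names.

```python
import itertools

def sketch_device_map(layer_names, gpu_blocks, force_cpu=False):
    """Toy breakmodel-style device map (illustrative only)."""
    if force_cpu:
        # Mirrors the new --cpu branch: every module is pinned to the CPU.
        return {name: "cpu" for name in layer_names}

    # Layers not covered by any GPU block stay in CPU RAM.
    ram_blocks = len(layer_names) - sum(gpu_blocks)
    cumulative = tuple(itertools.accumulate(gpu_blocks))

    device_map = {}
    for i, name in enumerate(layer_names):
        if i < ram_blocks:
            device_map[name] = "cpu"
        else:
            # First GPU whose cumulative block count covers this layer.
            device_map[name] = next(
                gpu for gpu, blocks in enumerate(cumulative)
                if i - ram_blocks < blocks
            )
    return device_map

# 6 layers, 4 of them on GPU 0, the first 2 left in CPU RAM:
layers = [f"transformer.h.{i}" for i in range(6)]
print(sketch_device_map(layers, gpu_blocks=[4]))
# {'transformer.h.0': 'cpu', 'transformer.h.1': 'cpu', 'transformer.h.2': 0, ...}
print(sketch_device_map(layers, gpu_blocks=[4], force_cpu=True))
# every module -> 'cpu'
```

A dict of this shape is what ends up in tf_kwargs["device_map"] and is ultimately passed to AutoModelForCausalLM.from_pretrained, which Transformers (via Accelerate) uses to decide where each module's weights are placed at load time.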