diff --git a/aiserver.py b/aiserver.py
index b3604785..4a302a13 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -2461,16 +2461,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     global tokenizer
     koboldai_vars.aibusy = True
     koboldai_vars.horde_share = False
-    reset_model_settings()
-
     if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
         use_8_bit = False
     if use_8_bit:
         koboldai_vars.lazy_load = False
-        koboldai_vars.breakmodel = False
-    logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
     if(initial_load):
         use_breakmodel_args = True
+    reset_model_settings()
     if not utils.HAS_ACCELERATE:
         disk_layers = None
     koboldai_vars.reset_model()
@@ -2632,7 +2629,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     elif koboldai_vars.hascuda:
         if(koboldai_vars.bmsupported):
             koboldai_vars.usegpu = False
-            koboldai_vars.breakmodel = True if not use_8_bit else False
+            koboldai_vars.breakmodel = True
         else:
             koboldai_vars.breakmodel = False
             koboldai_vars.usegpu = use_gpu
@@ -2680,7 +2677,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         # Lazy loader
         import torch_lazy_loader
         def get_lazy_load_callback(n_layers, convert_to_float16=True):
-            logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
             if not koboldai_vars.lazy_load:
                 return
 
@@ -2922,17 +2918,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                     except Exception as e:
                         tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                     try:
-                        logger.info("Using 8 bit: {}".format(use_8_bit))
                         model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
                             raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
                         model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
                 else:
-                    try:
-                        torch._utils._rebuild_tensor = old_rebuild_tensor
-                    except:
-                        pass
                     old_rebuild_tensor = torch._utils._rebuild_tensor
                     def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride):
                         if(not isinstance(storage, torch_lazy_loader.LazyTensor)):
@@ -2957,7 +2948,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                     except Exception as e:
                         tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                     try:
-                        logger.info("Using 8 bit: {}".format(use_8_bit))
                         model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
@@ -2966,7 +2956,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
                     torch._utils._rebuild_tensor = old_rebuild_tensor
 
-                    if (not args.colab or args.savemodel) and not use_8_bit:
+                    if not args.colab or args.savemodel:
                         import shutil
                         tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
                         if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16