Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Revert back out VE 8 bit loading
1 changed file: aiserver.py (24 changed lines)
@@ -2465,6 +2465,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
         use_8_bit = False
+    if use_8_bit:
+        koboldai_vars.lazy_load = False
+        koboldai_vars.breakmodel = False
+    logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
     if(initial_load):
         use_breakmodel_args = True
     if not utils.HAS_ACCELERATE:
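For orientation, a minimal sketch of what a gate like this does. `resolve_8bit` and the `flags` object are illustrative stand-ins, not KoboldAI code; only the field names mirror the variables above:

    import importlib.util

    def resolve_8bit(flags, requested: bool) -> bool:
        # 8-bit needs bitsandbytes installed and an explicit experimental opt-in.
        bit_8_available = importlib.util.find_spec("bitsandbytes") is not None
        if not bit_8_available or not flags.experimental_features:
            return False
        if requested:
            # Hugging Face's load_in_8bit path does its own device placement,
            # so the custom lazy loader and breakmodel splitting are turned off.
            flags.lazy_load = False
            flags.breakmodel = False
        return requested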
@@ -2676,6 +2680,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     # Lazy loader
     import torch_lazy_loader
     def get_lazy_load_callback(n_layers, convert_to_float16=True):
+        logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
         if not koboldai_vars.lazy_load:
             return

@@ -2737,8 +2742,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         koboldai_vars.loaded_layers = 0
         utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio())

-        if koboldai_vars.bit_8_available:
-            import bitsandbytes as bnb
         with zipfile.ZipFile(f, "r") as z:
             try:
                 last_storage_key = None
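The deleted import fed the 8-bit branch inside the lazy loader's per-tensor callback (next hunk). As a rough sketch of the surrounding progress-reporting pattern, assuming an ordinary checkpoint on disk and plain tqdm output instead of KoboldAI's Send_to_socketio stream; `load_with_progress` is an illustrative helper:

    import torch
    from tqdm import tqdm

    def load_with_progress(path: str) -> dict:
        # Materialize checkpoint tensors one by one while advancing a
        # progress bar, as utils.bar does above.
        state_dict = torch.load(path, map_location="cpu")
        out = {}
        for key, tensor in tqdm(state_dict.items(), desc="Loading model tensors"):
            out[key] = tensor  # the real lazy loader materializes lazily here
        return out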
@@ -2766,11 +2769,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
                 if model_dict[key].dtype is torch.float32:
                     koboldai_vars.fp32_model = True
-                if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and any(model_dict[key].dtype is t for t in (torch.float32, torch.float16)):
-                    if use_8_bit:
-                        model_dict[key] = bnb.nn.Int8Params(model_dict[key].to(torch.float16), requires_grad=False, has_fp16_weights=False).to(device if device not in ("shared", "disk") else "cpu")
-                    else:
-                        model_dict[key] = model_dict[key].to(torch.float16)
+                if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
+                    model_dict[key] = model_dict[key].to(torch.float16)
                 if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16):
                     model_dict[key] = model_dict[key].to(torch.float32)
                 if device == "shared":
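The removed branch wrapped each half-precision weight in bitsandbytes' Int8Params, which quantizes the data to int8 once the parameter is moved to a CUDA device; has_fp16_weights=False keeps only the quantized copy. Reduced to a single tensor, and assuming bitsandbytes and a CUDA device are available, the two paths look roughly like this (`place_weight` is an illustrative name):

    import torch
    import bitsandbytes as bnb

    def place_weight(w: torch.Tensor, device, use_8_bit: bool) -> torch.Tensor:
        if use_8_bit:
            # Quantization happens on the move to a CUDA device; "shared" and
            # "disk" tensors stay on the CPU, as in the removed lines above.
            return bnb.nn.Int8Params(
                w.to(torch.float16), requires_grad=False, has_fp16_weights=False
            ).to(device if device not in ("shared", "disk") else "cpu")
        # The plain fp16 path the code falls back to after this commit.
        return w.to(torch.float16).to(device)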
@@ -2882,7 +2882,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         import shutil
         shutil.move(koboldai_vars.model.replace('/', '_'), "models/{}".format(koboldai_vars.model.replace('/', '_')))
     if(koboldai_vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called
-        with torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, dematerialized_modules=True, use_accelerate_init_empty_weights=True):
+        with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True):
            try:
                metamodel = AutoModelForCausalLM.from_config(model_config)
            except Exception as e:
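use_lazy_torch_load is KoboldAI's own context manager, and this change simply narrows its argument list back down. For readers unfamiliar with the pattern, a generic sketch of a context manager that patches torch.load and restores it on exit; `patched_loader` is illustrative, not the project's API:

    import contextlib
    import torch

    @contextlib.contextmanager
    def patched_loader(enable: bool = True):
        if not enable:
            yield
            return
        old_load = torch.load

        def _load(*args, **kwargs):
            # Force CPU materialization unless the caller overrides it.
            kwargs.setdefault("map_location", "cpu")
            return old_load(*args, **kwargs)

        torch.load = _load
        try:
            yield
        finally:
            # Always restore the original loader, even on error.
            torch.load = old_load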
@@ -2890,7 +2890,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                utils.layers_module_names = utils.get_layers_module_names(metamodel)
                utils.module_names = list(metamodel.state_dict().keys())
                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
-    with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True):
+    with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True):
        if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
            lowmem = {}
        if(os.path.isdir(koboldai_vars.custmodpth)):
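The metamodel exists only so the layer, module, and buffer names can be read off before any real weights load. accelerate's public init_empty_weights gives the same effect, and the use_accelerate_init_empty_weights=True flag above presumably delegates to it. A standalone sketch, with an illustrative model id:

    from accelerate import init_empty_weights
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-125M")
    with init_empty_weights():
        # Parameters land on the meta device: names and shapes exist, but no
        # memory is allocated for the weights themselves.
        metamodel = AutoModelForCausalLM.from_config(config)

    module_names = list(metamodel.state_dict().keys())
    named_buffers = list(metamodel.named_buffers(recurse=True))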
@@ -2905,12 +2905,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
            except Exception as e:
                tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
            try:
                logger.info("Using 8 bit: {}".format(use_8_bit))
                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
            except Exception as e:
                if("out of memory" in traceback.format_exc().lower()):
                    raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
                logger.info("Automodel failed, falling back to GPTNeo")
                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
        elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
            try:
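With the lazy-loader integration gone, 8-bit now rides entirely on transformers' own load_in_8bit flag, as in the calls above. A self-contained sketch of that path, assuming bitsandbytes and accelerate are installed; the model id is illustrative:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "EleutherAI/gpt-neo-2.7B"
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="cache")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir="cache",
        load_in_8bit=True,   # quantize linear layers to int8 at load time
        device_map="auto",   # let accelerate place layers across devices
    )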
@@ -2929,7 +2927,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
            except Exception as e:
                if("out of memory" in traceback.format_exc().lower()):
                    raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
                logger.info("Automodel failed, falling back to GPTNeo")
                model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
        else:
            try:
@@ -2965,7 +2962,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
            except Exception as e:
                if("out of memory" in traceback.format_exc().lower()):
                    raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
                logger.info("Automodel failed, falling back to GPTNeo")
                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)

        torch._utils._rebuild_tensor = old_rebuild_tensor
@@ -3006,7 +3002,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
    if(koboldai_vars.hascuda):
        if(koboldai_vars.usegpu):
            koboldai_vars.modeldim = get_hidden_size_from_model(model)
-            #model = model.half().to(koboldai_vars.gpu_device)
+            model = model.half().to(koboldai_vars.gpu_device)
            generator = model.generate
        elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel)
            koboldai_vars.modeldim = get_hidden_size_from_model(model)
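Restoring model.half().to(...) brings back the plain single-GPU path. One caveat worth noting: a model loaded in 8-bit must not be cast or moved this way, since its weights are already quantized and placed by device_map. A hedged sketch using the is_loaded_in_8bit attribute transformers sets on such models; `to_single_gpu` is an illustrative helper:

    def to_single_gpu(model, gpu_device=0):
        # 8-bit models are already on-device; half()/to() would break them.
        if getattr(model, "is_loaded_in_8bit", False):
            return model
        return model.half().to(gpu_device)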