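It just works: drop the explicit .half()/.float() casts when loading or moving models, comment out the fp16 conversion and named-buffer dtype/offload handling in the lazy loader, and read the 4-bit checkpoint path from LLAMA_4BIT instead of LLAMA_30B_4BIT.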

2025-06-05 21:59:24 +02:00 · 2023-03-10 00:01:40 -05:00
parent 8c9ed55406
commit dcf9d37a00
1 changed file with 22 additions and 24 deletions
--- a/aiserver.py
+++ b/aiserver.py
@@ -1142,9 +1142,9 @@ def move_model_to_devices(model):

    if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel):
        if(koboldai_vars.usegpu):
-            model = model.half().to(koboldai_vars.gpu_device)
+            model = model.to(koboldai_vars.gpu_device)
        else:
-            model = model.to('cpu').float()
+            model = model.to('cpu')
        generator = model.generate
        return

@@ -1172,7 +1172,6 @@ def move_model_to_devices(model):
        generator = model.generate
        return

-    model.half()
    gc.collect()

    if(hasattr(model, "transformer")):
@@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                                nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
                                #print(f"Transferring <{key}>  to  {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                                model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
-                                if model_dict[key].dtype is torch.float32:
-                                    koboldai_vars.fp32_model = True
-                                if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
-                                    model_dict[key] = model_dict[key].to(torch.float16)
+                                # if model_dict[key].dtype is torch.float32:
+                                #     koboldai_vars.fp32_model = True
+                                # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
+                                #     model_dict[key] = model_dict[key].to(torch.float16)
                                if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16):
                                    model_dict[key] = model_dict[key].to(torch.float32)
                                if device == "shared":
@@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                                if utils.offload_index:
                                    for name, tensor in utils.named_buffers:
                                        dtype = tensor.dtype
-                                        if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
-                                            dtype = torch.float16
-                                        if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
-                                            dtype = torch.float32
-                                        if name in model_dict and model_dict[name].dtype is not dtype:
-                                            model_dict[name] = model_dict[name].to(dtype)
-                                        if tensor.dtype is not dtype:
-                                            tensor = tensor.to(dtype)
-                                        if name not in utils.offload_index:
-                                            accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
+                                        # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
+                                        #     dtype = torch.float16
+                                        # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
+                                        #     dtype = torch.float32
+                                        # if name in model_dict and model_dict[name].dtype is not dtype:
+                                        #     model_dict[name] = model_dict[name].to(dtype)
+                                        # if tensor.dtype is not dtype:
+                                        #     tensor = tensor.to(dtype)
+                                        # if name not in utils.offload_index:
+                                        #     accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
                                    accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
                                utils.bar.close()
                                utils.bar = None
@@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                koboldai_vars.modeldim = get_hidden_size_from_model(model)
                # Is CUDA available? If so, use GPU, otherwise fall back to CPU
                if(koboldai_vars.hascuda and koboldai_vars.usegpu):
-                    model = model.half().to(koboldai_vars.gpu_device)
+                    model = model.to(koboldai_vars.gpu_device)
                    generator = model.generate
                else:
-                    model = model.to('cpu').float()
+                    model = model.to('cpu')
                    generator = model.generate
                patch_causallm(model)
            # Use the Generic implementation
@@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                        #             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                        try:
                            # model     = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
-                            model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4)
+                            model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
                        except Exception as e:
                            if("out of memory" in traceback.format_exc().lower()):
                                raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
@@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                            import shutil
                            tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
                            if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)):  # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case
-                                model = model.half()
                                model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
                            else:  # For fp16 models, we can just copy the model files directly
                                import transformers.configuration_utils
@@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                if(koboldai_vars.hascuda):
                    if(koboldai_vars.usegpu):
                        koboldai_vars.modeldim = get_hidden_size_from_model(model)
-                        model = model.half().to(koboldai_vars.gpu_device)
+                        model = model.to(koboldai_vars.gpu_device)
                        generator = model.generate
                    elif(koboldai_vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
                        koboldai_vars.modeldim = get_hidden_size_from_model(model)
@@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                        koboldai_vars.modeldim = get_hidden_size_from_model(model)
                        generator = model.generate
                    else:
-                        model = model.to('cpu').float()
+                        model = model.to('cpu')
                        koboldai_vars.modeldim = get_hidden_size_from_model(model)
                        generator = model.generate
                elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0):
@@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                    koboldai_vars.modeldim = get_hidden_size_from_model(model)
                    generator = model.generate
                else:
-                    model.to('cpu').float()
+                    model.to('cpu')
                    koboldai_vars.modeldim = get_hidden_size_from_model(model)
                    generator = model.generate