New download machanism

Automatically converts Huggingface cache models to full models on (down)load. WARNING: Does wipe old cache/ dir inside the KoboldAI folder, make a backup before you run these models if you are bandwith constraint.
2025-06-05 21:59:24 +02:00 · 2021-12-16 01:40:04 +01:00
parent 5e3e3f3578
commit e3d9c2d690
1 changed files with 18 additions and 7 deletions
--- a/aiserver.py
+++ b/aiserver.py
@@ -850,25 +850,36 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
        # If base HuggingFace model was chosen
        else:
            # Is CUDA available? If so, use GPU, otherwise fall back to CPU
            if(os.path.isdir(vars.model.replace('/', '_'))):
               with(maybe_use_float16()):
                   tokenizer = GPT2TokenizerFast.from_pretrained(vars.model.replace('/', '_'), cache_dir="cache/")
                   model = AutoModelForCausalLM.from_pretrained(vars.model.replace('/', '_'), cache_dir="cache/", **maybe_low_cpu_mem_usage())
            else:
                print("Model does not exist locally, attempting to download from Huggingface...")
                tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, cache_dir="cache/")
            if(vars.hascuda):
                if(vars.usegpu):
                with(maybe_use_float16()):
                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                model = model.half()
                import shutil
                shutil.rmtree("cache/")
                model.save_pretrained(vars.model.replace('/', '_'))
                tokenizer.save_pretrained(vars.model.replace('/', '_'))
            if(vars.hascuda):
                if(vars.usegpu):
                    vars.modeldim = get_hidden_size_from_model(model)
                    model = model.half().to(0)
                    generator = model.generate
                elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
                    with(maybe_use_float16()):
                        model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                    vars.modeldim = get_hidden_size_from_model(model)
                    device_config(model)
                else:
-                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
+                    model = model.to('cpu').float()
                    vars.modeldim = get_hidden_size_from_model(model)
                    generator = model.generate
            else:
-                model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
+                model.to('cpu').float()
                vars.modeldim = get_hidden_size_from_model(model)
                generator = model.generate