It just works.

catboxanon
2023-03-10 00:01:40 -05:00
committed by GitHub
parent 8c9ed55406
commit dcf9d37a00


@@ -1142,9 +1142,9 @@ def move_model_to_devices(model):
     if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel):
         if(koboldai_vars.usegpu):
-            model = model.half().to(koboldai_vars.gpu_device)
+            model = model.to(koboldai_vars.gpu_device)
         else:
-            model = model.to('cpu').float()
+            model = model.to('cpu')
         generator = model.generate
         return
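
For context: the `.half()` / `.float()` casts are dropped here, presumably so a model already packed by `load_quant` keeps the dtypes it was quantized with. A minimal standalone sketch of the resulting move logic (the helper name and arguments are illustrative, not part of this commit):

```python
import torch

def move_to_device(model, gpu_device=0, use_gpu=True):
    # Illustrative helper mirroring the simplified logic above: move the
    # module as-is, without forcing a dtype, so pre-quantized weights and
    # their packed buffers are not re-cast by .half() or .float().
    if use_gpu and torch.cuda.is_available():
        return model.to(gpu_device)   # no .half() before the move
    return model.to('cpu')            # no .float() on the CPU path
```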
@@ -1172,7 +1172,6 @@ def move_model_to_devices(model):
         generator = model.generate
         return
-    model.half()
     gc.collect()
     if(hasattr(model, "transformer")):
@@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
                 #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                 model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
-                if model_dict[key].dtype is torch.float32:
-                    koboldai_vars.fp32_model = True
-                if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
-                    model_dict[key] = model_dict[key].to(torch.float16)
+                # if model_dict[key].dtype is torch.float32:
+                #     koboldai_vars.fp32_model = True
+                # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
+                #     model_dict[key] = model_dict[key].to(torch.float16)
                 if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16):
                     model_dict[key] = model_dict[key].to(torch.float32)
                 if device == "shared":
@@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             if utils.offload_index:
                 for name, tensor in utils.named_buffers:
                     dtype = tensor.dtype
-                    if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
-                        dtype = torch.float16
-                    if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
-                        dtype = torch.float32
-                    if name in model_dict and model_dict[name].dtype is not dtype:
-                        model_dict[name] = model_dict[name].to(dtype)
-                    if tensor.dtype is not dtype:
-                        tensor = tensor.to(dtype)
-                    if name not in utils.offload_index:
-                        accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
+                    # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
+                    #     dtype = torch.float16
+                    # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
+                    #     dtype = torch.float32
+                    # if name in model_dict and model_dict[name].dtype is not dtype:
+                    #     model_dict[name] = model_dict[name].to(dtype)
+                    # if tensor.dtype is not dtype:
+                    #     tensor = tensor.to(dtype)
+                    # if name not in utils.offload_index:
+                    #     accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
                 accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
             utils.bar.close()
             utils.bar = None
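
The call that survives this hunk, `accelerate.utils.save_offload_index`, still writes the disk-cache index; only the per-buffer dtype fix-ups above it are disabled. A minimal sketch of how that accelerate API pairs up in isolation (the folder and tensor name are illustrative):

```python
import os
import torch
import accelerate.utils

os.makedirs("accelerate-disk-cache", exist_ok=True)
offload_index = {}
buffer = torch.zeros(4, 4, dtype=torch.float16)  # stand-in for a model buffer

# offload_weight() dumps the tensor into the folder and records its
# name/shape/dtype in the index; save_offload_index() then writes the
# index file so accelerate can reload the offloaded weights lazily.
accelerate.utils.offload_weight(buffer, "transformer.h.0.attn.bias", "accelerate-disk-cache", index=offload_index)
accelerate.utils.save_offload_index(offload_index, "accelerate-disk-cache")
```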
@@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             # Is CUDA available? If so, use GPU, otherwise fall back to CPU
             if(koboldai_vars.hascuda and koboldai_vars.usegpu):
-                model = model.half().to(koboldai_vars.gpu_device)
+                model = model.to(koboldai_vars.gpu_device)
                 generator = model.generate
             else:
-                model = model.to('cpu').float()
+                model = model.to('cpu')
                 generator = model.generate
             patch_causallm(model)
         # Use the Generic implementation
@@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
                     # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
-                    model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4)
+                    model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
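
The quantized checkpoint path now comes from a generic `LLAMA_4BIT` environment variable rather than the 30B-specific `LLAMA_30B_4BIT`. A hedged usage sketch; the checkpoint path is an assumption, not something this commit sets:

```python
import os

# Illustrative only: point LLAMA_4BIT at the GPTQ-quantized checkpoint file
# before load_model() runs; the actual path depends on your setup.
os.environ['LLAMA_4BIT'] = '/path/to/llama-4bit.pt'

# load_model() then resolves it exactly as in the hunk above:
#   model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
```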
@@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 import shutil
                 tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
                 if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case
-                    model = model.half()
                     model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
                 else: # For fp16 models, we can just copy the model files directly
                     import transformers.configuration_utils
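
With the `model = model.half()` line removed, `save_pretrained` now writes the model in whatever dtype it already carries; the 500 MiB sharding is unchanged. A minimal sketch of that transformers call in isolation (the local model directory is illustrative):

```python
from transformers import AutoModelForCausalLM

# Illustrative: load a local model and write it back in 500 MiB shards,
# mirroring the save path kept in the hunk above (no .half() cast first).
model = AutoModelForCausalLM.from_pretrained("models/my-local-model")
model.save_pretrained("models/my-local-model", max_shard_size="500MiB")
```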
@@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         if(koboldai_vars.hascuda):
             if(koboldai_vars.usegpu):
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
-                model = model.half().to(koboldai_vars.gpu_device)
+                model = model.to(koboldai_vars.gpu_device)
                 generator = model.generate
             elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel)
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
@@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
             else:
-                model = model.to('cpu').float()
+                model = model.to('cpu')
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
         elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0):
@@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate
         else:
-            model.to('cpu').float()
+            model.to('cpu')
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate