Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
More undo of 8 bit
Changed files: aiserver.py (18)
--- a/aiserver.py
+++ b/aiserver.py
@@ -2461,10 +2461,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     global tokenizer
     koboldai_vars.aibusy = True
     koboldai_vars.horde_share = False
-    if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
-        use_8_bit = False
-    if use_8_bit:
-        koboldai_vars.lazy_load = False
     if(initial_load):
         use_breakmodel_args = True
     reset_model_settings()
@@ -2901,11 +2897,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
-                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
+                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
+                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
             elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
                 try:
                     tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
@@ -2918,11 +2914,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
-                    model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
+                    model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                    model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
+                    model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
             else:
                 old_rebuild_tensor = torch._utils._rebuild_tensor
                 def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride):
@@ -2948,18 +2944,18 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
-                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
+                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
+                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)

                 torch._utils._rebuild_tensor = old_rebuild_tensor

                 if not args.colab or args.savemodel:
                     import shutil
                     tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
-                    if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16
+                    if(koboldai_vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16
                         model = model.half()
                         model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
                     else: # For fp16 models, we can just copy the model files directly
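The kwargs removed above, load_in_8bit=use_8_bit and device_map="auto", come from Hugging Face transformers' bitsandbytes-backed 8-bit loading, which the earlier 8-bit experiment threaded through from_pretrained. A minimal sketch of that pattern, for reference only and not KoboldAI's code: it assumes a transformers release with accelerate and bitsandbytes installed, and the model id and use_8_bit flag below are illustrative stand-ins.

# Sketch of the 8-bit loading path this commit backs out (assumptions: a
# recent transformers release with accelerate and bitsandbytes installed;
# model_path and use_8_bit are illustrative, not values from the diff).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "EleutherAI/gpt-neo-1.3B"   # placeholder model id
use_8_bit = False                        # hunk 1 removed the guard that forced this off

tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="cache")
if use_8_bit:
    # Quantized load: weights are stored as int8 and dispatched across devices.
    model = AutoModelForCausalLM.from_pretrained(
        model_path, cache_dir="cache", load_in_8bit=True, device_map="auto"
    )
else:
    # Plain load, matching the code path that remains after this commit.
    model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir="cache")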