Reverting farther back

author ebolam
date 2022-12-01 18:02:05 -05:00
parent e1f6de1250
commit b28d8e76dc


@@ -2461,16 +2461,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     global tokenizer
     koboldai_vars.aibusy = True
     koboldai_vars.horde_share = False
-    reset_model_settings()
     if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
         use_8_bit = False
     if use_8_bit:
         koboldai_vars.lazy_load = False
-        koboldai_vars.breakmodel = False
-    logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
     if(initial_load):
         use_breakmodel_args = True
+    reset_model_settings()
     if not utils.HAS_ACCELERATE:
         disk_layers = None
     koboldai_vars.reset_model()
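This hunk keeps the guard that forces use_8_bit off when 8-bit support or experimental features are unavailable, drops the 8-bit-specific breakmodel override and the debug logging, and moves reset_model_settings() back below the initial_load check. Downstream the flag still reaches the from_pretrained calls as load_in_8bit (see the later hunks). A minimal standalone sketch of that flag flow, not KoboldAI's actual code: the model id and the bare boolean arguments are hypothetical stand-ins for koboldai_vars, and it assumes transformers with bitsandbytes and accelerate installed.

from transformers import AutoModelForCausalLM, AutoTokenizer

def load_8bit_capable(model_id, use_8_bit, bit_8_available, experimental_features):
    # Same guard this hunk keeps: never request 8-bit when it is unsupported
    # or experimental features are turned off.
    if not bit_8_available or not experimental_features:
        use_8_bit = False
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="cache")
    # load_in_8bit requires bitsandbytes; device_map="auto" requires accelerate.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir="cache",
        load_in_8bit=use_8_bit,
        device_map="auto",
    )
    return tokenizer, model

# Hypothetical usage:
# tokenizer, model = load_8bit_capable("EleutherAI/gpt-neo-1.3B",
#                                      use_8_bit=True, bit_8_available=True,
#                                      experimental_features=True)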
@@ -2632,7 +2629,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     elif koboldai_vars.hascuda:
         if(koboldai_vars.bmsupported):
             koboldai_vars.usegpu = False
-            koboldai_vars.breakmodel = True if not use_8_bit else False
+            koboldai_vars.breakmodel = True
         else:
             koboldai_vars.breakmodel = False
             koboldai_vars.usegpu = use_gpu
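With the ternary reverted, breakmodel (KoboldAI's layer-by-layer GPU/CPU split) is enabled whenever CUDA is present and the model supports it, regardless of 8-bit mode; the removed "if not use_8_bit" clause presumably existed because device_map="auto" already handles placement for 8-bit loads. A minimal sketch of the placement decision as a pure function follows; the function name is hypothetical, the attribute names mirror the diff, and the no-CUDA branch is an assumption since this hunk only shows the CUDA path.

def pick_device_strategy(hascuda, bmsupported, use_gpu):
    """Return (usegpu, breakmodel) following the post-revert logic of this hunk."""
    if hascuda:
        if bmsupported:
            # Breakmodel wins over single-GPU execution when it is supported.
            return False, True
        # No breakmodel support: run fully on GPU only if the caller asked for it.
        return use_gpu, False
    # Not covered by this hunk; assumed CPU-only default.
    return False, False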
@@ -2680,7 +2677,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     # Lazy loader
     import torch_lazy_loader
     def get_lazy_load_callback(n_layers, convert_to_float16=True):
-        logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
         if not koboldai_vars.lazy_load:
             return
@@ -2922,17 +2918,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         except Exception as e:
             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
         try:
-            logger.info("Using 8 bit: {}".format(use_8_bit))
             model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
         except Exception as e:
             if("out of memory" in traceback.format_exc().lower()):
                 raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
             model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
     else:
-        try:
-            torch._utils._rebuild_tensor = old_rebuild_tensor
-        except:
-            pass
         old_rebuild_tensor = torch._utils._rebuild_tensor
         def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride):
             if(not isinstance(storage, torch_lazy_loader.LazyTensor)):
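The else branch installs a wrapper around torch's private _rebuild_tensor helper so lazily-loaded storages can be intercepted; the removed try/except only restored the original function ahead of time, which is redundant as long as the unconditional restore after loading (last hunk) runs. A minimal sketch of the save/patch/restore pattern in isolation follows: the wrapper is a no-op stand-in, not the project's new_rebuild_tensor, and it relies on a private torch API.

import torch

# Keep a handle to the original private helper so it can be restored later.
old_rebuild_tensor = torch._utils._rebuild_tensor

def wrapped_rebuild_tensor(storage, storage_offset, shape, stride):
    # A real wrapper would special-case lazy storages here; this stand-in just
    # forwards to the original implementation.
    return old_rebuild_tensor(storage, storage_offset, shape, stride)

torch._utils._rebuild_tensor = wrapped_rebuild_tensor
try:
    pass  # torch.load(...) / from_pretrained(...) would run with the patch active
finally:
    # Always undo the monkey-patch, mirroring the unconditional restore
    # that load_model performs after the model is loaded.
    torch._utils._rebuild_tensor = old_rebuild_tensor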
@@ -2957,7 +2948,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         except Exception as e:
             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
         try:
-            logger.info("Using 8 bit: {}".format(use_8_bit))
             model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
         except Exception as e:
             if("out of memory" in traceback.format_exc().lower()):
@@ -2966,7 +2956,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     torch._utils._rebuild_tensor = old_rebuild_tensor
-    if (not args.colab or args.savemodel) and not use_8_bit:
+    if not args.colab or args.savemodel:
         import shutil
         tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
         if koboldai_vars.fp32_model:  # Use save_pretrained to convert fp32 models to fp16
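Dropping "and not use_8_bit" means the post-load save path again runs whenever KoboldAI is outside Colab or --savemodel is passed, 8-bit or not. The inline comment explains the point of the block: save_pretrained rewrites fp32 checkpoints as fp16. A minimal sketch of that conversion on its own, with a hypothetical model id and target directory, assuming a plain (non-8-bit) transformers model:

import os
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "EleutherAI/gpt-neo-1.3B"  # hypothetical example model
target = "models/" + model_id.replace("/", "_")
os.makedirs(target, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="cache")

tokenizer.save_pretrained(target)
# Casting to half precision before save_pretrained is one way to turn an
# fp32 checkpoint into an fp16 one, as the inline comment describes.
model.half().save_pretrained(target)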