Polishing and Optimizations

Multiple things have changed. Models now default to half precision mode even on the official transformers package, so that they run as efficiently on the GPU as they do with finetune's fork. GPU selection is streamlined, and cache files are now stored inside the KoboldAI folder (for the most part). A command line parameter that forces models to run at their full size still needs to be added for the few users who want a quality bump at the cost of RAM.
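That still-missing switch might look roughly like the sketch below; the --nohalf flag name and the argparse wiring are assumptions for illustration only, not something this commit adds.

# Hypothetical sketch: the full-precision switch is explicitly left as future work.
# The flag name "--nohalf" and this argparse wiring are assumptions, not part of the commit.
import argparse

parser = argparse.ArgumentParser(description="KoboldAI server")
parser.add_argument("--nohalf", action="store_true",
                    help="load models in full float32 precision instead of the new float16 default")
args = parser.parse_args()

# Wherever the model is moved to the GPU, the default would then become:
# model = model.to(0) if args.nohalf else model.half().to(0)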
henk717 2021-11-18 00:06:57 +01:00
parent 27ee45b9cc
commit b25c54cf91
2 changed files with 30 additions and 23 deletions

.gitignore

@@ -1,16 +1,18 @@
 # Ignore client settings file
-client.settings
+settings/*
 # Ignore stories file except for test_story
 stories/*
-settings/*
 !stories/sample_story.json
+# Ignore stuff that would polute our Git
 /.project
 *.bak
 miniconda3/*
 *.settings
 __pycache__
 *.log
+cache/*
 # Ignore PyCharm project files.
 .idea

aiserver.py

@@ -278,10 +278,17 @@ def device_config(model):
     if(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (-1, model.config.num_layers)):
         vars.breakmodel = False
         vars.usegpu = True
-        model = model.to(len(breakmodel.gpu_blocks)-1)
+        model = model.half().to(len(breakmodel.gpu_blocks)-1)
         generator = model.generate
         return
+    if(not breakmodel.gpu_blocks):
+        print("Nothing assigned to a GPU, reverting to CPU only mode")
+        vars.breakmodel = False
+        vars.usegpu = False
+        model = model.half().to('cpu')
+        generator = model.generate
+        return
     model.half().to('cpu')
     gc.collect()
     model.transformer.wte.to(breakmodel.primary_device)
@@ -364,16 +371,14 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
         vars.breakmodel = False
     elif(vars.hascuda):
         if(vars.bmsupported):
-            print(colors.YELLOW + "You're using a model that supports hybrid generation!")
-            print("This feature allows you to split the model between the CPU and GPU(s)")
-            print("(slower than GPU-only but uses less VRAM) or between multiple GPUs")
-            print("(allowing you to use the combined VRAM of all your GPUs).")
-            print("Currently only GPT-Neo and GPT-J models support this feature.")
-            print("{0}Use hybrid generation or CPU-only generation?: (Default hybrid){1}".format(colors.CYAN, colors.END))
-            print(f" 1 - Hybrid generation\n 2 - CPU\n")
+            genselected = True
+            vars.usegpu = False
+            vars.breakmodel = True
         else:
             print(" 1 - GPU\n 2 - CPU\n")
             genselected = False
+    else:
+        genselected = False

     if(vars.hascuda):
         while(genselected == False):
@@ -619,15 +624,15 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
         model_config = open(vars.custmodpth + "/config.json", "r")
         js = json.load(model_config)
         if("model_type" in js):
-            model = AutoModelForCausalLM.from_pretrained(vars.custmodpth)
+            model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/")
         else:
-            model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth)
+            model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/")
         vars.modeldim = get_hidden_size_from_model(model)
-        tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth)
+        tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/")
         # Is CUDA available? If so, use GPU, otherwise fall back to CPU
         if(vars.hascuda):
             if(vars.usegpu):
-                model = model.to(0)
+                model = model.half().to(0)
                 generator = model.generate
             elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel)
                 device_config(model)
@@ -639,35 +644,35 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
     elif(vars.model == "GPT2Custom"):
         model_config = open(vars.custmodpth + "/config.json", "r")
         js = json.load(model_config)
-        model = GPT2LMHeadModel.from_pretrained(vars.custmodpth)
-        tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth)
+        model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/")
+        tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/")
         vars.modeldim = get_hidden_size_from_model(model)
         # Is CUDA available? If so, use GPU, otherwise fall back to CPU
         if(vars.hascuda and vars.usegpu):
-            model = model.to(0)
+            model = model.half().to(0)
             generator = model.generate
         else:
             generator = model.generate
     # If base HuggingFace model was chosen
     else:
         # Is CUDA available? If so, use GPU, otherwise fall back to CPU
-        tokenizer = GPT2Tokenizer.from_pretrained(vars.model)
+        tokenizer = GPT2Tokenizer.from_pretrained(vars.model, cache_dir="cache/")
         if(vars.hascuda):
             if(vars.usegpu):
-                model = AutoModelForCausalLM.from_pretrained(vars.model)
+                model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
                 vars.modeldim = get_hidden_size_from_model(model)
-                model = model.to(0)
+                model = model.half().to(0)
                 generator = model.generate
             elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel)
-                model = AutoModelForCausalLM.from_pretrained(vars.model)
+                model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
                 vars.modeldim = get_hidden_size_from_model(model)
                 device_config(model)
             else:
-                model = AutoModelForCausalLM.from_pretrained(vars.model)
+                model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
                 vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
         else:
-            model = AutoModelForCausalLM.from_pretrained(vars.model)
+            model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
             vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate
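
Taken together, the aiserver.py changes converge on one loading pattern: download through a local cache directory, cast to half precision, then move the weights to the GPU. A minimal standalone sketch of that pattern follows; the model name and device index are placeholders, not values taken from the commit.

# Minimal sketch of the loading pattern this commit standardizes on.
# "EleutherAI/gpt-neo-2.7B" and device index 0 are placeholder assumptions.
from transformers import AutoModelForCausalLM, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B", cache_dir="cache/")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B", cache_dir="cache/")

model = model.half().to(0)  # float16 halves VRAM use; .to(0) places the weights on the first GPU
generator = model.generate  # keep a direct reference to the generate method, as aiserver.py does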