Polishing and Optimizations

Multiple things have changed. For now, models default to half mode, even on the official transformers, to make sure it's as efficient on the GPU as finetune's. GPU selection is streamlined, and cache files are now stored inside the KoboldAI folder (for the most part). A new command-line parameter to force the models to run at their full size still needs to be added for the few users who would want a quality bump at the cost of RAM.
This commit is contained in:
henk717
2021-11-18 00:06:57 +01:00
parent 27ee45b9cc
commit b25c54cf91
2 changed files with 30 additions and 23 deletions

6
.gitignore vendored
View File

@@ -1,16 +1,18 @@
# Ignore client settings file
client.settings
settings/*
# Ignore story files except for sample_story.json
stories/*
settings/*
!stories/sample_story.json
# Ignore stuff that would pollute our Git
/.project
*.bak
miniconda3/*
*.settings
__pycache__
*.log
cache/*
# Ignore PyCharm project files.
.idea

View File

@@ -278,10 +278,17 @@ def device_config(model):
if(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (-1, model.config.num_layers)):
vars.breakmodel = False
vars.usegpu = True
model = model.to(len(breakmodel.gpu_blocks)-1)
model = model.half().to(len(breakmodel.gpu_blocks)-1)
generator = model.generate
return
if(not breakmodel.gpu_blocks):
print("Nothing assigned to a GPU, reverting to CPU only mode")
vars.breakmodel = False
vars.usegpu = False
model = model.half().to('cpu')
generator = model.generate
return
model.half().to('cpu')
gc.collect()
model.transformer.wte.to(breakmodel.primary_device)
@@ -364,16 +371,14 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
vars.breakmodel = False
elif(vars.hascuda):
if(vars.bmsupported):
print(colors.YELLOW + "You're using a model that supports hybrid generation!")
print("This feature allows you to split the model between the CPU and GPU(s)")
print("(slower than GPU-only but uses less VRAM) or between multiple GPUs")
print("(allowing you to use the combined VRAM of all your GPUs).")
print("Currently only GPT-Neo and GPT-J models support this feature.")
print("{0}Use hybrid generation or CPU-only generation?: (Default hybrid){1}".format(colors.CYAN, colors.END))
print(f" 1 - Hybrid generation\n 2 - CPU\n")
genselected = True
vars.usegpu = False
vars.breakmodel = True
else:
print(" 1 - GPU\n 2 - CPU\n")
genselected = False
else:
genselected = False
if(vars.hascuda):
while(genselected == False):
@@ -619,15 +624,15 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
model_config = open(vars.custmodpth + "/config.json", "r")
js = json.load(model_config)
if("model_type" in js):
model = AutoModelForCausalLM.from_pretrained(vars.custmodpth)
model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/")
else:
model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth)
model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/")
vars.modeldim = get_hidden_size_from_model(model)
tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth)
tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/")
# Is CUDA available? If so, use GPU, otherwise fall back to CPU
if(vars.hascuda):
if(vars.usegpu):
model = model.to(0)
model = model.half().to(0)
generator = model.generate
elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel)
device_config(model)
@@ -639,35 +644,35 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
elif(vars.model == "GPT2Custom"):
model_config = open(vars.custmodpth + "/config.json", "r")
js = json.load(model_config)
model = GPT2LMHeadModel.from_pretrained(vars.custmodpth)
tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth)
model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/")
tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/")
vars.modeldim = get_hidden_size_from_model(model)
# Is CUDA available? If so, use GPU, otherwise fall back to CPU
if(vars.hascuda and vars.usegpu):
model = model.to(0)
model = model.half().to(0)
generator = model.generate
else:
generator = model.generate
# If base HuggingFace model was chosen
else:
# Is CUDA available? If so, use GPU, otherwise fall back to CPU
tokenizer = GPT2Tokenizer.from_pretrained(vars.model)
tokenizer = GPT2Tokenizer.from_pretrained(vars.model, cache_dir="cache/")
if(vars.hascuda):
if(vars.usegpu):
model = AutoModelForCausalLM.from_pretrained(vars.model)
model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
vars.modeldim = get_hidden_size_from_model(model)
model = model.to(0)
model = model.half().to(0)
generator = model.generate
elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel)
model = AutoModelForCausalLM.from_pretrained(vars.model)
model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
vars.modeldim = get_hidden_size_from_model(model)
device_config(model)
else:
model = AutoModelForCausalLM.from_pretrained(vars.model)
model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
vars.modeldim = get_hidden_size_from_model(model)
generator = model.generate
else:
model = AutoModelForCausalLM.from_pretrained(vars.model)
model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
vars.modeldim = get_hidden_size_from_model(model)
generator = model.generate