Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-01-26 15:05:05 +01:00)
Fix an error that occurs when all layers are on second GPU
commit 19d2356253
parent 5e3e3f3578

aiserver.py: 12 changed lines (7 additions, 5 deletions)
@@ -122,6 +122,7 @@ class vars:
     widepth = 3 # How many historical actions to scan for WI hits
     mode = "play" # Whether the interface is in play, memory, or edit mode
     editln = 0 # Which line was last selected in Edit Mode
+    gpu_device = 0 # Which PyTorch device to use when using pure GPU generation
     url = "https://api.inferkit.com/v1/models/standard/generate" # InferKit API URL
     oaiurl = "" # OpenAI API URL
     oaiengines = "https://api.openai.com/v1/engines"
@@ -311,7 +312,8 @@ def device_config(model):
     if(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (-1, model.config.num_layers if hasattr(model.config, "num_layers") else model.config.n_layer)):
         vars.breakmodel = False
         vars.usegpu = True
-        model = model.half().to(len(breakmodel.gpu_blocks)-1)
+        vars.gpu_device = len(breakmodel.gpu_blocks)-1
+        model = model.half().to(vars.gpu_device)
         generator = model.generate
         return

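Note on the device_config() hunk above: when the last entry in breakmodel.gpu_blocks already covers every layer, the routine abandons breakmodel, switches to pure GPU generation, and now records the chosen card in vars.gpu_device instead of only passing the index to .to(). A minimal sketch of that selection logic, using hypothetical names (pick_device, num_layers) rather than the actual aiserver.py code:

import torch

def pick_device(gpu_blocks, num_layers):
    """Return the CUDA index for pure-GPU generation, or None to keep breakmodel."""
    # A last entry of -1 means "all remaining layers on the last listed GPU".
    if len(gpu_blocks) and gpu_blocks[-1] in (-1, num_layers):
        return len(gpu_blocks) - 1  # e.g. [0, 28] -> index 1, the second GPU
    return None

device = pick_device([0, 28], 28)  # -> 1, matching the commit title's scenario
if device is not None and torch.cuda.device_count() > device:
    model = torch.nn.Linear(8, 8).half().to(device)  # same move as the patched line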
@@ -822,7 +824,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
     # Is CUDA available? If so, use GPU, otherwise fall back to CPU
     if(vars.hascuda):
         if(vars.usegpu):
-            model = model.half().to(0)
+            model = model.half().to(vars.gpu_device)
             generator = model.generate
         elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
             device_config(model)
@@ -842,7 +844,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
         vars.modeldim = get_hidden_size_from_model(model)
         # Is CUDA available? If so, use GPU, otherwise fall back to CPU
         if(vars.hascuda and vars.usegpu):
-            model = model.half().to(0)
+            model = model.half().to(vars.gpu_device)
             generator = model.generate
         else:
             model = model.to('cpu').float()
@@ -856,7 +858,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
         with(maybe_use_float16()):
             model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
         vars.modeldim = get_hidden_size_from_model(model)
-        model = model.half().to(0)
+        model = model.half().to(vars.gpu_device)
         generator = model.generate
     elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
         with(maybe_use_float16()):
@@ -2186,7 +2188,7 @@ def _generate(txt, minimum, maximum, found_entries):
         gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1)

     if(vars.hascuda and vars.usegpu):
-        gen_in = gen_in.to(0)
+        gen_in = gen_in.to(vars.gpu_device)
     elif(vars.hascuda and vars.breakmodel):
         gen_in = gen_in.to(breakmodel.primary_device)
     else:
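The remaining hunks make one repeated substitution: each hard-coded .to(0) becomes .to(vars.gpu_device), so the half-precision weights and the generation input end up on the same CUDA device. A hedged illustration of the mismatch this avoids, with made-up helper and variable names (generate_on, emb) rather than anything from aiserver.py:

import torch

def generate_on(model, tokens, device_index):
    # Moving the module and its input to the same device index avoids the
    # "Expected all tensors to be on the same device" RuntimeError that a
    # hard-coded .to(0) can trigger once the weights actually live on cuda:1.
    model = model.to(device_index)
    tokens = tokens.to(device_index)
    with torch.no_grad():
        return model(tokens)

if torch.cuda.device_count() > 1:
    emb = torch.nn.Embedding(10, 4)
    tokens = torch.tensor([[1, 2, 3]])
    out = generate_on(emb, tokens, 1)  # index 1 plays the role of vars.gpu_device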