Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-02-21 22:17:43 +01:00)
Don't use fallback loading if we run out of memory during loading
This commit is contained in:
  parent  fb2b6f1026
  commit  96d3d397ab

Changed file: aiserver.py (14 lines changed)
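The pattern applied throughout this diff wraps each from_pretrained call so that a GPU out-of-memory error aborts loading with a clear RuntimeError instead of silently retrying with the GPTNeoForCausalLM fallback loader. A minimal standalone sketch of that idea, assuming a hypothetical helper name and placeholder arguments (this is not code from aiserver.py):

    import traceback

    from transformers import AutoModelForCausalLM, GPTNeoForCausalLM

    def load_with_oom_guard(model_path, **kwargs):
        # Hypothetical helper: try the generic loader, but never fall back on OOM.
        try:
            # Primary attempt with the auto-detected architecture.
            return AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
        except Exception:
            # If a GPU ran out of memory, falling back would only fail again
            # (or hide the real problem), so surface it to the user instead.
            if "out of memory" in traceback.format_exc().lower():
                raise RuntimeError(
                    "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                )
            # Any other failure still goes through the GPT-Neo fallback loader.
            return GPTNeoForCausalLM.from_pretrained(model_path, **kwargs)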
@@ -35,6 +35,7 @@ import bleach
 import itertools
 import bisect
 import functools
+import traceback
 from collections.abc import Iterable
 from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List
 
@@ -1763,7 +1764,12 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
     model_config = open(vars.custmodpth + "/config.json", "r")
     js = json.load(model_config)
     with(maybe_use_float16()):
-        model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+        try:
+            model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            if("out of memory" in traceback.format_exc().lower()):
+                raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
+            raise e
     tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
     vars.modeldim = get_hidden_size_from_model(model)
     # Is CUDA available? If so, use GPU, otherwise fall back to CPU
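Note that this GPT-2 branch has no alternate loader, so a non-OOM failure is simply re-raised via raise e. The hunks below instead insert the same out-of-memory check in front of the existing GPTNeoForCausalLM fallback, so only non-OOM failures still trigger the fallback load.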
@@ -1808,6 +1814,8 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         try:
             model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem)
         except Exception as e:
+            if("out of memory" in traceback.format_exc().lower()):
+                raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
             model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem)
     elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
         try:
@@ -1820,6 +1828,8 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         try:
             model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem)
         except Exception as e:
+            if("out of memory" in traceback.format_exc().lower()):
+                raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
             model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem)
     else:
         old_rebuild_tensor = torch._utils._rebuild_tensor
@@ -1845,6 +1855,8 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         try:
             model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem)
         except Exception as e:
+            if("out of memory" in traceback.format_exc().lower()):
+                raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
             model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem)
 
         torch._utils._rebuild_tensor = old_rebuild_tensor