Load model directly in fp16 if using GPU or breakmodel
This commit is contained in:
parent
95aff61781
commit
a93a76eb01
15
aiserver.py
15
aiserver.py
|
@ -15,6 +15,7 @@ import json
|
|||
import collections
|
||||
import zipfile
|
||||
import packaging
|
||||
import contextlib
|
||||
from typing import Any, Union, Dict, Set, List
|
||||
|
||||
import requests
|
||||
|
@ -710,10 +711,21 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
|
|||
return {}
|
||||
return {"low_cpu_mem_usage": True}
|
||||
|
||||
@contextlib.contextmanager
def maybe_use_float16(always_use=False):
    """Temporarily make float16 the default torch dtype.

    Intended to wrap model construction so that tensors created inside the
    ``with`` block default to half precision.

    The switch happens when ``always_use`` is true, or when ``vars.hascuda``
    is set together with either ``vars.usegpu`` or ``vars.breakmodel``.
    Otherwise the default dtype is left untouched.

    Args:
        always_use: Force the float16 default regardless of the ``vars``
            flags.

    Yields:
        bool: ``True`` if the default dtype was switched to float16 for the
        duration of the block, ``False`` if nothing was changed.
    """
    if not (always_use or (vars.hascuda and (vars.usegpu or vars.breakmodel))):
        yield False
        return

    original_dtype = torch.get_default_dtype()
    torch.set_default_dtype(torch.float16)
    try:
        yield True
    finally:
        # Restore the previous default even when the wrapped block raises;
        # otherwise a failed load would leave float16 as the process-wide
        # default for every tensor created afterwards.
        torch.set_default_dtype(original_dtype)
|
||||
|
||||
# If custom GPT Neo model was chosen
|
||||
if(vars.model == "NeoCustom"):
|
||||
model_config = open(vars.custmodpth + "/config.json", "r")
|
||||
js = json.load(model_config)
|
||||
with(maybe_use_float16()):
|
||||
if("model_type" in js):
|
||||
model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
|
||||
else:
|
||||
|
@ -735,6 +747,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
|
|||
elif(vars.model == "GPT2Custom"):
|
||||
model_config = open(vars.custmodpth + "/config.json", "r")
|
||||
js = json.load(model_config)
|
||||
with(maybe_use_float16()):
|
||||
model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
|
||||
tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
|
||||
vars.modeldim = get_hidden_size_from_model(model)
|
||||
|
@ -750,11 +763,13 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
|
|||
tokenizer = GPT2Tokenizer.from_pretrained(vars.model, cache_dir="cache/")
|
||||
if(vars.hascuda):
|
||||
if(vars.usegpu):
|
||||
with(maybe_use_float16()):
|
||||
model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
|
||||
vars.modeldim = get_hidden_size_from_model(model)
|
||||
model = model.half().to(0)
|
||||
generator = model.generate
|
||||
elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel)
|
||||
with(maybe_use_float16()):
|
||||
model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
|
||||
vars.modeldim = get_hidden_size_from_model(model)
|
||||
device_config(model)
|
||||
|
|
Loading…
Reference in New Issue