Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Kind of working breakmodel
@@ -30,7 +30,6 @@ class model_loader(HFTorchInferenceModel):

    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.allowsp = True
        self.lazy_load = utils.koboldai_vars.lazy_load

        # Make model path the same as the model name to make this consistent
        # with the other loading method if it isn't a known model type. This
@@ -69,12 +68,14 @@ class model_loader(HFTorchInferenceModel):

        # If we're using torch_lazy_loader, we need to get breakmodel config
        # early so that it knows where to load the individual model tensors
        logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmodel: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
        if (
            self.lazy_load
            and utils.koboldai_vars.hascuda
            and utils.koboldai_vars.breakmodel
            and not utils.koboldai_vars.nobreakmodel
            and self.breakmodel
            and not self.nobreakmodel
        ):
            logger.debug("loading breakmodel")
            self.breakmodel_device_config(self.model_config)

        if self.lazy_load:
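The hunk above gates the breakmodel setup (splitting layers across GPU/CPU/disk) on both the global koboldai_vars flags and the loader's own flags. A minimal sketch of that gating logic in isolation; the flag names are taken from the hunk, while should_split_layers is a hypothetical helper that is not part of the commit:

# Hypothetical helper mirroring the condition guarding breakmodel_device_config() above.
def should_split_layers(lazy_load: bool, hascuda: bool,
                        global_breakmodel: bool, global_nobreakmodel: bool,
                        loader_breakmodel: bool, loader_nobreakmodel: bool) -> bool:
    # Every enabling flag must be set and every disabling flag unset
    # before the device configuration step is worth running.
    return (lazy_load and hascuda
            and global_breakmodel and not global_nobreakmodel
            and loader_breakmodel and not loader_nobreakmodel)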
modeling/inference_models/gooseai.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import torch
import requests
import numpy as np
from typing import List, Optional, Union

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
)

from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader



class OpenAIAPIError(Exception):
    def __init__(self, error_type: str, error_message) -> None:
        super().__init__(f"{error_type}: {error_message}")


class model_loader(openai_gooseai_model_loader):
    """InferenceModel for interfacing with GooseAI's generation API."""

    def __init__(self):
        super().__init__()
        self.url = "https://api.goose.ai/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "GooseAI"
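With the request and engine-listing logic moved into the shared parent class (added later in this commit as modeling/inference_models/parents/openai_gooseai.py), a provider backend reduces to an endpoint URL plus a menu-name check. A sketch of how a further OpenAI-compatible provider could plug in the same way, following the repo's convention of naming the class model_loader; the provider name and URL are hypothetical placeholders, not part of the commit:

# Hypothetical third backend following the same pattern as gooseai.py.
from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader


class model_loader(openai_gooseai_model_loader):
    """InferenceModel for a hypothetical OpenAI-compatible provider."""

    def __init__(self):
        super().__init__()
        self.url = "https://example-provider.invalid/v1/engines"  # placeholder endpoint

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "ExampleProvider"  # placeholder menu name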
@@ -27,7 +27,7 @@ class model_loader(HFInferenceModel):
        #model_name: str,
    ) -> None:
        super().__init__()

        self.hf_torch = False
        self.model_config = None
        self.capabilties = ModelCapabilities(
            embedding_manipulation=False,
@@ -11,6 +11,8 @@ from modeling.inference_model import (
    InferenceModel,
)

from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader



class OpenAIAPIError(Exception):
@@ -18,172 +20,12 @@ class OpenAIAPIError(Exception):
        super().__init__(f"{error_type}: {error_message}")


class model_loader(InferenceModel):
class model_loader(openai_gooseai_model_loader):
    """InferenceModel for interfacing with OpenAI's generation API."""

    def __init__(self):
        super().__init__()
        self.key = ""
        self.url = "https://api.openai.com/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "OAI" or model_name == "GooseAI"

    def get_requested_parameters(self, model_name, model_path, menu_path):
        self.source = model_name
        requested_parameters = []
        requested_parameters.extend([{
            "uitype": "text",
            "unit": "text",
            "label": "Key",
            "id": "key",
            "default": "",
            "check": {"value": "", 'check': "!="},
            "tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
            "menu_path": "",
            "refresh_model_inputs": True,
            "extra_classes": ""
        },
        {
            "uitype": "dropdown",
            "unit": "text",
            "label": "Model",
            "id": "model",
            "default": "",
            "check": {"value": "", 'check': "!="},
            "tooltip": "Which model to use when running OpenAI/GooseAI.",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': self.get_oai_models(),

        }])
        return requested_parameters

    def set_input_parameters(self, parameters):
        self.key = parameters['key'].strip()
        self.model = parameters['model']

    def get_oai_models(self):
        if self.key == "":
            return []
        if self.source == 'OAI':
            url = "https://api.openai.com/v1/engines"
        elif self.source == 'GooseAI':
            url = "https://api.goose.ai/v1/engines"
        else:
            return

        # Get list of models from OAI
        logger.init("OAI Engines", status="Retrieving")
        req = requests.get(
            url,
            headers = {
                'Authorization': 'Bearer '+self.key
            }
        )
        if(req.status_code == 200):
            r = req.json()
            engines = r["data"]
            try:
                engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines]
            except:
                logger.error(engines)
                raise

            online_model = ""


            logger.init_ok("OAI Engines", status="OK")
            return engines
        else:
            # Something went wrong, print the message and quit since we can't initialize an engine
            logger.init_err("OAI Engines", status="Failed")
            logger.error(req.json())
            emit('from_server', {'cmd': 'errmsg', 'data': req.json()})
            return []


    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.tokenizer = self._get_tokenizer("gpt2")

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:

        if seed is not None:
            logger.warning(
                "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored."
            )

        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))

        # Store context in memory to use it for comparison with generated content
        utils.koboldai_vars.lastctx = decoded_prompt

        # Build request JSON data
        # GooseAI is a subtype of OAI. So to check if it's this type, we check the configname as a workaround
        # as the koboldai_vars.model will always be OAI
        if "GooseAI" in utils.koboldai_vars.configname:
            reqdata = {
                "prompt": decoded_prompt,
                "max_tokens": max_new,
                "temperature": gen_settings.temp,
                "top_a": gen_settings.top_a,
                "top_p": gen_settings.top_p,
                "top_k": gen_settings.top_k,
                "tfs": gen_settings.tfs,
                "typical_p": gen_settings.typical,
                "repetition_penalty": gen_settings.rep_pen,
                "repetition_penalty_slope": gen_settings.rep_pen_slope,
                "repetition_penalty_range": gen_settings.rep_pen_range,
                "n": batch_count,
                # TODO: Implement streaming
                "stream": False,
            }
        else:
            reqdata = {
                "prompt": decoded_prompt,
                "max_tokens": max_new,
                "temperature": gen_settings.temp,
                "top_p": gen_settings.top_p,
                "frequency_penalty": gen_settings.rep_pen,
                "n": batch_count,
                "stream": False,
            }

        req = requests.post(
            utils.koboldai_vars.oaiurl,
            json=reqdata,
            headers={
                "Authorization": "Bearer " + utils.koboldai_vars.oaiapikey,
                "Content-Type": "application/json",
            },
        )

        j = req.json()

        if not req.ok:
            # Send error message to web client
            if "error" in j:
                error_type = j["error"]["type"]
                error_message = j["error"]["message"]
            else:
                error_type = "Unknown"
                error_message = "Unknown"
            raise OpenAIAPIError(error_type, error_message)

        outputs = [out["text"] for out in j["choices"]]
        return GenerationResult(
            model=self,
            out_batches=np.array([self.tokenizer.encode(x) for x in outputs]),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )
        return model_name == "OAI"
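The _raw_generate body removed here (it reappears in the shared parent below) builds a different JSON payload per backend: GooseAI accepts KoboldAI's full sampler set, while the plain OpenAI endpoint only takes the common fields and maps repetition penalty onto frequency_penalty. A trimmed sketch of just that branch, with a plain settings dict standing in for GenerationSettings and build_reqdata as a hypothetical helper:

# Hypothetical helper condensing the reqdata branch above.
def build_reqdata(prompt, max_new, settings, batch_count, is_gooseai):
    reqdata = {
        "prompt": prompt,
        "max_tokens": max_new,
        "temperature": settings["temp"],
        "top_p": settings["top_p"],
        "n": batch_count,
        "stream": False,
    }
    if is_gooseai:
        # GooseAI additionally understands KoboldAI's extra samplers.
        reqdata.update({
            "top_a": settings["top_a"],
            "top_k": settings["top_k"],
            "tfs": settings["tfs"],
            "typical_p": settings["typical"],
            "repetition_penalty": settings["rep_pen"],
            "repetition_penalty_slope": settings["rep_pen_slope"],
            "repetition_penalty_range": settings["rep_pen_range"],
        })
    else:
        # OpenAI has no repetition_penalty field; frequency_penalty is the closest analogue.
        reqdata["frequency_penalty"] = settings["rep_pen"]
    return reqdata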
@@ -22,18 +22,19 @@ class HFInferenceModel(InferenceModel):
    def is_valid(self, model_name, model_path, menu_path):
        try:
            if model_path is not None and os.path.exists(model_path):
                model_config = AutoConfig.from_pretrained(model_path)
                self.model_config = AutoConfig.from_pretrained(model_path)
            elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
                model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
                self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
            else:
                model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
                self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
            return True
        except:
            return False

    def get_requested_parameters(self, model_name, model_path, menu_path):
        requested_parameters = []

        if not self.hf_torch:
            return []
        if model_path is not None and os.path.exists(model_path):
            self.model_config = AutoConfig.from_pretrained(model_path)
        elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
@@ -124,14 +125,20 @@ class HFInferenceModel(InferenceModel):
        return requested_parameters

    def set_input_parameters(self, parameters):
        gpu_count = torch.cuda.device_count()
        layers = []
        for i in range(gpu_count):
            layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None)
        self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
        self.layers = layers
        self.disk_layers = parameters['disk_layers'] if 'disk_layers' in parameters else None
        self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
        if self.hf_torch:
            import breakmodel
            gpu_count = torch.cuda.device_count()
            layers = []
            for i in range(gpu_count):
                layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None)
            self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
            self.layers = layers
            self.disk_layers = int(parameters['disk_layers']) if 'disk_layers' in parameters and parameters['disk_layers'].isnumeric() else 0
            breakmodel.gpu_blocks = layers
            breakmodel.disk_blocks = self.disk_layers
            self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
        self.model_type = self.get_model_type()
        self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
        self.model_name = parameters['id']
        self.path = parameters['path'] if 'path' in parameters else None
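set_input_parameters above turns the per-GPU layer fields coming from the UI into the breakmodel layer split. A minimal sketch of that translation step with a hand-written parameters dict; the field names ("0_Layers", "CPU_Layers", "disk_layers") are taken from the hunk, while parse_layer_split is a hypothetical helper:

# Hypothetical helper mirroring the parsing in set_input_parameters.
def parse_layer_split(parameters, gpu_count):
    # One entry per visible GPU; non-numeric fields become None.
    layers = []
    for i in range(gpu_count):
        raw = parameters.get("{}_Layers".format(i), "")
        layers.append(int(raw) if str(raw).isnumeric() else None)
    disk_layers = int(parameters["disk_layers"]) if str(parameters.get("disk_layers", "")).isnumeric() else 0
    return layers, parameters.get("CPU_Layers"), disk_layers

# Example: 20 layers on GPU 0, 12 on GPU 1, 4 on disk, the rest on CPU.
print(parse_layer_split({"0_Layers": "20", "1_Layers": "12", "CPU_Layers": "8", "disk_layers": "4"}, gpu_count=2))
# -> ([20, 12], '8', 4)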
@@ -157,6 +164,10 @@ class HFInferenceModel(InferenceModel):
            torch.cuda.empty_cache()
        except:
            pass
        if self.hf_torch:
            breakmodel.breakmodel = True
            breakmodel.gpu_blocks = []
            breakmodel.disk_blocks = 0

    def _post_load(self) -> None:
        # These are model specific tokenizer overrides if a model has bad defaults
@@ -53,15 +53,12 @@ LOG_SAMPLER_NO_EFFECT = False


class HFTorchInferenceModel(HFInferenceModel):
    def __init__(
        self,
        #model_name: str,
        #lazy_load: bool,
        #low_mem: bool,
    ) -> None:
    def __init__(self) -> None:
        super().__init__()
        #self.lazy_load = lazy_load
        #self.low_mem = low_mem
        self.hf_torch = True
        self.lazy_load = True
        self.low_mem = False
        self.nobreakmodel = False

        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
@@ -398,7 +395,7 @@ class HFTorchInferenceModel(HFInferenceModel):
        Embedding._koboldai_patch_causallm_model = self.model

    def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
        if not utils.koboldai_vars.lazy_load:
        if not self.lazy_load:
            return

        if utils.args.breakmodel_disklayers is not None:
@@ -819,14 +816,14 @@ class HFTorchInferenceModel(HFInferenceModel):
        elif (
            utils.args.breakmodel_gpulayers is not None
            or utils.args.breakmodel_disklayers is not None
            or breakmodel.gpu_blocks != []
        ):
            try:
                if not utils.args.breakmodel_gpulayers:
                    breakmodel.gpu_blocks = []
                else:
                    breakmodel.gpu_blocks = list(
                        map(int, utils.args.breakmodel_gpulayers.split(","))
                    )
                if breakmodel.gpu_blocks == []:
                    if utils.args.breakmodel_gpulayers:
                        breakmodel.gpu_blocks = list(
                            map(int, utils.args.breakmodel_gpulayers.split(","))
                        )
                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
                s = n_layers
                for i in range(len(breakmodel.gpu_blocks)):
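The hunk above reads the --breakmodel_gpulayers argument into breakmodel.gpu_blocks and sanity-checks it against the visible GPU count. A standalone sketch of that parse-and-validate step; parse_gpulayers is a hypothetical helper, and the device count is passed in rather than queried from torch:

# Hypothetical helper mirroring the --breakmodel_gpulayers handling above.
def parse_gpulayers(arg, device_count):
    # "20,12" -> [20, 12]; an empty or missing argument means no GPU blocks.
    blocks = list(map(int, arg.split(","))) if arg else []
    assert len(blocks) <= device_count, "more layer counts than visible GPUs"
    return blocks

print(parse_gpulayers("20,12", device_count=2))  # -> [20, 12]
print(parse_gpulayers("", device_count=2))       # -> []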
modeling/inference_models/parents/openai_gooseai.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import torch
import requests
import numpy as np
from typing import List, Optional, Union

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
)



class OpenAIAPIError(Exception):
    def __init__(self, error_type: str, error_message) -> None:
        super().__init__(f"{error_type}: {error_message}")


class model_loader(InferenceModel):
    """InferenceModel for interfacing with the OpenAI/GooseAI generation API."""

    def __init__(self):
        super().__init__()
        self.key = ""
        self.url = "https://api.goose.ai/v1/engines"
        #if self.source == 'OAI':
        #    url = "https://api.openai.com/v1/engines"
        #elif self.source == 'GooseAI':
        #    url = "https://api.goose.ai/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "OAI" or model_name == "GooseAI"

    def get_requested_parameters(self, model_name, model_path, menu_path):
        self.source = model_name
        requested_parameters = []
        requested_parameters.extend([{
            "uitype": "text",
            "unit": "text",
            "label": "Key",
            "id": "key",
            "default": "",
            "check": {"value": "", 'check': "!="},
            "tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
            "menu_path": "",
            "refresh_model_inputs": True,
            "extra_classes": ""
        },
        {
            "uitype": "dropdown",
            "unit": "text",
            "label": "Model",
            "id": "model",
            "default": "",
            "check": {"value": "", 'check': "!="},
            "tooltip": "Which model to use when running OpenAI/GooseAI.",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': self.get_oai_models(),

        }])
        return requested_parameters

    def set_input_parameters(self, parameters):
        self.key = parameters['key'].strip()
        self.model = parameters['model']

    def get_oai_models(self):
        if self.key == "":
            return []


        # Get list of models from OAI
        logger.init("OAI Engines", status="Retrieving")
        req = requests.get(
            self.url,
            headers = {
                'Authorization': 'Bearer '+self.key
            }
        )
        if(req.status_code == 200):
            r = req.json()
            engines = r["data"]
            try:
                engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines]
            except:
                logger.error(engines)
                raise

            online_model = ""


            logger.init_ok("OAI Engines", status="OK")
            return engines
        else:
            # Something went wrong, print the message and quit since we can't initialize an engine
            logger.init_err("OAI Engines", status="Failed")
            logger.error(req.json())
            emit('from_server', {'cmd': 'errmsg', 'data': req.json()})
            return []


    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.tokenizer = self._get_tokenizer("gpt2")

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:

        if seed is not None:
            logger.warning(
                "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored."
            )

        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))

        # Store context in memory to use it for comparison with generated content
        utils.koboldai_vars.lastctx = decoded_prompt

        # Build request JSON data
        # GooseAI is a subtype of OAI. So to check if it's this type, we check the configname as a workaround
        # as the koboldai_vars.model will always be OAI
        if "GooseAI" in utils.koboldai_vars.configname:
            reqdata = {
                "prompt": decoded_prompt,
                "max_tokens": max_new,
                "temperature": gen_settings.temp,
                "top_a": gen_settings.top_a,
                "top_p": gen_settings.top_p,
                "top_k": gen_settings.top_k,
                "tfs": gen_settings.tfs,
                "typical_p": gen_settings.typical,
                "repetition_penalty": gen_settings.rep_pen,
                "repetition_penalty_slope": gen_settings.rep_pen_slope,
                "repetition_penalty_range": gen_settings.rep_pen_range,
                "n": batch_count,
                # TODO: Implement streaming
                "stream": False,
            }
        else:
            reqdata = {
                "prompt": decoded_prompt,
                "max_tokens": max_new,
                "temperature": gen_settings.temp,
                "top_p": gen_settings.top_p,
                "frequency_penalty": gen_settings.rep_pen,
                "n": batch_count,
                "stream": False,
            }

        req = requests.post(
            self.url,
            json=reqdata,
            headers={
                "Authorization": "Bearer " + self.key,
                "Content-Type": "application/json",
            },
        )

        j = req.json()

        if not req.ok:
            # Send error message to web client
            if "error" in j:
                error_type = j["error"]["type"]
                error_message = j["error"]["message"]
            else:
                error_type = "Unknown"
                error_message = "Unknown"
            raise OpenAIAPIError(error_type, error_message)

        outputs = [out["text"] for out in j["choices"]]
        return GenerationResult(
            model=self,
            out_batches=np.array([self.tokenizer.encode(x) for x in outputs]),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )
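Even though generation happens server-side, the parent loader still pulls in a gpt2 tokenizer: prompt tokens arrive as IDs and must be decoded to text before the POST, and the returned text is re-encoded into token batches for GenerationResult. A minimal sketch of that round trip using the Hugging Face gpt2 tokenizer directly, which is assumed here to be what _get_tokenizer("gpt2") wraps:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt_tokens = [15496, 995]                       # token IDs coming from the client
decoded_prompt = tokenizer.decode(prompt_tokens)   # decodes to a text prompt ("Hello world"), sent as the "prompt" field
api_text = " and goodbye"                          # stand-in for one entry of j["choices"][n]["text"]
out_batch = tokenizer.encode(api_text)             # re-encoded for GenerationResult.out_batches
print(decoded_prompt, out_batch)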