Merge branch 'united' into merge/united-exllama

This commit is contained in:
Llama
2023-08-28 09:32:19 -07:00
28 changed files with 1799 additions and 894 deletions

View File

@@ -2,31 +2,13 @@
(This guide is for both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use)
#### Installation
In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created.
For Nvidia users everything is installed automatically when you install the requirements; you merely need a compatible GPTQ model for it to show up.
Note: do not run your command prompt as administrator/with elevated privileges; reports suggest this leads to problems.
`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
`cd KoboldAI`
Next step (Windows): subfolder mode or the B: option, it doesn't matter which, choose either
* [if on Windows]
```
install_requirements.bat
```
* If it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.
* [if on Linux with Nvidia]
```
./install_requirements.sh
```
* [if on Linux with AMD]
```
./install_requirements.sh rocm
./commandline-rocm.sh
pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4
pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa
```
* If you get an error about a missing hip/hip_runtime_xxx.h, you don't have the proper ROCm & HIP packages installed
* If you get a "CUDA_HOME environment variable is not set" error, run in the environment:
@@ -46,5 +28,5 @@ If you haven't done so already, exit the command prompt/leave KAI's conda env. (
Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD]
Switch to UI2, then load your model.
Load your model using Huggingface GPTQ as the backend option (This will show up when a valid GPTQ model is detected).

View File

@@ -12,6 +12,8 @@ import random
import shutil
import eventlet
from modeling.inference_model import GenerationMode
eventlet.monkey_patch(all=True, thread=False, os=False)
import os, inspect, contextlib, pickle
os.system("")
@@ -71,6 +73,12 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForToken
import transformers
import ipaddress
from functools import wraps
from modeling.pickling import RestrictedUnpickler, use_custom_unpickler
# Make settings folder early so we can depend on it anywhere
if not os.path.exists("settings/"):
os.mkdir("settings")
try:
from transformers.models.opt.modeling_opt import OPTDecoder
except:
@@ -630,7 +638,10 @@ model_backends = {}
model_backend_module_names = {}
model_backend_type_crosswalk = {}
PRIORITIZED_BACKEND_MODULES = ["generic_hf_torch"]
PRIORITIZED_BACKEND_MODULES = {
"gptq_hf_torch": 2,
"generic_hf_torch": 1
}
for module in os.listdir("./modeling/inference_models"):
if module == '__pycache__':
@@ -666,10 +677,15 @@ for module in os.listdir("./modeling/inference_models"):
model_backend_module_names[backend_name] = module
if backend_type in model_backend_type_crosswalk:
if module in PRIORITIZED_BACKEND_MODULES:
model_backend_type_crosswalk[backend_type].insert(0, backend_name)
else:
model_backend_type_crosswalk[backend_type].append(backend_name)
model_backend_type_crosswalk[backend_type].append(backend_name)
model_backend_type_crosswalk[backend_type] = list(sorted(
model_backend_type_crosswalk[backend_type],
key=lambda name: PRIORITIZED_BACKEND_MODULES.get(
[mod for b_name, mod in model_backend_module_names.items() if b_name == name][0],
0
),
reverse=True
))
else:
model_backend_type_crosswalk[backend_type] = [backend_name]
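For illustration, a small standalone sketch (not part of the commit) of how the new priority mapping orders backends within a crosswalk entry; the backend and module names below are placeholders:
```python
PRIORITIZED_BACKEND_MODULES = {"gptq_hf_torch": 2, "generic_hf_torch": 1}
module_of = {                                   # backend name -> module name (placeholder data)
    "Huggingface GPTQ": "gptq_hf_torch",
    "Huggingface": "generic_hf_torch",
    "Read Only": "readonly",
}
backends = ["Read Only", "Huggingface", "Huggingface GPTQ"]
backends = sorted(backends, key=lambda n: PRIORITIZED_BACKEND_MODULES.get(module_of[n], 0), reverse=True)
print(backends)  # ['Huggingface GPTQ', 'Huggingface', 'Read Only']
```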
@@ -892,7 +908,7 @@ tags = [
api_version = None # This gets set automatically so don't change this value
api_v1 = KoboldAPISpec(
version="1.2.2",
version="1.2.3",
prefixes=["/api/v1", "/api/latest"],
tags=tags,
)
@@ -1670,75 +1686,7 @@ def unload_model():
#Reload our badwords
koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
class RestrictedUnpickler(pickle.Unpickler):
def original_persistent_load(self, saved_id):
return super().persistent_load(saved_id)
def forced_persistent_load(self, saved_id):
if saved_id[0] != "storage":
raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'")
return self.original_persistent_load(saved_id)
def find_class(self, module, name):
if module == "collections" and name == "OrderedDict":
return collections.OrderedDict
elif module == "torch._utils" and name == "_rebuild_tensor_v2":
return torch._utils._rebuild_tensor_v2
elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
return torch._tensor._rebuild_from_type_v2
elif module == "torch" and name in (
"DoubleStorage",
"FloatStorage",
"HalfStorage",
"LongStorage",
"IntStorage",
"ShortStorage",
"CharStorage",
"ByteStorage",
"BoolStorage",
"BFloat16Storage",
"Tensor",
):
return getattr(torch, name)
elif module == "numpy.core.multiarray" and name == "scalar":
return np.core.multiarray.scalar
elif module == "numpy" and name == "dtype":
return np.dtype
elif module == "_codecs" and name == "encode":
return _codecs.encode
else:
# Forbid everything else.
qualified_name = name if module == "__builtin__" else f"{module}.{name}"
raise pickle.UnpicklingError(
f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}"
)
def load(self, *args, **kwargs):
self.original_persistent_load = getattr(
self, "persistent_load", pickle.Unpickler.persistent_load
)
self.persistent_load = self.forced_persistent_load
return super().load(*args, **kwargs)
@contextlib.contextmanager
def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
try:
old_unpickler = pickle.Unpickler
pickle.Unpickler = unpickler
old_pickle_load = pickle.load
def new_pickle_load(*args, **kwargs):
return pickle.Unpickler(*args, **kwargs).load()
pickle.load = new_pickle_load
yield
finally:
pickle.Unpickler = old_unpickler
pickle.load = old_pickle_load
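The block above is removed here because the unpickler now lives in modeling.pickling (see the new import earlier in this file). A minimal usage sketch, with a hypothetical checkpoint path, of how the context manager guards a pickle load:
```python
import pickle
from modeling.pickling import RestrictedUnpickler, use_custom_unpickler

with use_custom_unpickler(RestrictedUnpickler):
    with open("model.ckpt", "rb") as f:   # hypothetical checkpoint path
        state = pickle.load(f)            # forbidden globals raise pickle.UnpicklingError
```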
def load_model(model_backend, initial_load=False):
global model
global tokenizer
@@ -1747,9 +1695,6 @@ def load_model(model_backend, initial_load=False):
koboldai_vars.aibusy = True
koboldai_vars.horde_share = False
if initial_load:
use_breakmodel_args = True
koboldai_vars.reset_model()
koboldai_vars.noai = False
@@ -1788,7 +1733,9 @@ def load_model(model_backend, initial_load=False):
with use_custom_unpickler(RestrictedUnpickler):
model = model_backends[model_backend]
koboldai_vars.supported_gen_modes = [x.value for x in model.get_supported_gen_modes()]
model.load(initial_load=initial_load, save_model=not (args.colab or args.cacheonly) or args.savemodel)
koboldai_vars.model = model.model_name if "model_name" in vars(model) else model.id #Should have model_name, but it could be set to id depending on how it's setup
if koboldai_vars.model in ("NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"):
koboldai_vars.model = os.path.basename(os.path.normpath(model.path))
@@ -1889,8 +1836,8 @@ def load_model(model_backend, initial_load=False):
os.mkdir("./softprompts")
koboldai_vars.splist = [[f, get_softprompt_desc(os.path.join("./softprompts", f),None,True)] for f in os.listdir("./softprompts") if os.path.isfile(os.path.join("./softprompts", f)) and valid_softprompt(os.path.join("./softprompts", f))]
if initial_load and koboldai_vars.cloudflare_link != "":
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 1: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 2: {koboldai_vars.cloudflare_link}/new_ui")
logger.message(f"KoboldAI has finished loading and is available at the following link: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has finished loading and is available at the following link for the Classic UI: {koboldai_vars.cloudflare_link}/classic")
logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api")
@@ -1922,8 +1869,7 @@ def require_allowed_ip(func):
# Set up Flask routes
@app.route('/')
@app.route('/index')
@app.route('/classic')
@require_allowed_ip
def index():
if args.no_ui:
@@ -3267,11 +3213,20 @@ def check_for_backend_compilation():
break
koboldai_vars.checking = False
def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, disable_recentrng=False, no_generate=False, ignore_aibusy=False):
def actionsubmit(
data,
actionmode=0,
force_submit=False,
force_prompt_gen=False,
disable_recentrng=False,
no_generate=False,
ignore_aibusy=False,
gen_mode=GenerationMode.STANDARD
):
# Ignore new submissions if the AI is currently busy
if(koboldai_vars.aibusy):
if koboldai_vars.aibusy and not ignore_aibusy:
return
while(True):
set_aibusy(1)
koboldai_vars.actions.clear_unused_options()
@@ -3359,7 +3314,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
koboldai_vars.prompt = data
# Clear the startup text from game screen
emit('from_server', {'cmd': 'updatescreen', 'gamestarted': False, 'data': 'Please wait, generating story...'}, broadcast=True, room="UI_1")
calcsubmit("") # Run the first action through the generator
calcsubmit("", gen_mode=gen_mode) # Run the first action through the generator
if(not koboldai_vars.abort and koboldai_vars.lua_koboldbridge.restart_sequence is not None and len(koboldai_vars.genseqs) == 0):
data = ""
force_submit = True
@@ -3425,7 +3380,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
if(not no_generate and not koboldai_vars.noai and koboldai_vars.lua_koboldbridge.generating):
# Off to the tokenizer!
calcsubmit("")
calcsubmit("", gen_mode=gen_mode)
if(not koboldai_vars.abort and koboldai_vars.lua_koboldbridge.restart_sequence is not None and len(koboldai_vars.genseqs) == 0):
data = ""
force_submit = True
@@ -3780,7 +3735,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
#==================================================================#
# Take submitted text and build the text to be given to generator
#==================================================================#
def calcsubmit(txt):
def calcsubmit(txt, gen_mode=GenerationMode.STANDARD):
anotetxt = "" # Placeholder for Author's Note text
forceanote = False # In case we don't have enough actions to hit A.N. depth
anoteadded = False # In case our budget runs out before we hit A.N. depth
@@ -3822,7 +3777,7 @@ def calcsubmit(txt):
logger.debug("Submit: experimental_features time {}s".format(time.time()-start_time))
start_time = time.time()
generate(subtxt, min, max, found_entries)
generate(subtxt, min, max, found_entries, gen_mode=gen_mode)
logger.debug("Submit: generate time {}s".format(time.time()-start_time))
attention_bias.attention_bias = None
@@ -3890,7 +3845,14 @@ class HordeException(Exception):
# Send text to generator and deal with output
#==================================================================#
def generate(txt, minimum, maximum, found_entries=None):
def generate(txt, minimum, maximum, found_entries=None, gen_mode=GenerationMode.STANDARD):
# Open up token stream
emit("stream_tokens", True, broadcast=True, room="UI_2")
# HACK: Show options when streaming more than 1 sequence
if utils.koboldai_vars.output_streaming:
koboldai_vars.actions.show_options(koboldai_vars.numseqs > 1, force=True)
koboldai_vars.generated_tkns = 0
if(found_entries is None):
@@ -3912,7 +3874,7 @@ def generate(txt, minimum, maximum, found_entries=None):
# Submit input text to generator
try:
start_time = time.time()
genout, already_generated = tpool.execute(model.core_generate, txt, found_entries)
genout, already_generated = tpool.execute(model.core_generate, txt, found_entries, gen_mode=gen_mode)
logger.debug("Generate: core_generate time {}s".format(time.time()-start_time))
except Exception as e:
if(issubclass(type(e), lupa.LuaError)):
@@ -3927,7 +3889,10 @@ def generate(txt, minimum, maximum, found_entries=None):
emit('from_server', {'cmd': 'errmsg', 'data': 'Error occurred during generator call; please check console.'}, broadcast=True, room="UI_1")
logger.error(traceback.format_exc().replace("\033", ""))
socketio.emit("error", str(e), broadcast=True, room="UI_2")
set_aibusy(0)
# Clean up token stream
emit("stream_tokens", None, broadcast=True, room="UI_2")
return
for i in range(koboldai_vars.numseqs):
@@ -3959,7 +3924,10 @@ def generate(txt, minimum, maximum, found_entries=None):
del genout
gc.collect()
torch.cuda.empty_cache()
# Clean up token stream
emit("stream_tokens", None, broadcast=True, room="UI_2")
maybe_review_story()
set_aibusy(0)
@@ -4428,8 +4396,8 @@ def requestwi():
# and items in different folders are sorted based on the order of the folders
#==================================================================#
def stablesortwi():
mapping = {uid: index for index, uid in enumerate(koboldai_vars.wifolders_l)}
koboldai_vars.worldinfo.sort(key=lambda x: mapping[str(x["folder"])] if x["folder"] is not None else float("inf"))
mapping = {int(uid): index for index, uid in enumerate(koboldai_vars.wifolders_l)}
koboldai_vars.worldinfo.sort(key=lambda x: mapping[int(x["folder"])] if x["folder"] is not None else float("inf"))
last_folder = ...
last_wi = None
for i, wi in enumerate(koboldai_vars.worldinfo):
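A standalone sketch (with made-up uids and entries) of the folder-order sort above, showing why both the mapping keys and the lookup are normalized to int in this commit:
```python
wifolders_l = [7, 3]                      # folder display order (made-up uids)
worldinfo = [{"folder": 3}, {"folder": None}, {"folder": 7}]
mapping = {int(uid): index for index, uid in enumerate(wifolders_l)}
worldinfo.sort(key=lambda x: mapping[int(x["folder"])] if x["folder"] is not None else float("inf"))
print([w["folder"] for w in worldinfo])   # [7, 3, None]
```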
@@ -5134,9 +5102,13 @@ def load_story_v1(js, from_file=None):
def load_story_v2(js, from_file=None):
logger.debug("Loading V2 Story")
logger.debug("Called from {}".format(inspect.stack()[1].function))
leave_room(session['story'])
session['story'] = js['story_name']
join_room(session['story'])
new_story = js["story_name"]
# In socket context
if hasattr(request, "sid"):
leave_room(session['story'])
join_room(new_story)
session['story'] = new_story
koboldai_vars.load_story(session['story'], js)
@@ -5564,6 +5536,7 @@ def lite_html():
#==================================================================#
# UI V2 CODE
#==================================================================#
@app.route('/')
@app.route('/new_ui')
@require_allowed_ip
@logger.catch
@@ -6149,6 +6122,7 @@ def UI_2_Set_Selected_Text(data):
@socketio.on('Use Option Text')
@logger.catch
def UI_2_Use_Option_Text(data):
koboldai_vars.actions.show_options(False)
if koboldai_vars.prompt == "":
koboldai_vars.prompt = koboldai_vars.actions.get_current_options()[int(data['option'])]['text']
koboldai_vars.actions.clear_unused_options()
@@ -6169,23 +6143,31 @@ def UI_2_delete_option(data):
@socketio.on('submit')
@logger.catch
def UI_2_submit(data):
if not koboldai_vars.noai and data['theme'] != "":
if not koboldai_vars.noai and data['theme']:
# Random prompt generation
logger.debug("doing random prompt")
memory = koboldai_vars.memory
koboldai_vars.memory = "{}\n\nYou generate the following {} story concept :".format(koboldai_vars.memory, data['theme'])
koboldai_vars.lua_koboldbridge.feedback = None
actionsubmit("", force_submit=True, force_prompt_gen=True)
koboldai_vars.memory = memory
else:
logger.debug("doing normal input")
koboldai_vars.actions.clear_unused_options()
koboldai_vars.lua_koboldbridge.feedback = None
koboldai_vars.recentrng = koboldai_vars.recentrngm = None
if koboldai_vars.actions.action_count == -1:
actionsubmit(data['data'], actionmode=koboldai_vars.actionmode)
else:
actionsubmit(data['data'], actionmode=koboldai_vars.actionmode)
return
logger.debug("doing normal input")
koboldai_vars.actions.clear_unused_options()
koboldai_vars.lua_koboldbridge.feedback = None
koboldai_vars.recentrng = koboldai_vars.recentrngm = None
gen_mode_name = data.get("gen_mode", None) or "standard"
try:
gen_mode = GenerationMode(gen_mode_name)
except ValueError:
# Invalid enum lookup!
gen_mode = GenerationMode.STANDARD
logger.warning(f"Unknown gen_mode '{gen_mode_name}', using STANDARD! Report this!")
actionsubmit(data['data'], actionmode=koboldai_vars.actionmode, gen_mode=gen_mode)
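For reference, a minimal sketch of the tolerant gen_mode lookup introduced above; the payload dict is a made-up example of what the UI might send:
```python
from modeling.inference_model import GenerationMode

data = {"data": "You enter the cave.", "gen_mode": "until_sentence_end"}  # hypothetical payload
gen_mode_name = data.get("gen_mode") or "standard"
try:
    gen_mode = GenerationMode(gen_mode_name)
except ValueError:
    gen_mode = GenerationMode.STANDARD  # unknown values fall back to STANDARD
```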
#==================================================================#
# Event triggered when user clicks the submit button
#==================================================================#
@@ -6279,7 +6261,7 @@ def UI_2_select_model(data):
#so we'll just go through all the possible loaders
for model_backend in sorted(
model_backends,
key=lambda x: model_backend_module_names[x] in PRIORITIZED_BACKEND_MODULES,
key=lambda x: PRIORITIZED_BACKEND_MODULES.get(model_backend_module_names[x], 0),
reverse=True,
):
if model_backends[model_backend].is_valid(data["name"], data["path"] if 'path' in data else None, data["menu"]):
@@ -6715,11 +6697,18 @@ def UI_2_set_wi_image(uid):
except FileNotFoundError:
pass
else:
# Otherwise assign image
with open(path, "wb") as file:
file.write(data)
try:
# Otherwise assign image
with open(path, "wb") as file:
file.write(data)
except FileNotFoundError:
show_error_notification(
"Unable to write image",
"Please save the game before uploading images."
)
return ":(", 500
koboldai_vars.gamesaved = False
return ":)"
return ":)", 200
@app.route("/get_wi_image/<int(signed=True):uid>", methods=["GET"])
@require_allowed_ip
@@ -7336,7 +7325,7 @@ def generate_image(prompt: str) -> Optional[Image.Image]:
if koboldai_vars.img_gen_priority == 4:
# Check if stable-diffusion-webui API option selected and use that if found.
return text2img_api(prompt)
elif ((not koboldai_vars.hascuda or not os.path.exists("models/stable-diffusion-v1-4")) and koboldai_vars.img_gen_priority != 0) or koboldai_vars.img_gen_priority == 3:
elif ((not koboldai_vars.hascuda or not os.path.exists("functional_models/stable-diffusion")) and koboldai_vars.img_gen_priority != 0) or koboldai_vars.img_gen_priority == 3:
# If we don't have a GPU, use horde if we're allowed to
return text2img_horde(prompt)
@@ -7362,7 +7351,7 @@ def text2img_local(prompt: str) -> Optional[Image.Image]:
logger.debug("Generating Image")
from diffusers import StableDiffusionPipeline
if koboldai_vars.image_pipeline is None:
pipe = tpool.execute(StableDiffusionPipeline.from_pretrained, "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, cache="functional_models/stable-diffusion").to("cuda")
pipe = tpool.execute(StableDiffusionPipeline.from_pretrained, "XpucT/Deliberate", safety_checker=None, torch_dtype=torch.float16, cache="functional_models/stable-diffusion").to("cuda")
else:
pipe = koboldai_vars.image_pipeline.to("cuda")
logger.debug("time to load: {}".format(time.time() - start_time))
@@ -7784,9 +7773,16 @@ def UI_2_update_tokens(data):
def UI_2_privacy_mode(data):
if data['enabled']:
koboldai_vars.privacy_mode = True
return
if data['password'] == koboldai_vars.privacy_password:
koboldai_vars.privacy_mode = False
else:
if data['password'] == koboldai_vars.privacy_password:
koboldai_vars.privacy_mode = False
logger.warning("Watch out! Someone tried to unlock your instance with an incorrect password! Stay on your toes...")
show_error_notification(
title="Invalid password",
text="The password you provided was incorrect. Please try again."
)
#==================================================================#
# Genres
@@ -8236,6 +8232,7 @@ class WorldInfoUIDsSchema(WorldInfoEntriesUIDsSchema):
class ModelSelectionSchema(KoboldSchema):
model: str = fields.String(required=True, validate=validate.Regexp(r"^(?!\s*NeoCustom)(?!\s*GPT2Custom)(?!\s*TPUMeshTransformerGPTJ)(?!\s*TPUMeshTransformerGPTNeoX)(?!\s*GooseAI)(?!\s*OAI)(?!\s*InferKit)(?!\s*Colab)(?!\s*API).*$"), metadata={"description": 'Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model'})
backend: Optional[str] = fields.String(required=False, validate=validate.OneOf(model_backends.keys()))
def _generate_text(body: GenerationInputSchema):
if koboldai_vars.aibusy or koboldai_vars.genseqs:
@@ -8493,6 +8490,7 @@ def put_model(body: ModelSelectionSchema):
summary: Load a model
description: |-2
Loads a model given its Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model.
Optionally, a backend parameter can be passed in to dictate which backend loads the model.
tags:
- model
requestBody:
@@ -8502,6 +8500,7 @@ def put_model(body: ModelSelectionSchema):
schema: ModelSelectionSchema
example:
model: ReadOnly
backend: Read Only
responses:
200:
description: Successful request
@@ -8519,8 +8518,18 @@ def put_model(body: ModelSelectionSchema):
set_aibusy(1)
old_model = koboldai_vars.model
koboldai_vars.model = body.model.strip()
backend = getattr(body, "backend", None)
if not backend:
# Backend is optional for backwards compatibility; it should probably be
# required on the next major API version.
if body.model == "ReadOnly":
backend = "Read Only"
else:
backend = "Huggingface"
try:
load_model(use_breakmodel_args=True, breakmodel_args_default_to_cpu=True)
load_model(backend)
except Exception as e:
koboldai_vars.model = old_model
raise e
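A hedged usage sketch of the updated endpoint with the new optional backend field; the host and port are assumptions about a local default instance, not part of this commit:
```python
import requests

# PUT /api/v1/model, now accepting an optional "backend" field.
resp = requests.put(
    "http://localhost:5000/api/v1/model",   # assumed local instance
    json={"model": "ReadOnly", "backend": "Read Only"},
)
print(resp.status_code)  # 200 on success
```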
@@ -8808,8 +8817,14 @@ def get_story():
chunks = []
if koboldai_vars.gamestarted:
chunks.append({"num": 0, "text": koboldai_vars.prompt})
for num, action in koboldai_vars.actions.items():
chunks.append({"num": num + 1, "text": action})
last_action_num = list(koboldai_vars.actions.actions.keys())[-1]
for num, action in koboldai_vars.actions.actions.items():
text = action["Selected Text"]
# The last action seems to always be empty
if not text and num == last_action_num:
continue
chunks.append({"num": num + 1, "text": text})
return {"results": chunks}
@@ -8833,7 +8848,7 @@ def get_story_nums():
chunks = []
if koboldai_vars.gamestarted:
chunks.append(0)
for num in koboldai_vars.actions.keys():
for num in koboldai_vars.actions.actions.keys():
chunks.append(num + 1)
return {"results": chunks}
@@ -9194,7 +9209,7 @@ def get_world_info():
if wi["folder"] != last_folder:
folder = []
if wi["folder"] is not None:
folders.append({"uid": wi["folder"], "name": koboldai_vars.wifolders_d[wi["folder"]]["name"], "entries": folder})
folders.append({"uid": wi["folder"], "name": koboldai_vars.wifolders_d[str(wi["folder"])]["name"], "entries": folder})
last_folder = wi["folder"]
(folder if wi["folder"] is not None else entries).append({k: v for k, v in wi.items() if k not in ("init", "folder", "num") and (wi["selective"] or k != "keysecondary")})
return {"folders": folders, "entries": entries}
@@ -10905,8 +10920,8 @@ def run():
if not koboldai_vars.use_colab_tpu and args.model:
# If we're using a TPU our UI will freeze during the connection to the TPU. To prevent this from showing to the user we
# delay the display of this message until after that step
logger.message(f"KoboldAI is still loading your model but available at the following link for UI 1: {cloudflare}")
logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui")
logger.message(f"KoboldAI is still loading your model but available at the following link: {cloudflare}")
logger.message(f"KoboldAI is still loading your model but available at the following link for the Classic UI: {cloudflare}/classic")
logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite")
logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Loading Model...]")
logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after its done loading.")

View File

@@ -80,7 +80,7 @@
"#@title <b><-- Select your model below and then click this to start KoboldAI</b>\n",
"#@markdown You can find a description of the models below along with instructions on how to start KoboldAI.\n",
"\n",
"Model = \"Nerys V2 6B\" #@param [\"Nerys V2 6B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Nerys 2.7B\", \"AID 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"OPT 2.7B\", \"Fairseq Dense 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n",
"Model = \"Nerys V2 6B\" #@param [\"MythoMax 13B (United)\", \"Huginn 13B (United)\", \"Chronos 13B (United)\", \"Airoboros M2.0 13B (United)\", \"Holodeck 13B (United)\", \"Spring Dragon 13B (United)\", \"Nerys V2 6B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Nerys 2.7B\", \"AID 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"OPT 2.7B\", \"Fairseq Dense 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n",
"Version = \"Official\" #@param [\"Official\", \"United\"] {allow-input: true}\n",
"Provider = \"Cloudflare\" #@param [\"Localtunnel\", \"Cloudflare\"]\n",
"use_google_drive = True #@param {type:\"boolean\"}\n",
@@ -146,6 +146,36 @@
" Model = \"EleutherAI/gpt-neo-2.7B\"\n",
" path = \"\"\n",
" download = \"\"\n",
"elif Model == \"Huginn 13B (United)\":\n",
" Model = \"The-Face-Of-Goonery/Huginn-13b-v1.2\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Chronos 13B (United)\":\n",
" Model = \"elinas/chronos-13b-v2\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Airoboros M2.0 13B (United)\":\n",
" Model = \"jondurbin/airoboros-l2-13b-gpt4-m2.0\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"MythoMax 13B (United)\":\n",
" Model = \"Gryphe/MythoMax-L2-13b\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Spring Dragon 13B (United)\":\n",
" Model = \"Henk717/spring-dragon\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Holodeck 13B (United)\":\n",
" Model = \"KoboldAI/LLAMA2-13B-Holodeck-1\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"\n",
"if Provider == \"Localtunnel\":\n",
" tunnel = \"--localtunnel yes\"\n",
@@ -193,6 +223,20 @@
"metadata": {
"id": "Lrm840I33hkC"
}
},
{
"cell_type": "code",
"source": [
"#@title <b>Model Cleaner</b>\n",
"#@markdown Out of space? Run this to remove all cached models (Google Drive models are not effected).\n",
"!rm -rf /content/KoboldAI-Client/cache/*\n"
],
"metadata": {
"cellView": "form",
"id": "5k8fK4F6UiTs"
},
"execution_count": null,
"outputs": []
}
]
}

View File

@@ -47,10 +47,10 @@ dependencies:
- pydub
- diffusers
- git+https://github.com/0cc4m/hf_bleeding_edge/
- --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html
- gptq_koboldai==0.0.6
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/henk717/KoboldAI/releases/download/Snapshot-11-08-23/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- einops
- peft==0.3.0
- scipy
- --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
- exllama==0.0.6

View File

@@ -7,7 +7,7 @@ gensettingstf = [
"min": 16,
"max": 512,
"step": 2,
"default": 80,
"default": 200,
"tooltip": "Number of tokens to be generated. Higher values will take longer to generate.",
"menu_path": "Settings",
"sub_path": "Generation",
@@ -182,9 +182,9 @@ gensettingstf = [
"label": "Context Tokens",
"id": "settknmax",
"min": 512,
"max": 2048,
"max": 4096,
"step": 8,
"default": 1024,
"default": 2048,
"tooltip": "Number of context tokens to submit to the AI for sampling. Make sure this is higher than Output Length. Higher values increase VRAM/RAM usage.",
"menu_path": "Settings",
"sub_path": "Generation",
@@ -296,7 +296,7 @@ gensettingstf = [
"max": 1,
"step": 1,
"default": 0,
"tooltip": "Scans the AI's output for World Info keys as it is generating the one.",
"tooltip": "Look for World Info keys in the AI's response while it is still being generated.",
"menu_path": "World Info",
"sub_path": "",
"classname": "story",
@@ -413,6 +413,23 @@ gensettingstf = [
,
"ui_level": 2
},
{
"UI_V2_Only": True,
"uitype": "toggle",
"unit": "bool",
"label": "Smooth Streaming",
"id": "smoothstreaming",
"min": 0,
"max": 1,
"step": 1,
"default": 0,
"tooltip": "Makes Token Streaming type in characters, not tokens. Note that this is purely visual, and will likely increase delay in seeing the tokens.",
"menu_path": "Interface",
"sub_path": "UI",
"classname": "user",
"name": "smooth_streaming",
"ui_level": 1
},
{
"uitype": "toggle",
"unit": "bool",
@@ -739,7 +756,7 @@ gensettingstf = [
"max": 1,
"step": 1,
"default": 0,
"tooltip": "If enabled, experimental features will be displayed in the UI.",
"tooltip": "If enabled, experimental features will be displayed in the UI. Note: These features have been determined to be too unstable for standard use, and may corrupt your data. You're on your own from here.",
"menu_path": "Interface",
"sub_path": "UI",
"classname": "system",

View File

@@ -6,7 +6,7 @@ import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys
import shutil
from typing import List, Union
from io import BytesIO
from flask import has_request_context, session
from flask import has_request_context, session, request
from flask_socketio import join_room, leave_room
from collections import OrderedDict
import multiprocessing
@@ -130,11 +130,14 @@ class koboldai_vars(object):
original_story_name = story_name
if not multi_story:
story_name = 'default'
#Leave the old room and join the new one
logger.debug("Leaving room {}".format(session['story']))
leave_room(session['story'])
logger.debug("Joining room {}".format(story_name))
join_room(story_name)
# Leave the old room and join the new one if in socket context
if hasattr(request, "sid"):
logger.debug("Leaving room {}".format(session['story']))
leave_room(session['story'])
logger.debug("Joining room {}".format(story_name))
join_room(story_name)
session['story'] = story_name
logger.debug("Sending story reset")
self._story_settings[story_name]._socketio.emit("reset_story", {}, broadcast=True, room=story_name)
@@ -653,7 +656,7 @@ class model_settings(settings):
'welcome', 'welcome_default', 'simple_randomness', 'simple_creativity', 'simple_repitition',
'badwordsids', 'uid_presets', 'model', 'model_type', 'lazy_load', 'fp32_model', 'modeldim', 'horde_wait_time', 'horde_queue_position', 'horde_queue_size', 'newlinemode', 'tqdm_progress', 'tqdm_rem_time', '_tqdm']
settings_name = "model"
default_settings = {"rep_pen" : 1.1, "rep_pen_slope": 0.7, "rep_pen_range": 1024, "temp": 0.5, "top_p": 0.9, "top_k": 0, "top_a": 0.0, "tfs": 1.0, "typical": 1.0,
default_settings = {"rep_pen" : 1.1, "rep_pen_slope": 1.0, "rep_pen_range": 2048, "temp": 0.5, "top_p": 0.9, "top_k": 0, "top_a": 0.0, "tfs": 1.0, "typical": 1.0,
"sampler_order": [6,0,1,2,3,4,5]}
def __init__(self, socketio, koboldai_vars):
self.enable_whitelist = False
@@ -677,7 +680,7 @@ class model_settings(settings):
<div id='welcome-logo-container'><img id='welcome-logo' src='static/Welcome_Logo.png' draggable='False'></div>
<div class='welcome_text'>
<div id="welcome-text-content">Please load a model from the left.<br/>
If you encounter any issues, please click the Download debug dump link in the Home tab on the left flyout and attach the downloaded file to your error report on <a href='https://github.com/ebolam/KoboldAI/issues'>Github</a>, <a href='https://www.reddit.com/r/KoboldAI/'>Reddit</a>, or <a href='https://discord.gg/XuQWadgU9k'>Discord</a>.
If you encounter any issues, please click the Download debug dump link in the Home tab on the left flyout and attach the downloaded file to your error report on <a href='https://github.com/ebolam/KoboldAI/issues'>Github</a>, <a href='https://www.reddit.com/r/KoboldAI/'>Reddit</a>, or <a href='https://koboldai.org/discord'>Discord</a>.
A redacted version (without story text) is available.
</div>
</div>""" # Custom Welcome Text
@@ -685,18 +688,19 @@ class model_settings(settings):
self._koboldai_vars = koboldai_vars
self.alt_multi_gen = False
self.bit_8_available = None
self.supported_gen_modes = []
def reset_for_model_load(self):
self.simple_randomness = 0 #Set first as this affects other outputs
self.simple_creativity = 0 #Set first as this affects other outputs
self.simple_repitition = 0 #Set first as this affects other outputs
self.max_length = 1024 # Maximum number of tokens to submit per action
self.max_length = 2048 # Maximum number of tokens to submit per action
self.ikmax = 3000 # Maximum number of characters to submit to InferKit
self.genamt = 80 # Amount of text for each action to generate
self.genamt = 200 # Amount of text for each action to generate
self.ikgen = 200 # Number of characters for InferKit to generate
self.rep_pen = 1.1 # Default generator repetition_penalty
self.rep_pen_slope = 0.7 # Default generator repetition penalty slope
self.rep_pen_range = 1024 # Default generator repetition penalty range
self.rep_pen_slope = 1.0 # Default generator repetition penalty slope
self.rep_pen_range = 2048 # Default generator repetition penalty range
self.temp = 0.5 # Default generator temperature
self.top_p = 0.9 # Default generator top_p
self.top_k = 0 # Default generator top_k
@@ -1155,6 +1159,7 @@ class user_settings(settings):
self.nogenmod = False
self.debug = False # If set to true, will send debug information to the client for display
self.output_streaming = True
self.smooth_streaming = True
self.show_probs = False # Whether or not to show token probabilities
self.beep_on_complete = False
self.img_gen_priority = 1
@@ -1755,11 +1760,15 @@ class KoboldStoryRegister(object):
def go_forward(self):
action_step = self.action_count+1
if action_step in self.actions:
if len(self.get_current_options()) == 1:
logger.warning("Going forward with this text: {}".format(self.get_current_options()[0]["text"]))
self.use_option([x['text'] for x in self.actions[action_step]["Options"]].index(self.get_current_options()[0]["text"]))
if action_step not in self.actions:
return
self.show_options(len(self.get_current_options()) > 1)
if len(self.get_current_options()) == 1:
logger.warning("Going forward with this text: {}".format(self.get_current_options()[0]["text"]))
self.use_option([x['text'] for x in self.actions[action_step]["Options"]].index(self.get_current_options()[0]["text"]))
def use_option(self, option_number, action_step=None):
if action_step is None:
action_step = self.action_count+1
@@ -1797,6 +1806,16 @@ class KoboldStoryRegister(object):
process_variable_changes(self._socketio, "story", 'actions', {"id": action_step, 'action': self.actions[action_step]}, None)
self.set_game_saved()
def show_options(
self,
should_show: bool,
force: bool = False,
) -> None:
if self._koboldai_vars.aibusy and not force:
return
self._socketio.emit("show_options", should_show, broadcast=True, room="UI_2")
def delete_action(self, action_id, keep=True):
if action_id in self.actions:
old_options = copy.deepcopy(self.actions[action_id]["Options"])
@@ -1889,34 +1908,19 @@ class KoboldStoryRegister(object):
process_variable_changes(self._socketio, "story", 'actions', {"id": self.action_count+1, 'action': self.actions[self.action_count+1]}, None)
else:
#We're streaming single options so our output is our selected
#First we need to see if this is actually the prompt. If so we'll just not do streaming:
if self.story_settings.prompt != "":
if self.action_count+1 in self.actions:
if self._koboldai_vars.tokenizer is not None:
selected_text_length = len(self._koboldai_vars.tokenizer.encode(self.actions[self.action_count+1]['Selected Text']))
else:
selected_text_length = 0
self.actions[self.action_count+1]['Selected Text'] = "{}{}".format(self.actions[self.action_count+1]['Selected Text'], text_list[0])
self.actions[self.action_count+1]['Selected Text Length'] = selected_text_length
else:
if self._koboldai_vars.tokenizer is not None:
selected_text_length = len(self._koboldai_vars.tokenizer.encode(text_list[0]))
else:
selected_text_length = 0
self.actions[self.action_count+1] = {"Selected Text": text_list[0], "Selected Text Length": selected_text_length, "Options": [], "Time": int(time.time())}
if self._koboldai_vars.tokenizer is not None:
if len(self._koboldai_vars.tokenizer.encode(self.actions[self.action_count+1]['Selected Text'])) != self._koboldai_vars.genamt:
#ui1
if queue is not None:
queue.put(["from_server", {"cmd": "streamtoken", "data": [{
"decoded": text_list[0],
"probabilities": self.probability_buffer
}]}, {"broadcast":True, "room":"UI_1"}])
#process_variable_changes(self._socketio, "actions", "Options", {"id": self.action_count+1, "options": self.actions[self.action_count+1]["Options"]}, {"id": self.action_count+1, "options": None})
process_variable_changes(self._socketio, "story", 'actions', {"id": self.action_count+1, 'action': self.actions[self.action_count+1]}, None)
queue.put(["stream_tokens", text_list, {"broadcast": True, "room": "UI_2"}])
# UI1
queue.put([
"from_server", {
"cmd": "streamtoken",
"data": [{
"decoded": text_list[0],
"probabilities": self.probability_buffer
}],
},
{"broadcast":True, "room": "UI_1"}
])
def set_probabilities(self, probabilities, action_id=None):
self.probability_buffer = probabilities

View File

@@ -3,6 +3,8 @@ from __future__ import annotations
from dataclasses import dataclass
import time
from typing import List, Optional, Union
from enum import Enum
from logger import logger
import torch
@@ -12,6 +14,7 @@ from transformers import (
GPT2Tokenizer,
AutoTokenizer,
)
from modeling.stoppers import Stoppers
from modeling.tokenizer import GenericTokenizer
from modeling import logits_processors
@@ -144,7 +147,10 @@ class GenerationSettings:
class ModelCapabilities:
embedding_manipulation: bool = False
post_token_hooks: bool = False
# Used to gauge if manual stopping is possible
stopper_hooks: bool = False
# TODO: Support non-live probabilities from APIs
post_token_probs: bool = False
@@ -154,6 +160,12 @@ class ModelCapabilities:
# Some models need to warm up the TPU before use
uses_tpu: bool = False
class GenerationMode(Enum):
STANDARD = "standard"
FOREVER = "forever"
UNTIL_EOS = "until_eos"
UNTIL_NEWLINE = "until_newline"
UNTIL_SENTENCE_END = "until_sentence_end"
class InferenceModel:
"""Root class for all models."""
@@ -256,6 +268,7 @@ class InferenceModel:
self,
text: list,
found_entries: set,
gen_mode: GenerationMode = GenerationMode.STANDARD,
):
"""Generate story text. Heavily tied to story-specific parameters; if
you are making a new generation-based feature, consider `generate_raw()`.
@@ -263,6 +276,7 @@ class InferenceModel:
Args:
text (list): Encoded input tokens
found_entries (set): Entries found for Dynamic WI
gen_mode (GenerationMode): The GenerationMode to pass to raw_generate. Defaults to GenerationMode.STANDARD
Raises:
RuntimeError: if inconsistencies are detected between the internal state and Lua state -- sanity check
@@ -358,6 +372,7 @@ class InferenceModel:
seed=utils.koboldai_vars.seed
if utils.koboldai_vars.full_determinism
else None,
gen_mode=gen_mode
)
logger.debug(
"core_generate: run raw_generate pass {} {}s".format(
@@ -532,6 +547,7 @@ class InferenceModel:
found_entries: set = (),
tpu_dynamic_inference: bool = False,
seed: Optional[int] = None,
gen_mode: GenerationMode = GenerationMode.STANDARD,
**kwargs,
) -> GenerationResult:
"""A wrapper around `_raw_generate()` that handles gen_state and other stuff. Use this to generate text outside of the story.
@@ -547,6 +563,7 @@ class InferenceModel:
is_core (bool, optional): Whether this generation is a core story generation. Defaults to False.
single_line (bool, optional): Generate one line only.. Defaults to False.
found_entries (set, optional): Entries found for Dynamic WI. Defaults to ().
gen_mode (GenerationMode): Special generation mode. Defaults to GenerationMode.STANDARD.
Raises:
ValueError: If prompt type is weird
@@ -568,6 +585,29 @@ class InferenceModel:
"wi_scanner_excluded_keys", set()
)
self.gen_state["allow_eos"] = False
temp_stoppers = []
if gen_mode not in self.get_supported_gen_modes():
gen_mode = GenerationMode.STANDARD
logger.warning(f"User requested unsupported GenerationMode '{gen_mode}'!")
if gen_mode == GenerationMode.FOREVER:
self.gen_state["stop_at_genamt"] = False
max_new = 1e7
elif gen_mode == GenerationMode.UNTIL_EOS:
self.gen_state["allow_eos"] = True
self.gen_state["stop_at_genamt"] = False
max_new = 1e7
elif gen_mode == GenerationMode.UNTIL_NEWLINE:
# TODO: Look into replacing `single_line` with `generation_mode`
temp_stoppers.append(Stoppers.newline_stopper)
elif gen_mode == GenerationMode.UNTIL_SENTENCE_END:
temp_stoppers.append(Stoppers.sentence_end_stopper)
self.stopper_hooks += temp_stoppers
utils.koboldai_vars.inference_config.do_core = is_core
gen_settings = GenerationSettings(*(generation_settings or {}))
@@ -597,13 +637,21 @@ class InferenceModel:
)
time_end = round(time.time() - time_start, 2)
tokens_per_second = round(len(result.encoded[0]) / time_end, 2)
try:
tokens_per_second = round(len(result.encoded[0]) / time_end, 2)
except ZeroDivisionError:
# Introducing KoboldAI's fastest model: ReadOnly!
tokens_per_second = 0
if not utils.koboldai_vars.quiet:
logger.info(
f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second."
)
for stopper in temp_stoppers:
self.stopper_hooks.remove(stopper)
return result
def generate(
@@ -620,3 +668,19 @@ class InferenceModel:
def _post_token_gen(self, input_ids: torch.LongTensor) -> None:
for hook in self.post_token_hooks:
hook(self, input_ids)
def get_supported_gen_modes(self) -> List[GenerationMode]:
"""Returns a list of compatible `GenerationMode`s for the current model.
Returns:
List[GenerationMode]: A list of compatible `GenerationMode`s.
"""
ret = [GenerationMode.STANDARD]
if self.capabilties.stopper_hooks:
ret += [
GenerationMode.FOREVER,
GenerationMode.UNTIL_NEWLINE,
GenerationMode.UNTIL_SENTENCE_END,
]
return ret
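Illustrative only: a small helper sketch (names are assumptions, not part of the commit) showing how a caller might pick a generation mode the loaded backend can honor via get_supported_gen_modes():
```python
from modeling.inference_model import GenerationMode

def pick_gen_mode(requested, supported):
    # Fall back to STANDARD when the backend cannot honor the requested mode.
    return requested if requested in supported else GenerationMode.STANDARD

# Example: pick_gen_mode(GenerationMode.FOREVER, model.get_supported_gen_modes())
```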

View File

@@ -27,6 +27,10 @@ model_backend_name = "Huggingface"
model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
class model_backend(HFTorchInferenceModel):
def __init__(self) -> None:
super().__init__()
self.use_4_bit = False
def is_valid(self, model_name, model_path, menu_path):
base_is_valid = super().is_valid(model_name, model_path, menu_path)
path = False
@@ -58,15 +62,15 @@ class model_backend(HFTorchInferenceModel):
"unit": "text",
"label": "Quantization",
"id": "quantization",
"default": temp['quantization'] if 'quantization' in temp else 'none',
"default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
"tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
"menu_path": "Layers",
"children": [{'text': 'None', 'value':'none'},{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}],
"children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
"extra_classes": "",
"refresh_model_inputs": False
})
else:
logger.warning("Bitsandbytes is not installed, you can not use Huggingface models in 4-bit")
logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models")
return requested_parameters
def set_input_parameters(self, parameters):
@@ -124,7 +128,8 @@ class model_backend(HFTorchInferenceModel):
# We must disable low_cpu_mem_usage and if using a GPT-2 model
# because GPT-2 is not compatible with this feature yet.
tf_kwargs.pop("low_cpu_mem_usage", None)
tf_kwargs.pop("quantization_config", None)
# Also, lazy loader doesn't support GPT-2 models
self.lazy_load = False

View File

@@ -7,7 +7,7 @@ import torch
import re
import shutil
import sys
from typing import Union
from typing import Dict, Union
import utils
import modeling.lazy_loader as lazy_loader
@@ -82,13 +82,109 @@ def get_gptq_version(fpath):
logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
return 0, False
def load_quant_offload_device_map(
load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False,
):
from gptq.offload import (
find_layers,
llama_offload_forward,
gptneox_offload_forward,
gptj_offload_forward,
opt_offload_forward,
bigcode_offload_forward
)
from transformers.models.llama.modeling_llama import LlamaModel
from transformers.models.opt.modeling_opt import OPTModel
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel
from transformers.models.gptj.modeling_gptj import GPTJModel
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel
model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias)
m, layers, remaining = find_layers(model)
type(m).non_offload_forward = type(m).forward
# Hook offload_forward into found model
if type(m) == LlamaModel:
type(m).forward = llama_offload_forward
elif type(m) == GPTNeoXModel:
type(m).forward = gptneox_offload_forward
elif type(m) == GPTJModel:
type(m).forward = gptj_offload_forward
elif type(m) == OPTModel:
type(m).forward = opt_offload_forward
elif type(m) == GPTBigCodeModel:
type(m).forward = bigcode_offload_forward
else:
raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader")
layers_done = len([1 for v in device_map.values() if v != "cpu"])
m.cpu_device = torch.device("cpu")
m.fast_offload = layers_done > len(layers) // 2
m.layer_count = len(layers)
m.cpu_layers = len(layers) - layers_done
m.gpu_layers = layers_done
m.offload_type = offload_type
# HACK
m.primary_gpu = list(device_map.values())[0]
if "layers" not in dir(m):
m.layers = layers
for i in range(len(layers)):
dev = None
for key, device in device_map.items():
key = int(*[x for x in key.split(".") if x.isdecimal()])
if key == i:
dev = device
break
if dev is None:
raise ValueError
layers[key].to(dev, torch.float16, False)
for module in remaining:
module.to(m.primary_gpu)
return model
class model_backend(HFTorchInferenceModel):
def is_valid(self, model_name, model_path, menu_path):
gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
return bool(gptq_model)
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
requested_parameters = super().get_requested_parameters(model_name, model_path, menu_path, parameters)
if model_name != 'customhuggingface' or "custom_model_name" in parameters:
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
else:
temp = {}
requested_parameters.append({
"uitype": "dropdown",
"unit": "text",
"label": "Implementation",
"id": "implementation",
"default": temp['implementation'] if 'implementation' in temp else 'occam',
"tooltip": "Which GPTQ provider to use?",
"menu_path": "Layers",
"children": [{'text': 'Occam GPTQ', 'value': 'occam'}, {'text': 'AutoGPTQ', 'value': 'AutoGPTQ'}],
"extra_classes": "",
"refresh_model_inputs": False
})
return requested_parameters
def set_input_parameters(self, parameters):
super().set_input_parameters(parameters)
self.implementation = parameters['implementation'] if 'implementation' in parameters else "occam"
def _load(self, save_model: bool, initial_load: bool) -> None:
try:
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
# Make model path the same as the model name to make this consistent
# with the other loading method if it isn't a known model type. This
# code is not just a workaround for below, it is also used to make the
@@ -98,7 +194,7 @@ class model_backend(HFTorchInferenceModel):
self.init_model_config()
self.lazy_load = False
self.lazy_load = True
gpulayers = self.breakmodel_config.gpu_blocks
@@ -107,10 +203,6 @@ class model_backend(HFTorchInferenceModel):
except (ValueError, AttributeError):
self.gpu_layers_list = [utils.num_layers(self.model_config)]
tf_kwargs = {
"low_cpu_mem_usage": True,
}
# If we're using torch_lazy_loader, we need to get breakmodel config
# early so that it knows where to load the individual model tensors
logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
@@ -123,9 +215,6 @@ class model_backend(HFTorchInferenceModel):
self.breakmodel_device_config(self.model_config)
if self.lazy_load:
# torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
tf_kwargs.pop("low_cpu_mem_usage", None)
# If we're using lazy loader, we need to figure out what the model's hidden layers are called
with lazy_loader.use_lazy_load(dematerialized_modules=True):
try:
@@ -141,7 +230,7 @@ class model_backend(HFTorchInferenceModel):
if self.get_local_model_path():
# Model is stored locally, load it.
self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
self.model = self._get_model(self.get_local_model_path())
self.tokenizer = self._get_tokenizer(self.get_local_model_path())
else:
raise NotImplementedError("GPTQ Model downloading not implemented")
@@ -161,7 +250,58 @@ class model_backend(HFTorchInferenceModel):
self.model.kai_model = self
utils.koboldai_vars.modeldim = self.get_hidden_size()
def _get_model(self, location: str, tf_kwargs: Dict):
def _patch_quant(self, device_map, quant_module) -> None:
def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs):
if isinstance(module, quant_module.QuantLinear):
return
for attr in dir(module):
tmp = getattr(module, attr)
name1 = name + '.' + attr if name != '' else attr
if name1 in names:
parts = name1.split(".")
device = None
for i in reversed(range(len(parts))):
maybe_key = ".".join(parts[:i])
if maybe_key in device_map:
device = device_map[maybe_key]
break
if device is None:
raise ValueError(f"No device for {name1}")
delattr(module, attr)
ql = quant_module.QuantLinear(
bits,
groupsize,
tmp.in_features,
tmp.out_features,
force_bias or tmp.bias is not None,
**kwargs,
)
ql = ql.to(device)
setattr(module, attr, ql)
for name1, child in module.named_children():
make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1, force_bias=force_bias)
quant_module.make_quant = make_quant
def _patch_quants(self, device_map) -> None:
# Load QuantLinears on the device corresponding to the device map
from gptq import quant_v3
from gptq import quant_v2
from gptq import quant_v1
for quant_module in [quant_v3, quant_v2, quant_v1]:
self._patch_quant(device_map, quant_module)
def _get_model(self, location: str):
import gptq
from gptq.gptj import load_quant as gptj_load_quant
from gptq.gptneox import load_quant as gptneox_load_quant
@@ -169,7 +309,12 @@ class model_backend(HFTorchInferenceModel):
from gptq.opt import load_quant as opt_load_quant
from gptq.bigcode import load_quant as bigcode_load_quant
from gptq.mpt import load_quant as mpt_load_quant
from gptq.offload import load_quant_offload
try:
import hf_bleeding_edge
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
v2_bias = False
@@ -181,50 +326,77 @@ class model_backend(HFTorchInferenceModel):
model_type = self.get_model_type()
logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")
if model_type == "gptj":
model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "gpt_neox":
model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "llama":
model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "opt":
model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "mpt":
model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "gpt_bigcode":
model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
else:
try:
import auto_gptq
from auto_gptq import AutoGPTQForCausalLM
except ImportError:
raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
try:
import hf_bleeding_edge
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
device_map = {}
# Monkey patch in hf_bleeding_edge to avoid having to trust remote code
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
if self.lazy_load:
with lazy_loader.use_lazy_load(dematerialized_modules=True):
metamodel = AutoModelForCausalLM.from_config(self.model_config)
if utils.args.cpu:
device_map = {name: "cpu" for name in utils.layers_module_names}
for name in utils.get_missing_module_names(
metamodel, list(device_map.keys())
):
device_map[name] = "cpu"
else:
device_map = self.breakmodel_config.get_device_map(
metamodel
)
# Patch in embeddings function
def get_input_embeddings(self):
return self.model.get_input_embeddings()
self._patch_quants(device_map)
type(model).get_input_embeddings = get_input_embeddings
with lazy_loader.use_lazy_load(
enable=self.lazy_load,
dematerialized_modules=False,
):
if self.implementation == "occam":
try:
if model_type == "gptj":
model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "gpt_neox":
model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "llama":
model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "opt":
model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "mpt":
model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "gpt_bigcode":
model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half()
else:
raise RuntimeError("Model not supported by Occam's GPTQ")
except:
self.implementation = "AutoGPTQ"
if self.implementation == "AutoGPTQ":
try:
import auto_gptq
from auto_gptq import AutoGPTQForCausalLM
except ImportError:
raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
# Patch in args support..
def generate(self, *args, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(*args, **kwargs)
# Monkey patch in hf_bleeding_edge to avoid having to trust remote code
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
type(model).generate = generate
try:
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"), device_map=device_map)
except:
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"), device_map=device_map, disable_exllama=True)
# Patch in embeddings function
def get_input_embeddings(self):
return self.model.get_input_embeddings()
type(model).get_input_embeddings = get_input_embeddings
# Patch in args support..
def generate(self, *args, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(*args, **kwargs)
type(model).generate = generate
return model

View File

@@ -19,8 +19,12 @@ class HFInferenceModel(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.model_config = None
#self.model_name = model_name
# TODO: model_name should probably be an instantiation parameter all the
# way down the inheritance chain.
self.model_name = None
self.path = None
self.hf_torch = False
self.model = None
self.tokenizer = None
@@ -217,6 +221,11 @@ class HFInferenceModel(InferenceModel):
torch.cuda.empty_cache()
except:
pass
def _pre_load(self) -> None:
# HACK: Make model instantiation work without UI parameters
self.model_name = self.model_name or utils.koboldai_vars.model
return super()._pre_load()
def _post_load(self) -> None:
self.badwordsids = koboldai_settings.badwordsids_default

View File

@@ -133,7 +133,8 @@ class model_backend(HFInferenceModel):
utils.koboldai_vars.compiling = True
def mtj_stopped_compiling_callback() -> None:
print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END)
if utils.koboldai_vars.compiling:
print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END)
utils.koboldai_vars.compiling = False
def mtj_settings_callback() -> dict:

View File

@@ -34,6 +34,7 @@ from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
GenerationMode,
GenerationResult,
GenerationSettings,
ModelCapabilities,
@@ -92,7 +93,11 @@ class HFTorchInferenceModel(HFInferenceModel):
self.hf_torch = True
self.lazy_load = True
self.low_mem = False
# `nobreakmodel` indicates that breakmodel cannot be used, while `breakmodel`
# indicates whether breakmodel is currently being used
self.nobreakmodel = False
self.breakmodel = False
self.post_token_hooks = [
PostTokenHooks.stream_tokens,
@@ -126,8 +131,13 @@ class HFTorchInferenceModel(HFInferenceModel):
return ret
def get_auxilary_device(self) -> Union[str, int, torch.device]:
return self.breakmodel_config.primary_device
if self.breakmodel:
return self.breakmodel_config.primary_device
if self.usegpu:
return "cuda:0"
else:
return "cpu"
def _get_target_dtype(self) -> Union[torch.float16, torch.float32]:
if self.breakmodel_config.primary_device == "cpu":
return torch.float32
@@ -228,9 +238,6 @@ class HFTorchInferenceModel(HFInferenceModel):
)
class KoboldLogitsWarperList(LogitsProcessorList):
def __init__(self):
pass
def __call__(
lw_self,
input_ids: torch.LongTensor,
@@ -247,17 +254,14 @@ class HFTorchInferenceModel(HFInferenceModel):
), f"Scores are None; processor '{processor}' is to blame"
return scores
def new_get_logits_warper(
beams: int = 1,
) -> LogitsProcessorList:
return KoboldLogitsWarperList()
def new_sample(self, *args, **kwargs):
assert kwargs.pop("logits_warper", None) is not None
kwargs["logits_warper"] = new_get_logits_warper(
beams=1,
)
if utils.koboldai_vars.newlinemode in ["s", "ns"]:
kwargs["logits_warper"] = KoboldLogitsWarperList()
if (
utils.koboldai_vars.newlinemode in ["s", "ns"]
and not m_self.gen_state["allow_eos"]
):
kwargs["eos_token_id"] = -1
kwargs.setdefault("pad_token_id", 2)
return new_sample.old_sample(self, *args, **kwargs)
@@ -329,7 +333,7 @@ class HFTorchInferenceModel(HFInferenceModel):
with torch.no_grad():
start_time = time.time()
genout = self.model.generate(
gen_in,
input_ids=gen_in,
do_sample=True,
max_length=min(
len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
@@ -608,3 +612,9 @@ class HFTorchInferenceModel(HFInferenceModel):
self.breakmodel = False
self.usegpu = False
return
def get_supported_gen_modes(self) -> List[GenerationMode]:
# This gen mode changes a torch patch so that EOS is no longer disallowed as a bad word.
return super().get_supported_gen_modes() + [
GenerationMode.UNTIL_EOS
]

View File

@@ -1,12 +1,10 @@
from __future__ import annotations
import torch
import requests
import numpy as np
from typing import List, Optional, Union
import utils
from logger import logger
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
@@ -15,29 +13,46 @@ from modeling.inference_model import (
)
model_backend_name = "Read Only"
model_backend_type = "Read Only" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
model_backend_type = "Read Only" # This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
class BasicAPIException(Exception):
"""To be used for errors when using the Basic API as an interface."""
class DummyHFTokenizerOut:
input_ids = np.array([[]])
class FacadeTokenizer:
def __init__(self):
self._koboldai_header = []
def decode(self, _input):
return ""
def encode(self, input_text):
return []
def __call__(self, *args, **kwargs) -> DummyHFTokenizerOut:
return DummyHFTokenizerOut()
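
Since `FacadeTokenizer` and `DummyHFTokenizerOut` above are plain stubs, their behaviour can be checked in isolation. A small usage sketch (assuming the two classes are in scope):

```
# Usage sketch of the read-only facade: every call returns an empty
# result, so downstream code keeps working without a real model.
tok = FacadeTokenizer()

print(tok.encode("any text"))            # -> []
print(repr(tok.decode([1, 2, 3])))       # -> ''
print(tok("any text").input_ids.shape)   # -> (1, 0)
```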
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
# Do not allow API to be served over the API
# Do not allow ReadOnly to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
self.tokenizer = self._tokenizer()
self.tokenizer: FacadeTokenizer = None
self.model = None
self.model_name = "Read Only"
def is_valid(self, model_name, model_path, menu_path):
return model_name == "ReadOnly"
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
def get_requested_parameters(
self, model_name, model_path, menu_path, parameters={}
):
requested_parameters = []
return requested_parameters
def set_input_parameters(self, parameters):
return
@@ -46,17 +61,9 @@ class model_backend(InferenceModel):
def _initialize_model(self):
return
class _tokenizer():
def __init__(self):
self._koboldai_header = []
def decode(self, _input):
return ""
def encode(self, input_text):
return []
def _load(self, save_model: bool = False, initial_load: bool = False) -> None:
self.tokenizer = self.tokenizer
self.tokenizer = FacadeTokenizer()
self.model = None
utils.koboldai_vars.noai = True
@@ -72,7 +79,7 @@ class model_backend(InferenceModel):
):
return GenerationResult(
model=self,
out_batches=np.array([]),
out_batches=np.array([[]]),
prompt=prompt_tokens,
is_whole_generation=True,
single_line=single_line,

View File

@@ -51,15 +51,12 @@ import time
import zipfile
import pickle
import torch
import numpy as np
import collections
import _codecs
import os
from typing import Any, Callable, Dict, Optional, Tuple, Type
from torch import Tensor
from torch.nn import Module
from torch.storage import UntypedStorage
from modeling.pickling import RestrictedUnpickler, use_custom_unpickler
from modeling.patches import LazyloadPatches
# Safetensors is a dependency for the local version, TPU/Colab doesn't
@@ -176,9 +173,6 @@ class TorchLazyTensor(LazyTensor):
CheckpointChunkCache.key = self.key
ziproot = checkpoint.namelist()[0].split("/")[0]
CheckpointChunkCache.handle = checkpoint.open(f"{ziproot}/data/{self.key}", "r")
else:
# Cache hit. Hip hip hooray! :^)
# print(".", end="", flush=True)
@@ -239,86 +233,11 @@ class SafetensorsLazyTensor(LazyTensor):
self.checkpoint_file, tensor_key=self.key, device=self.location
)
def _patched_rebuild_from_type_v2(func, new_type, args, state):
"""A patched version of torch._tensor._rebuild_from_type_v2 that
does not attempt to convert `LazyTensor`s to `torch.Tensor`s."""
ret = func(*args)
# BEGIN PATCH
transformation_ok = isinstance(ret, LazyTensor) and new_type == Tensor
if type(ret) is not new_type and not transformation_ok:
# END PATCH
ret = ret.as_subclass(new_type)
# Tensor does define __setstate__ even though it doesn't define
# __getstate__. So only use __setstate__ if it is NOT the one defined
# on Tensor
if (
getattr(ret.__class__, "__setstate__", Tensor.__setstate__)
is not Tensor.__setstate__
):
ret.__setstate__(state)
else:
ret = torch._utils._set_obj_state(ret, state)
return ret
class RestrictedUnpickler(pickle.Unpickler):
def original_persistent_load(self, saved_id):
return super().persistent_load(saved_id)
def forced_persistent_load(self, saved_id):
if saved_id[0] != "storage":
raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'")
return self.original_persistent_load(saved_id)
def find_class(self, module, name):
if module == "collections" and name == "OrderedDict":
return collections.OrderedDict
elif module == "torch._utils" and name == "_rebuild_tensor_v2":
return torch._utils._rebuild_tensor_v2
elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
return _patched_rebuild_from_type_v2
elif module == "torch" and name in (
"DoubleStorage",
"FloatStorage",
"HalfStorage",
"LongStorage",
"IntStorage",
"ShortStorage",
"CharStorage",
"ByteStorage",
"BoolStorage",
"BFloat16Storage",
"Tensor",
):
return getattr(torch, name)
elif module == "numpy.core.multiarray" and name == "scalar":
return np.core.multiarray.scalar
elif module == "numpy" and name == "dtype":
return np.dtype
elif module == "_codecs" and name == "encode":
return _codecs.encode
else:
# Forbid everything else.
qualified_name = name if module == "__builtin__" else f"{module}.{name}"
raise pickle.UnpicklingError(
f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}"
)
def load(self, *args, **kwargs):
self.original_persistent_load = getattr(
self, "persistent_load", pickle.Unpickler.persistent_load
)
self.persistent_load = self.forced_persistent_load
return super().load(*args, **kwargs)
class _LazyUnpickler(RestrictedUnpickler):
lazy_loaded_storages: Dict[str, LazyTensor]
def __init__(self, *args, **kwargs):
# print(args, kwargs)
self.lazy_loaded_storages = {}
return super().__init__(*args, **kwargs)
@@ -376,7 +295,7 @@ def patch_safetensors(callback):
# (70 tensors/s -> 65 tensors/s). The memory savings probably
# shouldn't be happening; maybe there's a memory leak
# somewhere in our pipeline with CPU tensors.
intermediary_device = "cuda"
intermediary_device = "cuda:0"
else:
intermediary_device = "cpu"
@@ -409,27 +328,9 @@ def patch_safetensors(callback):
return tensors
transformers.modeling_utils.safe_load_file = safetensors_load
safetensors.torch.load_file = safetensors_load
@contextlib.contextmanager
def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
try:
old_unpickler = pickle.Unpickler
pickle.Unpickler = unpickler
old_pickle_load = pickle.load
def new_pickle_load(*args, **kwargs):
return pickle.Unpickler(*args, **kwargs).load()
pickle.load = new_pickle_load
yield
finally:
pickle.Unpickler = old_unpickler
pickle.load = old_pickle_load
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
for hook in self._load_state_dict_pre_hooks.values():
hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)

View File

@@ -233,6 +233,8 @@ class PhraseBiasLogitsProcessor:
token_seqs = self._get_token_sequence(phrase)
variant_deltas = {}
for token_seq in token_seqs:
if not token_seq:
continue
bias_index = self._find_intersection(input_ids, token_seq)
# Ensure completion after completion_threshold tokens
@@ -267,6 +269,14 @@ class PhraseBiasLogitsProcessor:
for batch in range(scores_shape[0]):
for token, bias in self._get_biased_tokens(input_ids[batch]).items():
scores[batch][token] += bias
if bias > 0 and bool(scores[batch][token].isneginf()):
# Adding a bias to -inf does nothing, so just set the score to the
# bias directly for now. There may be a more mathematically correct
# way to do this, but it works. Also, make sure the bias is actually
# positive: don't give a -inf token more of a chance by setting it
# to -0.5!
scores[batch][token] = bias
else:
scores[batch][token] += bias
return scores
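
The `-inf` special case above exists because negative infinity absorbs any finite addition in IEEE floating point (and therefore in torch). A short demonstration, as a standalone sketch:

```
import torch

# Adding a finite bias to -inf leaves the score at -inf, so a banned
# token would stay banned no matter how large the positive bias is.
scores = torch.tensor([0.0, float("-inf")])
print(scores[1] + 5.0)    # tensor(-inf)

# The processor therefore overwrites the score with the bias directly.
scores[1] = 5.0
print(scores[1])          # tensor(5.)
```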

View File

@@ -129,15 +129,33 @@ def patch_transformers_generation() -> None:
class LazyloadPatches:
class StateDictFacade(dict):
def __init__(self, state_dict):
self.update(state_dict)
def __getitem__(self, name):
return super().__getitem__(name).materialize(map_location="cuda:0")
old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model
torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict
def __enter__() -> None:
transformers.modeling_utils._load_state_dict_into_meta_model = (
LazyloadPatches._load_state_dict_into_meta_model
)
torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict
def __exit__(exc_type, exc_value, exc_traceback) -> None:
transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict
torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict
def _torch_load_from_state_dict(self, state_dict, *args, **kwargs):
return LazyloadPatches.torch_old_load_from_state_dict(
self,
LazyloadPatches.StateDictFacade(state_dict),
*args,
**kwargs
)
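
The enter/exit hooks above follow a swap-and-restore pattern: stash the original callables as class attributes, replace them on entry, and restore them on exit. The hooks take no `self`, which suggests they are invoked explicitly on the class; the toy analogue below mirrors that, patching `json.loads` instead of the transformers/torch loaders (names and target are illustrative only):

```
import json

class JsonLoadsPatch:
    """Toy analogue of the swap-and-restore patching above, applied to
    json.loads instead of the transformers/torch loaders."""

    old_loads = json.loads

    def __enter__() -> None:
        # No-self hook, mirroring the signatures above.
        json.loads = lambda s, *a, **kw: JsonLoadsPatch.old_loads(s, *a, **kw)

    def __exit__(exc_type, exc_value, exc_traceback) -> None:
        json.loads = JsonLoadsPatch.old_loads

# Entered and exited explicitly on the class, no instance needed.
JsonLoadsPatch.__enter__()
try:
    print(json.loads("{}"))  # routed through the patched function -> {}
finally:
    JsonLoadsPatch.__exit__(None, None, None)
```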
def _load_state_dict_into_meta_model(
model,

111
modeling/pickling.py Normal file
View File

@@ -0,0 +1,111 @@
from __future__ import annotations
import collections
import contextlib
import pickle
import _codecs
from typing import Type
import numpy as np
import torch
from torch import Tensor
import modeling
def _patched_rebuild_from_type_v2(func, new_type, args, state):
"""A patched version of torch._tensor._rebuild_from_type_v2 that
does not attempt to convert `LazyTensor`s to `torch.Tensor`s."""
ret = func(*args)
# BEGIN PATCH
transformation_ok = isinstance(ret, modeling.lazy_loader.LazyTensor) and new_type == Tensor
if type(ret) is not new_type and not transformation_ok:
# END PATCH
ret = ret.as_subclass(new_type)
# Tensor does define __setstate__ even though it doesn't define
# __getstate__. So only use __setstate__ if it is NOT the one defined
# on Tensor
if (
getattr(ret.__class__, "__setstate__", Tensor.__setstate__)
is not Tensor.__setstate__
):
ret.__setstate__(state)
else:
ret = torch._utils._set_obj_state(ret, state)
return ret
class RestrictedUnpickler(pickle.Unpickler):
def original_persistent_load(self, saved_id):
return super().persistent_load(saved_id)
def forced_persistent_load(self, saved_id):
if saved_id[0] != "storage":
raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'")
return self.original_persistent_load(saved_id)
def find_class(self, module, name):
if module == "collections" and name == "OrderedDict":
return collections.OrderedDict
elif module == "torch._utils" and name in (
"_rebuild_tensor_v2",
"_rebuild_meta_tensor_no_storage",
):
return getattr(torch._utils, name)
elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
return _patched_rebuild_from_type_v2
elif module == "torch" and name in (
"DoubleStorage",
"FloatStorage",
"HalfStorage",
"LongStorage",
"IntStorage",
"ShortStorage",
"CharStorage",
"ByteStorage",
"BoolStorage",
"BFloat16Storage",
"Tensor",
"float16",
):
return getattr(torch, name)
elif module == "numpy.core.multiarray" and name == "scalar":
return np.core.multiarray.scalar
elif module == "numpy" and name == "dtype":
return np.dtype
elif module == "_codecs" and name == "encode":
return _codecs.encode
else:
# Forbid everything else.
qualified_name = name if module == "__builtin__" else f"{module}.{name}"
raise pickle.UnpicklingError(
f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}"
)
def load(self, *args, **kwargs):
self.original_persistent_load = getattr(
self, "persistent_load", pickle.Unpickler.persistent_load
)
self.persistent_load = self.forced_persistent_load
return super().load(*args, **kwargs)
@contextlib.contextmanager
def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
try:
old_unpickler = pickle.Unpickler
pickle.Unpickler = unpickler
old_pickle_load = pickle.load
def new_pickle_load(*args, **kwargs):
return pickle.Unpickler(*args, **kwargs).load()
pickle.load = new_pickle_load
yield
finally:
pickle.Unpickler = old_unpickler
pickle.load = old_pickle_load
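
A short usage sketch of `use_custom_unpickler`: inside the context, `pickle.load` is rerouted through `RestrictedUnpickler`, so only allow-listed globals (e.g. `collections.OrderedDict`) deserialize and anything else raises `pickle.UnpicklingError`. The payload below is purely illustrative:

```
import collections
import io
import pickle

from modeling.pickling import RestrictedUnpickler, use_custom_unpickler

# A harmless payload whose only global, collections.OrderedDict, is on
# the allow-list enforced by RestrictedUnpickler.find_class.
payload = io.BytesIO(pickle.dumps(collections.OrderedDict(a=1)))

with use_custom_unpickler(RestrictedUnpickler):
    # pickle.load is temporarily replaced, so this goes through the
    # restricted unpickler; a payload referencing os.system (for example)
    # would raise pickle.UnpicklingError instead of executing anything.
    restored = pickle.load(payload)

print(restored)  # an OrderedDict with a single key "a"
```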

View File

@@ -3,15 +3,12 @@ from __future__ import annotations
import torch
import utils
from modeling.inference_model import (
InferenceModel,
)
from modeling import inference_model
class Stoppers:
@staticmethod
def core_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
if not utils.koboldai_vars.inference_config.do_core:
@@ -62,7 +59,7 @@ class Stoppers:
@staticmethod
def dynamic_wi_scanner(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
if not utils.koboldai_vars.inference_config.do_dynamic_wi:
@@ -93,7 +90,7 @@ class Stoppers:
@staticmethod
def chat_mode_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
if not utils.koboldai_vars.chatmode:
@@ -118,7 +115,7 @@ class Stoppers:
@staticmethod
def stop_sequence_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
@@ -126,7 +123,12 @@ class Stoppers:
# null_character = model.tokenizer.encode(chr(0))[0]
if "completed" not in model.gen_state:
model.gen_state["completed"] = [False] * len(input_ids)
if utils.koboldai_vars.adventure:
extra_options = [">", "\n>"]
for option in extra_options:
if option not in utils.koboldai_vars.stop_sequence:
utils.koboldai_vars.stop_sequence.append(option)
# One issue is that the stop sequence may not actually align with the end of a
# token if it is a subsection of a longer token.
for stopper in utils.koboldai_vars.stop_sequence:
@@ -140,19 +142,31 @@ class Stoppers:
if all(model.gen_state["completed"]):
utils.koboldai_vars.generated_tkns = utils.koboldai_vars.genamt
del model.gen_state["completed"]
if utils.koboldai_vars.adventure: # Remove added adventure mode stop sequences
for option in extra_options:
if option in utils.koboldai_vars.stop_sequence:
utils.koboldai_vars.stop_sequence.remove(option)
return True
return False
@staticmethod
def singleline_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
"""If singleline mode is enabled, it's pointless to generate output beyond the first newline."""
"""Stop on occurances of newlines **if singleline is enabled**."""
# It might be better just to do this further up the line
if not utils.koboldai_vars.singleline:
return False
return Stoppers.newline_stopper(model, input_ids)
@staticmethod
def newline_stopper(
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
"""Stop on occurances of newlines."""
# Keep track of presence of newlines in each sequence; we cannot stop a
# batch member individually, so we must wait for all of them to contain
# a newline.
@@ -167,3 +181,30 @@ class Stoppers:
del model.gen_state["newline_in_sequence"]
return True
return False
@staticmethod
def sentence_end_stopper(
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
"""Stops at the end of sentences."""
# TODO: Make this more robust
SENTENCE_ENDS = [".", "?", "!"]
# We need to keep track of stopping for each batch, since we can't stop
# one individually.
if "sentence_end_in_sequence" not in model.gen_state:
model.gen_state["sentence_end_sequence"] = [False] * len(input_ids)
for sequence_idx, batch_sequence in enumerate(input_ids):
decoded = model.tokenizer.decode(batch_sequence[-1])
for end in SENTENCE_ENDS:
if end in decoded:
model.gen_state["sentence_end_sequence"][sequence_idx] = True
break
if all(model.gen_state["sentence_end_sequence"]):
del model.gen_state["sentence_end_sequence"]
return True
return False
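
`newline_stopper` and `sentence_end_stopper` share the same batch-wide pattern: flag each sequence as it meets the condition and only stop once every flag is set, because a batch cannot be stopped member by member. A standalone sketch of that pattern (plain lists instead of `gen_state`, illustrative names):

```
# Standalone sketch of the batch-wide stopping pattern used above:
# each sequence is flagged once it satisfies the condition, and the
# batch only stops when every sequence has been flagged.
def should_stop(last_token_texts, flags, triggers=(".", "?", "!")):
    for idx, text in enumerate(last_token_texts):
        if any(trigger in text for trigger in triggers):
            flags[idx] = True
    return all(flags)

flags = [False, False]
print(should_stop(["Hello", " world."], flags))   # False: sequence 0 not done yet
print(should_stop(["there.", " again"], flags))   # True: both sequences have ended
```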

View File

@@ -39,7 +39,6 @@ pytest-metadata==2.0.4
requests-mock==1.10.0
safetensors==0.3.1
git+https://github.com/0cc4m/hf_bleeding_edge/
--find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4
einops
peft==0.3.0
scipy
scipy

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,10 @@
/*----------------Global Colors------------------*/
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
:root {
--flyout_menu_width: 100%;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
:root {
--flyout_menu_width: 402px;
}
@@ -448,19 +448,19 @@ border-top-right-radius: var(--tabs_rounding);
cursor: pointer;
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.menu_icon.hidden {
display: inline-block !important;
}
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
.menu_pin {
display: none;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.menu_pin {
position: absolute;
top:10px;
@@ -516,7 +516,7 @@ border-top-right-radius: var(--tabs_rounding);
will-change: transform;
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.SideMenu.pinned {
right: calc(100% - var(--flyout_menu_width));
background-color: var(--flyout_background_pinned);
@@ -906,7 +906,7 @@ border-top-right-radius: var(--tabs_rounding);
grid-area: lefticon;
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.right_menu_icon.hidden {
display: inline-block !important;
@@ -937,7 +937,7 @@ border-top-right-radius: var(--tabs_rounding);
left: calc(100% - var(--flyout_menu_width));
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
/* Desktop Mode */
.rightSideMenu.pinned {
left: calc(100% - var(--flyout_menu_width));
@@ -959,14 +959,14 @@ border-top-right-radius: var(--tabs_rounding);
filter: brightness(40%);
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.story_menu_pin {
display: none;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.story_menu_pin {
position: absolute;
top:10px;
@@ -1259,17 +1259,46 @@ td.server_vars {
.world_info_label_container > .generate-button:hover { opacity: 1.0; }
.tag {
display: inline-block;
background-color: var(--wi_tag_color);
color: var(--wi_tag_text_color);
margin-right: 3px;
margin-top: 3px;
padding: 2px;
margin-right: 2px;
padding-left: 3px;
padding-right: 3px;
border-radius: var(--radius_wi_card);
border: solid;
border-color: var(--wi_tag_color);
}
.tag .tag_button {
cursor: pointer;
opacity: 0.4;
font-size: 16px;
position: relative;
}
.tag .delete_icon {
cursor: pointer;
top: 3px;
right: 3px;
}
.tag .add_icon {
top: 3px;
right: 3px;
}
.tag .tag_text {
display: inline-block;
outline: none;
position: relative;
right: 3px;
}
.placeholder_tag .tag_text:empty {
opacity: 0.4;
}
.oi[folder] {
@@ -1457,6 +1486,30 @@ td.server_vars {
line-height: 2;
}
/* Privacy Mode (Lock Screen) */
#privacy_mode {
height: unset;
width: unset;
position: relative;
top: unset;
left: unset;
}
#privacy_mode .popup_list_area {
display: flex;
align-items: center;
flex-direction: column;
padding-top: 10px;
padding-bottom: 10px;
padding-left: 15px;
padding-right: 15px;
}
#privacy_mode input {
margin-top: 15px;
width: 85%;
}
/* ---------------------------- OVERALL PAGE CONFIG ------------------------------*/
body {
background-color: var(--background);
@@ -1489,7 +1542,7 @@ body {
background-color: #cacaca80;
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
/* ------------------- Desktop Mode --------------------------- */
.main-grid {
transition: margin-left .5s, margin-right .5s;
@@ -1504,7 +1557,7 @@ body {
grid-template-columns: 30px auto 30% 30px;
grid-template-rows: auto min-content min-content 100px;
}
.main-grid[option_length="0"][model_numseqs="1"] {
.main-grid[hide-options="true"] {
grid-template-columns: 30px auto 0px 30px;
}
@@ -1523,7 +1576,7 @@ body {
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.main-grid {
transition: margin-left .5s;
@@ -1589,39 +1642,43 @@ body {
font-style: italic;
}
.sequence_area {
#token-stream-buffer {
white-space: pre-wrap;
}
#option-container {
margin-top: 10px;
grid-area: options;
background-color: var(--sequence_area_background);
overflow-y: scroll;
}
.sequence_area::-webkit-scrollbar {
#option-container::-webkit-scrollbar {
display: none;
}
@media only screen and (max-aspect-ratio: 7/5) {
.sequences {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: row;
overflow-x: scroll;
scroll-snap-type: x mandatory;
}
@media only screen and (max-aspect-ratio: 5/6) {
#option-container {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: row;
overflow-x: scroll;
scroll-snap-type: x mandatory;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
.sequences {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: column;
}
@media only screen and (min-aspect-ratio: 5/6) {
#option-container {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: column;
}
}
.sequence_row {
@@ -1907,7 +1964,7 @@ body {
overflow: hidden;
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.paddingimage {
grid-area: paddingimage;
margin: auto auto auto auto;
@@ -1915,14 +1972,14 @@ body {
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
.paddingimage {
visibility: hidden;
}
}
/*---------------------------------- Popups -------------------------------------------------*/
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
.popup {
position: absolute;
top: 10vh;
@@ -1939,7 +1996,7 @@ body {
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.popup {
position: absolute;
top: 10vh;
@@ -1962,6 +2019,7 @@ body {
color: var(--popup_title_bar_color_text);
text-align: center;
font-size: calc(1.3em + var(--font_size_adjustment));
user-select: none;
}
.popup .action_button {
@@ -2705,13 +2763,14 @@ body {
#context-menu > hr {
/* Division Color*/
border-top: 2px solid var(--context_menu_division);
margin: 5px 5px;
margin: 3px 5px;
}
.context-menu-item {
padding: 5px;
padding: 4px;
padding-right: 25px;
min-width: 100px;
white-space: nowrap;
}
.context-menu-item:hover {
@@ -2722,11 +2781,16 @@ body {
.context-menu-item > .material-icons-outlined {
position: relative;
top: 2px;
top: 3px;
font-size: 15px;
margin-right: 5px;
}
.context-menu-item > .context-menu-label {
position: relative;
top: 1px;
}
/* Substitutions */
#Substitutions {
margin-left: 10px;
@@ -2820,6 +2884,10 @@ body {
height: 100%;
}
#welcome_text a {
text-decoration: underline;
}
.welcome_text {
display: flex;
height: 100%;
@@ -2848,6 +2916,7 @@ body {
display: flex;
justify-content: center;
align-items: center;
pointer-events: none;
}
#welcome-text-content {
@@ -3537,10 +3606,15 @@ h2 .material-icons-outlined {
}
.section_header {
font-weight: bold;
margin-left: 2px;
margin-bottom: 2px;
}
.story_category_area > * > label {
user-select: none
}
.help_text {
margin-left: 6px;
margin-bottom: 0.7em;

File diff suppressed because it is too large Load Diff

View File

@@ -44,7 +44,7 @@
</div>
<!------------ Main Screen--------------------->
<div id="main-grid" class="main-grid settings_pinned var_sync_alt_model_numseqs" onclick="close_menus();" option_length="0">
<div id="main-grid" class="main-grid settings_pinned var_sync_alt_model_numseqs" onclick="close_menus();" hide-options="true">
<!------------ Game Text Screen--------------------->
<div class="gamescreen" id="gamescreen" context-menu="gamescreen">
<div id="disconnect_message"><center><h1>Disconnected</h1></center></div>
@@ -53,13 +53,13 @@
<div id="welcome_text" class="var_sync_model_welcome" draggable="False"></div>
</div>
<div class="gametext" id="Selected Text" contenteditable=false tabindex=0 onpaste="check_game_after_paste()" onfocusout="savegametextchanges();" onclick="return set_edit(event)" onkeyup="return set_edit(event);">
<div class="gametext" id="Selected Text" contenteditable="false" tabindex="0" onkeyup="return set_edit(event);">
<span id="story_prompt" class="var_sync_story_prompt var_sync_alt_story_prompt_in_ai rawtext hidden" chunk="-1"></span></div><!--don't move the /div down or it'll cause odd spacing issues in the UI--->
</div>
<!------------ Sequences --------------------->
<div id="action_count" class="var_sync_actions_Action_Count hidden"></div>
<div id="Select Options" class="sequence_area"></div>
<div id="option-container" class="hidden"></div>
<!-- Story Review -->
<div id="story-review" class="hidden">
@@ -110,9 +110,9 @@
<button type="button" class="btn action_button" style="width: 30px; padding: 0px;" onclick='play_pause_tts()' aria-label="play"><span id="play_tts" class="material-icons-outlined" style="font-size: 1.4em;">play_arrow</span></button>
<button type="button" class="btn action_button" style="width: 30px; padding: 0px;" onclick='stop_tts()' aria-label="play"><span id="stop_tts" class="material-icons-outlined" style="font-size: 1.4em;">stop</span></button>
</span>
<button type="button" class="btn action_button submit var_sync_alt_system_aibusy" system_aibusy=False id="btnsubmit" onclick="storySubmit();">Submit</button>
<button type="button" class="btn action_button submit var_sync_alt_system_aibusy" system_aibusy=False id="btnsubmit" onclick="storySubmit();" context-menu="submit-button">Submit</button>
<button type="button" class="btn action_button submited var_sync_alt_system_aibusy" system_aibusy=False id="btnsent"><img id="thinking" src="static/thinking.gif" class="force_center" onclick="socket.emit('abort','');"></button>
<button type="button" class="btn action_button back var_sync_alt_system_aibusy" system_aibusy=False onclick="storyBack();" aria-label="undo"><span class="material-icons-outlined" style="font-size: 1.4em;">replay</span></button>
<button type="button" class="btn action_button back var_sync_alt_system_aibusy" system_aibusy=False onclick="storyBack();" aria-label="undo" context-menu="undo-button"><span class="material-icons-outlined" style="font-size: 1.4em;">replay</span></button>
<button type="button" class="btn action_button redo var_sync_alt_system_aibusy" system_aibusy=False onclick="storyRedo();" aria-label="redo"><span class="material-icons-outlined" style="font-size: 1.4em;">arrow_forward</span></button>
<button type="button" class="btn action_button retry var_sync_alt_system_aibusy" system_aibusy=False onclick="storyRetry();" aria-label="retry"><span class="material-icons-outlined" style="font-size: 1.4em;">autorenew</span></button>
</div>

View File

@@ -70,12 +70,12 @@
</div>
</div>
<!---------------- Private Mode Unlock screen ---------------------->
<div id="privacy_mode" class="popup-window popup">
<div id="privacy_mode" class="popup-window popup" allow-close="false">
<div class="title">
<div class="popuptitletext">Locked</div>
</div>
<div id="popup_list_area" class="popup_list_area">
This story is in private mode. Please enter password to unlock<br/>
This story is in private mode. Please enter the password to unlock it.<br/>
<input type="password" id="privacy_password"/>
</div>
<div class="popup_load_cancel">

View File

@@ -50,6 +50,14 @@
<label for="authors_notes">Author's Notes:</label><br/>
<textarea autocomplete="off" rows=16 id="authors_notes" class="var_sync_story_authornote var_sync_alt_story_authornote_length fullwidth" oninput="autoResize(this)" onchange='sync_to_server(this);'></textarea><br/>
<div class="setting_tile_area">
{% with menu='author_notes' %}
{% with sub_path='' %}
{% include 'settings item.html' %}
{% endwith %}
{% endwith %}
</div>
<h4 class="section_header">Genre</h4>
<div class="help_text">Styles the AI will attempt to imitate. Effectiveness depends on model.</div>
<input id="genre-input" class="fullwidth" placeholder="Fantasy" autocomplete="off" spellcheck="false">
@@ -75,14 +83,6 @@
}
</script>
</div>
<div class="setting_tile_area">
{% with menu='author_notes' %}
{% with sub_path='' %}
{% include 'settings item.html' %}
{% endwith %}
{% endwith %}
</div>
</div>
</div>
<div id="story_menu_notes" class="story_category_area tab-target tab-target-story hidden">
@@ -97,7 +97,7 @@
<div id="story_menu_wi" class="story_category_area tab-target tab-target-story hidden">
<h4 class="section_header" style="margin-left: 12px;">World Info</h4>
<div class="help_text" style="margin-left: 20px;">
Lore information, which the AI recalls by certain words.
Lore information, which the AI recalls with the mention of certain words.
<span class="helpicon material-icons-outlined" tooltip="Use this instead of Memory for information on things like characters, objects, events, places, and anything else with detail.">help_icon</span>
</div>
<div class="setting_tile_area wi_settings">

View File

@@ -22,12 +22,16 @@
<span
class="world_info_item_type"
contenteditable="true"
data-placeholder="Person"
data-placeholder="..."
spellcheck="false"
></span> <span class="helpicon material-icons-outlined" tooltip="Please enter a noun that describes a person, place or thing." "]">help_icon</span>
></span>
<span
class="helpicon material-icons-outlined"
tooltip='Please enter a noun that describes this entry. For example, "person", "weapon", or "building". This will be used with the Generate Content button below.'
>help_icon</span>
</div>
</div>
<span id="world_info_delete_" class="world_info_delete">X</span>
<span id="world_info_delete_" class="world_info_delete material-icons-outlined">close</span>
</div>
<div class="world_info_upper_container world_info_tag_area">

View File

@@ -1 +1 @@
#welcome_text { display:none; pointer-events: none }
#welcome-logo { display:none; pointer-events: none }

View File

@@ -1116,10 +1116,11 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
thread_resources_env = maps.ResourceEnv(maps.Mesh(devices, ('dp', 'mp')), ())
maps.thread_resources.env = thread_resources_env
if initial_load:
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 1: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 2: {koboldai_vars.cloudflare_link}/new_ui")
logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api")
logger.message(f"KoboldAI has still loading your model but available at the following link: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has still loading your model but available at the following link for the Classic UI: {koboldai_vars.cloudflare_link}/classic")
logger.message(f"KoboldAI has still loading your model but available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
logger.message(f"KoboldAI has still loading your model but available at the following link for the API: [Loading Model...]")
logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after its done loading.")
global badwords
# These are the tokens that we don't want the AI to ever write
@@ -1302,7 +1303,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
try:
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False)
except Exception as e:
model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
@@ -1317,7 +1318,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
try:
model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False)
except Exception as e:
model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache")
else:
@@ -1332,7 +1333,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
try:
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False)
except Exception as e:
model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache")