diff --git a/README_GPTQ.md b/README_GPTQ.md index e1961cb8..77966c7d 100644 --- a/README_GPTQ.md +++ b/README_GPTQ.md @@ -2,31 +2,13 @@ (This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) #### Installation -In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. +For Nvidia users, everything is installed automatically when you install the requirements; you merely need a compatible GPTQ model for it to show up. -Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. - -`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` - -`cd KoboldAI` - -Next step, (Windows) subfolder mode or B: option doesn't matter choose either - -* [if on Windows] - ``` - install_requirements.bat - ``` - * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. - -* [if on Linux with Nvidia] - ``` - ./install_requirements.sh - ``` * [if on Linux with AMD] ``` ./install_requirements.sh rocm ./commandline-rocm.sh - pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 + pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa ``` * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed * If you get CUDA_HOME envar is not set run in env: @@ -46,5 +28,5 @@ If you haven't done so already, exit the command prompt/leave KAI's conda env. ( Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] -Switch to UI2, then load your model. +Load your model using Huggingface GPTQ as the backend option (this backend only shows up when a valid GPTQ model is detected).
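For API users, the same changeset also adds an optional `backend` field to the `PUT /api/v1/model` endpoint (see the `aiserver.py` changes below), so a script can pin which backend loads a model instead of relying on auto-detection. A minimal sketch, assuming a local install on the default port; the model folder name and the backend display name "Huggingface GPTQ" are placeholders, valid values are whatever your install lists in its registered model backends:

```python
import requests

# Hypothetical example: ask a local KoboldAI instance to load a GPTQ model and
# pin the loader via the new optional "backend" field.
resp = requests.put(
    "http://localhost:5000/api/v1/model",  # default port assumed; adjust if changed
    json={
        "model": "my-gptq-model",       # placeholder: folder relative to the "models" directory
        "backend": "Huggingface GPTQ",  # placeholder: backend name as shown in the loader list
    },
)
resp.raise_for_status()
```

If the `backend` field is omitted, the server falls back to its previous behaviour and picks a backend itself, which keeps older API clients working unchanged.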
diff --git a/aiserver.py b/aiserver.py index dc565c97..0552eb60 100644 --- a/aiserver.py +++ b/aiserver.py @@ -12,6 +12,8 @@ import random import shutil import eventlet +from modeling.inference_model import GenerationMode + eventlet.monkey_patch(all=True, thread=False, os=False) import os, inspect, contextlib, pickle os.system("") @@ -71,6 +73,12 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForToken import transformers import ipaddress from functools import wraps +from modeling.pickling import RestrictedUnpickler, use_custom_unpickler + +# Make settings folder early so we can depend on it anywhere +if not os.path.exists("settings/"): + os.mkdir("settings") + try: from transformers.models.opt.modeling_opt import OPTDecoder except: @@ -630,7 +638,10 @@ model_backends = {} model_backend_module_names = {} model_backend_type_crosswalk = {} -PRIORITIZED_BACKEND_MODULES = ["generic_hf_torch"] +PRIORITIZED_BACKEND_MODULES = { + "gptq_hf_torch": 2, + "generic_hf_torch": 1 +} for module in os.listdir("./modeling/inference_models"): if module == '__pycache__': @@ -666,10 +677,15 @@ for module in os.listdir("./modeling/inference_models"): model_backend_module_names[backend_name] = module if backend_type in model_backend_type_crosswalk: - if module in PRIORITIZED_BACKEND_MODULES: - model_backend_type_crosswalk[backend_type].insert(0, backend_name) - else: - model_backend_type_crosswalk[backend_type].append(backend_name) + model_backend_type_crosswalk[backend_type].append(backend_name) + model_backend_type_crosswalk[backend_type] = list(sorted( + model_backend_type_crosswalk[backend_type], + key=lambda name: PRIORITIZED_BACKEND_MODULES.get( + [mod for b_name, mod in model_backend_module_names.items() if b_name == name][0], + 0 + ), + reverse=True + )) else: model_backend_type_crosswalk[backend_type] = [backend_name] @@ -892,7 +908,7 @@ tags = [ api_version = None # This gets set automatically so don't change this value api_v1 = KoboldAPISpec( - version="1.2.2", + version="1.2.3", prefixes=["/api/v1", "/api/latest"], tags=tags, ) @@ -1670,75 +1686,7 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -class RestrictedUnpickler(pickle.Unpickler): - def original_persistent_load(self, saved_id): - return super().persistent_load(saved_id) - def forced_persistent_load(self, saved_id): - if saved_id[0] != "storage": - raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'") - return self.original_persistent_load(saved_id) - - def find_class(self, module, name): - if module == "collections" and name == "OrderedDict": - return collections.OrderedDict - elif module == "torch._utils" and name == "_rebuild_tensor_v2": - return torch._utils._rebuild_tensor_v2 - elif module == "torch._tensor" and name == "_rebuild_from_type_v2": - return torch._tensor._rebuild_from_type_v2 - elif module == "torch" and name in ( - "DoubleStorage", - "FloatStorage", - "HalfStorage", - "LongStorage", - "IntStorage", - "ShortStorage", - "CharStorage", - "ByteStorage", - "BoolStorage", - "BFloat16Storage", - "Tensor", - ): - return getattr(torch, name) - elif module == "numpy.core.multiarray" and name == "scalar": - return np.core.multiarray.scalar - elif module == "numpy" and name == "dtype": - return np.dtype - elif module == "_codecs" and name == "encode": - return _codecs.encode - else: - # Forbid everything else. 
- qualified_name = name if module == "__builtin__" else f"{module}.{name}" - raise pickle.UnpicklingError( - f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}" - ) - - def load(self, *args, **kwargs): - self.original_persistent_load = getattr( - self, "persistent_load", pickle.Unpickler.persistent_load - ) - self.persistent_load = self.forced_persistent_load - return super().load(*args, **kwargs) - -@contextlib.contextmanager -def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler): - try: - old_unpickler = pickle.Unpickler - pickle.Unpickler = unpickler - - old_pickle_load = pickle.load - - def new_pickle_load(*args, **kwargs): - return pickle.Unpickler(*args, **kwargs).load() - - pickle.load = new_pickle_load - - yield - - finally: - pickle.Unpickler = old_unpickler - pickle.load = old_pickle_load - def load_model(model_backend, initial_load=False): global model global tokenizer @@ -1747,9 +1695,6 @@ def load_model(model_backend, initial_load=False): koboldai_vars.aibusy = True koboldai_vars.horde_share = False - if initial_load: - use_breakmodel_args = True - koboldai_vars.reset_model() koboldai_vars.noai = False @@ -1788,7 +1733,9 @@ def load_model(model_backend, initial_load=False): with use_custom_unpickler(RestrictedUnpickler): model = model_backends[model_backend] + koboldai_vars.supported_gen_modes = [x.value for x in model.get_supported_gen_modes()] model.load(initial_load=initial_load, save_model=not (args.colab or args.cacheonly) or args.savemodel) + koboldai_vars.model = model.model_name if "model_name" in vars(model) else model.id #Should have model_name, but it could be set to id depending on how it's setup if koboldai_vars.model in ("NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): koboldai_vars.model = os.path.basename(os.path.normpath(model.path)) @@ -1889,8 +1836,8 @@ def load_model(model_backend, initial_load=False): os.mkdir("./softprompts") koboldai_vars.splist = [[f, get_softprompt_desc(os.path.join("./softprompts", f),None,True)] for f in os.listdir("./softprompts") if os.path.isfile(os.path.join("./softprompts", f)) and valid_softprompt(os.path.join("./softprompts", f))] if initial_load and koboldai_vars.cloudflare_link != "": - logger.message(f"KoboldAI has finished loading and is available at the following link for UI 1: {koboldai_vars.cloudflare_link}") - logger.message(f"KoboldAI has finished loading and is available at the following link for UI 2: {koboldai_vars.cloudflare_link}/new_ui") + logger.message(f"KoboldAI has finished loading and is available at the following link: {koboldai_vars.cloudflare_link}") + logger.message(f"KoboldAI has finished loading and is available at the following link for the Classic UI: {koboldai_vars.cloudflare_link}/classic") logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite") logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api") @@ -1922,8 +1869,7 @@ def require_allowed_ip(func): # Set up Flask routes -@app.route('/') -@app.route('/index') +@app.route('/classic') @require_allowed_ip def index(): if args.no_ui: @@ -3267,11 +3213,20 @@ def check_for_backend_compilation(): break koboldai_vars.checking = False -def actionsubmit(data, actionmode=0, 
force_submit=False, force_prompt_gen=False, disable_recentrng=False, no_generate=False, ignore_aibusy=False): +def actionsubmit( + data, + actionmode=0, + force_submit=False, + force_prompt_gen=False, + disable_recentrng=False, + no_generate=False, + ignore_aibusy=False, + gen_mode=GenerationMode.STANDARD +): # Ignore new submissions if the AI is currently busy - if(koboldai_vars.aibusy): + if koboldai_vars.aibusy and not ignore_aibusy: return - + while(True): set_aibusy(1) koboldai_vars.actions.clear_unused_options() @@ -3359,7 +3314,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, koboldai_vars.prompt = data # Clear the startup text from game screen emit('from_server', {'cmd': 'updatescreen', 'gamestarted': False, 'data': 'Please wait, generating story...'}, broadcast=True, room="UI_1") - calcsubmit("") # Run the first action through the generator + calcsubmit("", gen_mode=gen_mode) # Run the first action through the generator if(not koboldai_vars.abort and koboldai_vars.lua_koboldbridge.restart_sequence is not None and len(koboldai_vars.genseqs) == 0): data = "" force_submit = True @@ -3425,7 +3380,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, if(not no_generate and not koboldai_vars.noai and koboldai_vars.lua_koboldbridge.generating): # Off to the tokenizer! - calcsubmit("") + calcsubmit("", gen_mode=gen_mode) if(not koboldai_vars.abort and koboldai_vars.lua_koboldbridge.restart_sequence is not None and len(koboldai_vars.genseqs) == 0): data = "" force_submit = True @@ -3780,7 +3735,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, #==================================================================# # Take submitted text and build the text to be given to generator #==================================================================# -def calcsubmit(txt): +def calcsubmit(txt, gen_mode=GenerationMode.STANDARD): anotetxt = "" # Placeholder for Author's Note text forceanote = False # In case we don't have enough actions to hit A.N. depth anoteadded = False # In case our budget runs out before we hit A.N. 
depth @@ -3822,7 +3777,7 @@ def calcsubmit(txt): logger.debug("Submit: experimental_features time {}s".format(time.time()-start_time)) start_time = time.time() - generate(subtxt, min, max, found_entries) + generate(subtxt, min, max, found_entries, gen_mode=gen_mode) logger.debug("Submit: generate time {}s".format(time.time()-start_time)) attention_bias.attention_bias = None @@ -3890,7 +3845,14 @@ class HordeException(Exception): # Send text to generator and deal with output #==================================================================# -def generate(txt, minimum, maximum, found_entries=None): +def generate(txt, minimum, maximum, found_entries=None, gen_mode=GenerationMode.STANDARD): + # Open up token stream + emit("stream_tokens", True, broadcast=True, room="UI_2") + + # HACK: Show options when streaming more than 1 sequence + if utils.koboldai_vars.output_streaming: + koboldai_vars.actions.show_options(koboldai_vars.numseqs > 1, force=True) + koboldai_vars.generated_tkns = 0 if(found_entries is None): @@ -3912,7 +3874,7 @@ def generate(txt, minimum, maximum, found_entries=None): # Submit input text to generator try: start_time = time.time() - genout, already_generated = tpool.execute(model.core_generate, txt, found_entries) + genout, already_generated = tpool.execute(model.core_generate, txt, found_entries, gen_mode=gen_mode) logger.debug("Generate: core_generate time {}s".format(time.time()-start_time)) except Exception as e: if(issubclass(type(e), lupa.LuaError)): @@ -3927,7 +3889,10 @@ def generate(txt, minimum, maximum, found_entries=None): emit('from_server', {'cmd': 'errmsg', 'data': 'Error occurred during generator call; please check console.'}, broadcast=True, room="UI_1") logger.error(traceback.format_exc().replace("\033", "")) socketio.emit("error", str(e), broadcast=True, room="UI_2") + set_aibusy(0) + # Clean up token stream + emit("stream_tokens", None, broadcast=True, room="UI_2") return for i in range(koboldai_vars.numseqs): @@ -3959,7 +3924,10 @@ def generate(txt, minimum, maximum, found_entries=None): del genout gc.collect() torch.cuda.empty_cache() - + + # Clean up token stream + emit("stream_tokens", None, broadcast=True, room="UI_2") + maybe_review_story() set_aibusy(0) @@ -4428,8 +4396,8 @@ def requestwi(): # and items in different folders are sorted based on the order of the folders #==================================================================# def stablesortwi(): - mapping = {uid: index for index, uid in enumerate(koboldai_vars.wifolders_l)} - koboldai_vars.worldinfo.sort(key=lambda x: mapping[str(x["folder"])] if x["folder"] is not None else float("inf")) + mapping = {int(uid): index for index, uid in enumerate(koboldai_vars.wifolders_l)} + koboldai_vars.worldinfo.sort(key=lambda x: mapping[int(x["folder"])] if x["folder"] is not None else float("inf")) last_folder = ... 
last_wi = None for i, wi in enumerate(koboldai_vars.worldinfo): @@ -5134,9 +5102,13 @@ def load_story_v1(js, from_file=None): def load_story_v2(js, from_file=None): logger.debug("Loading V2 Story") logger.debug("Called from {}".format(inspect.stack()[1].function)) - leave_room(session['story']) - session['story'] = js['story_name'] - join_room(session['story']) + + new_story = js["story_name"] + # In socket context + if hasattr(request, "sid"): + leave_room(session['story']) + join_room(new_story) + session['story'] = new_story koboldai_vars.load_story(session['story'], js) @@ -5564,6 +5536,7 @@ def lite_html(): #==================================================================# # UI V2 CODE #==================================================================# +@app.route('/') @app.route('/new_ui') @require_allowed_ip @logger.catch @@ -6149,6 +6122,7 @@ def UI_2_Set_Selected_Text(data): @socketio.on('Use Option Text') @logger.catch def UI_2_Use_Option_Text(data): + koboldai_vars.actions.show_options(False) if koboldai_vars.prompt == "": koboldai_vars.prompt = koboldai_vars.actions.get_current_options()[int(data['option'])]['text'] koboldai_vars.actions.clear_unused_options() @@ -6169,23 +6143,31 @@ def UI_2_delete_option(data): @socketio.on('submit') @logger.catch def UI_2_submit(data): - if not koboldai_vars.noai and data['theme'] != "": + if not koboldai_vars.noai and data['theme']: + # Random prompt generation logger.debug("doing random prompt") memory = koboldai_vars.memory koboldai_vars.memory = "{}\n\nYou generate the following {} story concept :".format(koboldai_vars.memory, data['theme']) koboldai_vars.lua_koboldbridge.feedback = None actionsubmit("", force_submit=True, force_prompt_gen=True) koboldai_vars.memory = memory - else: - logger.debug("doing normal input") - koboldai_vars.actions.clear_unused_options() - koboldai_vars.lua_koboldbridge.feedback = None - koboldai_vars.recentrng = koboldai_vars.recentrngm = None - if koboldai_vars.actions.action_count == -1: - actionsubmit(data['data'], actionmode=koboldai_vars.actionmode) - else: - actionsubmit(data['data'], actionmode=koboldai_vars.actionmode) - + return + + logger.debug("doing normal input") + koboldai_vars.actions.clear_unused_options() + koboldai_vars.lua_koboldbridge.feedback = None + koboldai_vars.recentrng = koboldai_vars.recentrngm = None + + gen_mode_name = data.get("gen_mode", None) or "standard" + try: + gen_mode = GenerationMode(gen_mode_name) + except ValueError: + # Invalid enum lookup! + gen_mode = GenerationMode.STANDARD + logger.warning(f"Unknown gen_mode '{gen_mode_name}', using STANDARD! 
Report this!") + + actionsubmit(data['data'], actionmode=koboldai_vars.actionmode, gen_mode=gen_mode) + #==================================================================# # Event triggered when user clicks the submit button #==================================================================# @@ -6279,7 +6261,7 @@ def UI_2_select_model(data): #so we'll just go through all the possible loaders for model_backend in sorted( model_backends, - key=lambda x: model_backend_module_names[x] in PRIORITIZED_BACKEND_MODULES, + key=lambda x: PRIORITIZED_BACKEND_MODULES.get(model_backend_module_names[x], 0), reverse=True, ): if model_backends[model_backend].is_valid(data["name"], data["path"] if 'path' in data else None, data["menu"]): @@ -6715,11 +6697,18 @@ def UI_2_set_wi_image(uid): except FileNotFoundError: pass else: - # Otherwise assign image - with open(path, "wb") as file: - file.write(data) + try: + # Otherwise assign image + with open(path, "wb") as file: + file.write(data) + except FileNotFoundError: + show_error_notification( + "Unable to write image", + "Please save the game before uploading images." + ) + return ":(", 500 koboldai_vars.gamesaved = False - return ":)" + return ":)", 200 @app.route("/get_wi_image/", methods=["GET"]) @require_allowed_ip @@ -7336,7 +7325,7 @@ def generate_image(prompt: str) -> Optional[Image.Image]: if koboldai_vars.img_gen_priority == 4: # Check if stable-diffusion-webui API option selected and use that if found. return text2img_api(prompt) - elif ((not koboldai_vars.hascuda or not os.path.exists("models/stable-diffusion-v1-4")) and koboldai_vars.img_gen_priority != 0) or koboldai_vars.img_gen_priority == 3: + elif ((not koboldai_vars.hascuda or not os.path.exists("functional_models/stable-diffusion")) and koboldai_vars.img_gen_priority != 0) or koboldai_vars.img_gen_priority == 3: # If we don't have a GPU, use horde if we're allowed to return text2img_horde(prompt) @@ -7362,7 +7351,7 @@ def text2img_local(prompt: str) -> Optional[Image.Image]: logger.debug("Generating Image") from diffusers import StableDiffusionPipeline if koboldai_vars.image_pipeline is None: - pipe = tpool.execute(StableDiffusionPipeline.from_pretrained, "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, cache="functional_models/stable-diffusion").to("cuda") + pipe = tpool.execute(StableDiffusionPipeline.from_pretrained, "XpucT/Deliberate", safety_checker=None, torch_dtype=torch.float16, cache="functional_models/stable-diffusion").to("cuda") else: pipe = koboldai_vars.image_pipeline.to("cuda") logger.debug("time to load: {}".format(time.time() - start_time)) @@ -7784,9 +7773,16 @@ def UI_2_update_tokens(data): def UI_2_privacy_mode(data): if data['enabled']: koboldai_vars.privacy_mode = True + return + + if data['password'] == koboldai_vars.privacy_password: + koboldai_vars.privacy_mode = False else: - if data['password'] == koboldai_vars.privacy_password: - koboldai_vars.privacy_mode = False + logger.warning("Watch out! Someone tried to unlock your instance with an incorrect password! Stay on your toes...") + show_error_notification( + title="Invalid password", + text="The password you provided was incorrect. Please try again." 
+ ) #==================================================================# # Genres @@ -8236,6 +8232,7 @@ class WorldInfoUIDsSchema(WorldInfoEntriesUIDsSchema): class ModelSelectionSchema(KoboldSchema): model: str = fields.String(required=True, validate=validate.Regexp(r"^(?!\s*NeoCustom)(?!\s*GPT2Custom)(?!\s*TPUMeshTransformerGPTJ)(?!\s*TPUMeshTransformerGPTNeoX)(?!\s*GooseAI)(?!\s*OAI)(?!\s*InferKit)(?!\s*Colab)(?!\s*API).*$"), metadata={"description": 'Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model'}) + backend: Optional[str] = fields.String(required=False, validate=validate.OneOf(model_backends.keys())) def _generate_text(body: GenerationInputSchema): if koboldai_vars.aibusy or koboldai_vars.genseqs: @@ -8493,6 +8490,7 @@ def put_model(body: ModelSelectionSchema): summary: Load a model description: |-2 Loads a model given its Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model. + Optionally, a backend parameter can be passed in to dictate which backend loads the model. tags: - model requestBody: @@ -8502,6 +8500,7 @@ def put_model(body: ModelSelectionSchema): schema: ModelSelectionSchema example: model: ReadOnly + backend: Read Only responses: 200: description: Successful request @@ -8519,8 +8518,18 @@ def put_model(body: ModelSelectionSchema): set_aibusy(1) old_model = koboldai_vars.model koboldai_vars.model = body.model.strip() + + backend = getattr(body, "backend", None) + if not backend: + # Backend is optional for backwards compatibility; it should probably be + # required on the next major API version. + if body.model == "ReadOnly": + backend = "Read Only" + else: + backend = "Huggingface" + try: - load_model(use_breakmodel_args=True, breakmodel_args_default_to_cpu=True) + load_model(backend) except Exception as e: koboldai_vars.model = old_model raise e @@ -8808,8 +8817,14 @@ def get_story(): chunks = [] if koboldai_vars.gamestarted: chunks.append({"num": 0, "text": koboldai_vars.prompt}) - for num, action in koboldai_vars.actions.items(): - chunks.append({"num": num + 1, "text": action}) + + last_action_num = list(koboldai_vars.actions.actions.keys())[-1] + for num, action in koboldai_vars.actions.actions.items(): + text = action["Selected Text"] + # The last action seems to always be empty + if not text and num == last_action_num: + continue + chunks.append({"num": num + 1, "text": text}) return {"results": chunks} @@ -8833,7 +8848,7 @@ def get_story_nums(): chunks = [] if koboldai_vars.gamestarted: chunks.append(0) - for num in koboldai_vars.actions.keys(): + for num in koboldai_vars.actions.actions.keys(): chunks.append(num + 1) return {"results": chunks} @@ -9194,7 +9209,7 @@ def get_world_info(): if wi["folder"] != last_folder: folder = [] if wi["folder"] is not None: - folders.append({"uid": wi["folder"], "name": koboldai_vars.wifolders_d[wi["folder"]]["name"], "entries": folder}) + folders.append({"uid": wi["folder"], "name": koboldai_vars.wifolders_d[str(wi["folder"])]["name"], "entries": folder}) last_folder = wi["folder"] (folder if wi["folder"] is not None else entries).append({k: v for k, v in wi.items() if k not in ("init", "folder", "num") and (wi["selective"] or k != "keysecondary")}) return {"folders": folders, "entries": entries} @@ -10905,8 +10920,8 @@ def run(): if not koboldai_vars.use_colab_tpu and args.model: # If we're using a TPU our UI will freeze during the connection to the 
TPU. To prevent this from showing to the user we # delay the display of this message until after that step - logger.message(f"KoboldAI is still loading your model but available at the following link for UI 1: {cloudflare}") - logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui") + logger.message(f"KoboldAI is still loading your model but available at the following link: {cloudflare}") + logger.message(f"KoboldAI is still loading your model but available at the following link for the Classic UI: {cloudflare}/classic") logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite") logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Loading Model...]") logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after its done loading.") diff --git a/colab/GPU.ipynb b/colab/GPU.ipynb index 78219b06..7232f81b 100644 --- a/colab/GPU.ipynb +++ b/colab/GPU.ipynb @@ -80,7 +80,7 @@ "#@title <-- Select your model below and then click this to start KoboldAI\n", "#@markdown You can find a description of the models below along with instructions on how to start KoboldAI.\n", "\n", - "Model = \"Nerys V2 6B\" #@param [\"Nerys V2 6B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Nerys 2.7B\", \"AID 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"OPT 2.7B\", \"Fairseq Dense 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n", + "Model = \"Nerys V2 6B\" #@param [\"MythoMax 13B (United)\", \"Huginn 13B (United)\", \"Chronos 13B (United)\", \"Airoboros M2.0 13B (United)\", \"Holodeck 13B (United)\", \"Spring Dragon 13B (United)\", \"Nerys V2 6B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Nerys 2.7B\", \"AID 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"OPT 2.7B\", \"Fairseq Dense 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n", "Version = \"Official\" #@param [\"Official\", \"United\"] {allow-input: true}\n", "Provider = \"Cloudflare\" #@param [\"Localtunnel\", \"Cloudflare\"]\n", "use_google_drive = True #@param {type:\"boolean\"}\n", @@ -146,6 +146,36 @@ " Model = \"EleutherAI/gpt-neo-2.7B\"\n", " path = \"\"\n", " download = \"\"\n", + "elif Model == \"Huginn 13B (United)\":\n", + " Model = \"The-Face-Of-Goonery/Huginn-13b-v1.2\"\n", + " path = \"\"\n", + " download = \"\"\n", + " Version = \"United\"\n", + "elif Model == \"Chronos 13B (United)\":\n", + " Model = \"elinas/chronos-13b-v2\"\n", + " path = \"\"\n", + " download = \"\"\n", + " Version = \"United\"\n", + "elif Model == \"Airoboros M2.0 13B (United)\":\n", + " Model = \"jondurbin/airoboros-l2-13b-gpt4-m2.0\"\n", + " path = \"\"\n", + " download = \"\"\n", + " Version = \"United\"\n", + "elif Model == \"MythoMax 13B (United)\":\n", + " Model = \"Gryphe/MythoMax-L2-13b\"\n", + " path = \"\"\n", + " download = \"\"\n", + " Version = \"United\"\n", + "elif Model == \"Spring Dragon 13B (United)\":\n", + " Model = \"Henk717/spring-dragon\"\n", + " path = \"\"\n", + " download = \"\"\n", + " Version = \"United\"\n", + "elif Model == \"Holodeck 13B (United)\":\n", + " Model = \"KoboldAI/LLAMA2-13B-Holodeck-1\"\n", + " path = \"\"\n", + " download = \"\"\n", + " Version = \"United\"\n", "\n", "if Provider == \"Localtunnel\":\n", " tunnel = \"--localtunnel yes\"\n", @@ -193,6 +223,20 @@ "metadata": { "id": "Lrm840I33hkC" } + }, + { + "cell_type": "code", + "source": [ + "#@title Model Cleaner\n", + 
"#@markdown Out of space? Run this to remove all cached models (Google Drive models are not effected).\n", + "!rm -rf /content/KoboldAI-Client/cache/*\n" + ], + "metadata": { + "cellView": "form", + "id": "5k8fK4F6UiTs" + }, + "execution_count": null, + "outputs": [] } ] } \ No newline at end of file diff --git a/environments/huggingface.yml b/environments/huggingface.yml index e97f3e2e..004c7ecc 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,10 +47,10 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html - - gptq_koboldai==0.0.6 + - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/henk717/KoboldAI/releases/download/Snapshot-11-08-23/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy - - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html - - exllama==0.0.6 diff --git a/gensettings.py b/gensettings.py index 8d68b4b5..4b395266 100644 --- a/gensettings.py +++ b/gensettings.py @@ -7,7 +7,7 @@ gensettingstf = [ "min": 16, "max": 512, "step": 2, - "default": 80, + "default": 200, "tooltip": "Number of tokens to be generated. Higher values will take longer to generate.", "menu_path": "Settings", "sub_path": "Generation", @@ -182,9 +182,9 @@ gensettingstf = [ "label": "Context Tokens", "id": "settknmax", "min": 512, - "max": 2048, + "max": 4096, "step": 8, - "default": 1024, + "default": 2048, "tooltip": "Number of context tokens to submit to the AI for sampling. Make sure this is higher than Output Length. Higher values increase VRAM/RAM usage.", "menu_path": "Settings", "sub_path": "Generation", @@ -296,7 +296,7 @@ gensettingstf = [ "max": 1, "step": 1, "default": 0, - "tooltip": "Scans the AI's output for World Info keys as it is generating the one.", + "tooltip": "Look for World Info keys in the AI's response while it is still being generated.", "menu_path": "World Info", "sub_path": "", "classname": "story", @@ -413,6 +413,23 @@ gensettingstf = [ , "ui_level": 2 }, + { + "UI_V2_Only": True, + "uitype": "toggle", + "unit": "bool", + "label": "Smooth Streaming", + "id": "smoothstreaming", + "min": 0, + "max": 1, + "step": 1, + "default": 0, + "tooltip": "Makes Token Streaming type in characters, not tokens. Note that this is purely visual, and will likely increase delay in seeing the tokens.", + "menu_path": "Interface", + "sub_path": "UI", + "classname": "user", + "name": "smooth_streaming", + "ui_level": 1 + }, { "uitype": "toggle", "unit": "bool", @@ -739,7 +756,7 @@ gensettingstf = [ "max": 1, "step": 1, "default": 0, - "tooltip": "If enabled, experimental features will be displayed in the UI.", + "tooltip": "If enabled, experimental features will be displayed in the UI. Note: These features have been determined to be too unstable for standard use, and may corrupt your data. 
You're on your own from here.", "menu_path": "Interface", "sub_path": "UI", "classname": "system", diff --git a/koboldai_settings.py b/koboldai_settings.py index ebd8c019..62e4918d 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -6,7 +6,7 @@ import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys import shutil from typing import List, Union from io import BytesIO -from flask import has_request_context, session +from flask import has_request_context, session, request from flask_socketio import join_room, leave_room from collections import OrderedDict import multiprocessing @@ -130,11 +130,14 @@ class koboldai_vars(object): original_story_name = story_name if not multi_story: story_name = 'default' - #Leave the old room and join the new one - logger.debug("Leaving room {}".format(session['story'])) - leave_room(session['story']) - logger.debug("Joining room {}".format(story_name)) - join_room(story_name) + + # Leave the old room and join the new one if in socket context + if hasattr(request, "sid"): + logger.debug("Leaving room {}".format(session['story'])) + leave_room(session['story']) + logger.debug("Joining room {}".format(story_name)) + join_room(story_name) + session['story'] = story_name logger.debug("Sending story reset") self._story_settings[story_name]._socketio.emit("reset_story", {}, broadcast=True, room=story_name) @@ -653,7 +656,7 @@ class model_settings(settings): 'welcome', 'welcome_default', 'simple_randomness', 'simple_creativity', 'simple_repitition', 'badwordsids', 'uid_presets', 'model', 'model_type', 'lazy_load', 'fp32_model', 'modeldim', 'horde_wait_time', 'horde_queue_position', 'horde_queue_size', 'newlinemode', 'tqdm_progress', 'tqdm_rem_time', '_tqdm'] settings_name = "model" - default_settings = {"rep_pen" : 1.1, "rep_pen_slope": 0.7, "rep_pen_range": 1024, "temp": 0.5, "top_p": 0.9, "top_k": 0, "top_a": 0.0, "tfs": 1.0, "typical": 1.0, + default_settings = {"rep_pen" : 1.1, "rep_pen_slope": 1.0, "rep_pen_range": 2048, "temp": 0.5, "top_p": 0.9, "top_k": 0, "top_a": 0.0, "tfs": 1.0, "typical": 1.0, "sampler_order": [6,0,1,2,3,4,5]} def __init__(self, socketio, koboldai_vars): self.enable_whitelist = False @@ -677,7 +680,7 @@ class model_settings(settings):
Please load a model from the left.
- If you encounter any issues, please click the Download debug dump link in the Home tab on the left flyout and attach the downloaded file to your error report on Github, Reddit, or Discord. + If you encounter any issues, please click the Download debug dump link in the Home tab on the left flyout and attach the downloaded file to your error report on Github, Reddit, or Discord. A redacted version (without story text) is available.
""" # Custom Welcome Text @@ -685,18 +688,19 @@ class model_settings(settings): self._koboldai_vars = koboldai_vars self.alt_multi_gen = False self.bit_8_available = None + self.supported_gen_modes = [] def reset_for_model_load(self): self.simple_randomness = 0 #Set first as this affects other outputs self.simple_creativity = 0 #Set first as this affects other outputs self.simple_repitition = 0 #Set first as this affects other outputs - self.max_length = 1024 # Maximum number of tokens to submit per action + self.max_length = 2048 # Maximum number of tokens to submit per action self.ikmax = 3000 # Maximum number of characters to submit to InferKit - self.genamt = 80 # Amount of text for each action to generate + self.genamt = 200 # Amount of text for each action to generate self.ikgen = 200 # Number of characters for InferKit to generate self.rep_pen = 1.1 # Default generator repetition_penalty - self.rep_pen_slope = 0.7 # Default generator repetition penalty slope - self.rep_pen_range = 1024 # Default generator repetition penalty range + self.rep_pen_slope = 1.0 # Default generator repetition penalty slope + self.rep_pen_range = 2048 # Default generator repetition penalty range self.temp = 0.5 # Default generator temperature self.top_p = 0.9 # Default generator top_p self.top_k = 0 # Default generator top_k @@ -1155,6 +1159,7 @@ class user_settings(settings): self.nogenmod = False self.debug = False # If set to true, will send debug information to the client for display self.output_streaming = True + self.smooth_streaming = True self.show_probs = False # Whether or not to show token probabilities self.beep_on_complete = False self.img_gen_priority = 1 @@ -1755,11 +1760,15 @@ class KoboldStoryRegister(object): def go_forward(self): action_step = self.action_count+1 - if action_step in self.actions: - if len(self.get_current_options()) == 1: - logger.warning("Going forward with this text: {}".format(self.get_current_options()[0]["text"])) - self.use_option([x['text'] for x in self.actions[action_step]["Options"]].index(self.get_current_options()[0]["text"])) - + if action_step not in self.actions: + return + + self.show_options(len(self.get_current_options()) > 1) + + if len(self.get_current_options()) == 1: + logger.warning("Going forward with this text: {}".format(self.get_current_options()[0]["text"])) + self.use_option([x['text'] for x in self.actions[action_step]["Options"]].index(self.get_current_options()[0]["text"])) + def use_option(self, option_number, action_step=None): if action_step is None: action_step = self.action_count+1 @@ -1797,6 +1806,16 @@ class KoboldStoryRegister(object): process_variable_changes(self._socketio, "story", 'actions', {"id": action_step, 'action': self.actions[action_step]}, None) self.set_game_saved() + def show_options( + self, + should_show: bool, + force: bool = False, + + ) -> None: + if self._koboldai_vars.aibusy and not force: + return + self._socketio.emit("show_options", should_show, broadcast=True, room="UI_2") + def delete_action(self, action_id, keep=True): if action_id in self.actions: old_options = copy.deepcopy(self.actions[action_id]["Options"]) @@ -1889,34 +1908,19 @@ class KoboldStoryRegister(object): process_variable_changes(self._socketio, "story", 'actions', {"id": self.action_count+1, 'action': self.actions[self.action_count+1]}, None) else: #We're streaming single options so our output is our selected - #First we need to see if this is actually the prompt. 
If so we'll just not do streaming: - if self.story_settings.prompt != "": - if self.action_count+1 in self.actions: - if self._koboldai_vars.tokenizer is not None: - selected_text_length = len(self._koboldai_vars.tokenizer.encode(self.actions[self.action_count+1]['Selected Text'])) - else: - selected_text_length = 0 - self.actions[self.action_count+1]['Selected Text'] = "{}{}".format(self.actions[self.action_count+1]['Selected Text'], text_list[0]) - self.actions[self.action_count+1]['Selected Text Length'] = selected_text_length - else: - if self._koboldai_vars.tokenizer is not None: - selected_text_length = len(self._koboldai_vars.tokenizer.encode(text_list[0])) - else: - selected_text_length = 0 - self.actions[self.action_count+1] = {"Selected Text": text_list[0], "Selected Text Length": selected_text_length, "Options": [], "Time": int(time.time())} - - - - if self._koboldai_vars.tokenizer is not None: - if len(self._koboldai_vars.tokenizer.encode(self.actions[self.action_count+1]['Selected Text'])) != self._koboldai_vars.genamt: - #ui1 - if queue is not None: - queue.put(["from_server", {"cmd": "streamtoken", "data": [{ - "decoded": text_list[0], - "probabilities": self.probability_buffer - }]}, {"broadcast":True, "room":"UI_1"}]) - #process_variable_changes(self._socketio, "actions", "Options", {"id": self.action_count+1, "options": self.actions[self.action_count+1]["Options"]}, {"id": self.action_count+1, "options": None}) - process_variable_changes(self._socketio, "story", 'actions', {"id": self.action_count+1, 'action': self.actions[self.action_count+1]}, None) + queue.put(["stream_tokens", text_list, {"broadcast": True, "room": "UI_2"}]) + + # UI1 + queue.put([ + "from_server", { + "cmd": "streamtoken", + "data": [{ + "decoded": text_list[0], + "probabilities": self.probability_buffer + }], + }, + {"broadcast":True, "room": "UI_1"} + ]) def set_probabilities(self, probabilities, action_id=None): self.probability_buffer = probabilities diff --git a/modeling/inference_model.py b/modeling/inference_model.py index a2d4fa63..48691bce 100644 --- a/modeling/inference_model.py +++ b/modeling/inference_model.py @@ -3,6 +3,8 @@ from __future__ import annotations from dataclasses import dataclass import time from typing import List, Optional, Union + +from enum import Enum from logger import logger import torch @@ -12,6 +14,7 @@ from transformers import ( GPT2Tokenizer, AutoTokenizer, ) +from modeling.stoppers import Stoppers from modeling.tokenizer import GenericTokenizer from modeling import logits_processors @@ -144,7 +147,10 @@ class GenerationSettings: class ModelCapabilities: embedding_manipulation: bool = False post_token_hooks: bool = False + + # Used to gauge if manual stopping is possible stopper_hooks: bool = False + # TODO: Support non-live probabilities from APIs post_token_probs: bool = False @@ -154,6 +160,12 @@ class ModelCapabilities: # Some models need to warm up the TPU before use uses_tpu: bool = False +class GenerationMode(Enum): + STANDARD = "standard" + FOREVER = "forever" + UNTIL_EOS = "until_eos" + UNTIL_NEWLINE = "until_newline" + UNTIL_SENTENCE_END = "until_sentence_end" class InferenceModel: """Root class for all models.""" @@ -256,6 +268,7 @@ class InferenceModel: self, text: list, found_entries: set, + gen_mode: GenerationMode = GenerationMode.STANDARD, ): """Generate story text. Heavily tied to story-specific parameters; if you are making a new generation-based feature, consider `generate_raw()`. 
@@ -263,6 +276,7 @@ class InferenceModel: Args: text (list): Encoded input tokens found_entries (set): Entries found for Dynamic WI + gen_mode (GenerationMode): The GenerationMode to pass to raw_generate. Defaults to GenerationMode.STANDARD Raises: RuntimeError: if inconsistancies are detected with the internal state and Lua state -- sanity check @@ -358,6 +372,7 @@ class InferenceModel: seed=utils.koboldai_vars.seed if utils.koboldai_vars.full_determinism else None, + gen_mode=gen_mode ) logger.debug( "core_generate: run raw_generate pass {} {}s".format( @@ -532,6 +547,7 @@ class InferenceModel: found_entries: set = (), tpu_dynamic_inference: bool = False, seed: Optional[int] = None, + gen_mode: GenerationMode = GenerationMode.STANDARD, **kwargs, ) -> GenerationResult: """A wrapper around `_raw_generate()` that handles gen_state and other stuff. Use this to generate text outside of the story. @@ -547,6 +563,7 @@ class InferenceModel: is_core (bool, optional): Whether this generation is a core story generation. Defaults to False. single_line (bool, optional): Generate one line only.. Defaults to False. found_entries (set, optional): Entries found for Dynamic WI. Defaults to (). + gen_mode (GenerationMode): Special generation mode. Defaults to GenerationMode.STANDARD. Raises: ValueError: If prompt type is weird @@ -568,6 +585,29 @@ class InferenceModel: "wi_scanner_excluded_keys", set() ) + self.gen_state["allow_eos"] = False + + temp_stoppers = [] + + if gen_mode not in self.get_supported_gen_modes(): + gen_mode = GenerationMode.STANDARD + logger.warning(f"User requested unsupported GenerationMode '{gen_mode}'!") + + if gen_mode == GenerationMode.FOREVER: + self.gen_state["stop_at_genamt"] = False + max_new = 1e7 + elif gen_mode == GenerationMode.UNTIL_EOS: + self.gen_state["allow_eos"] = True + self.gen_state["stop_at_genamt"] = False + max_new = 1e7 + elif gen_mode == GenerationMode.UNTIL_NEWLINE: + # TODO: Look into replacing `single_line` with `generation_mode` + temp_stoppers.append(Stoppers.newline_stopper) + elif gen_mode == GenerationMode.UNTIL_SENTENCE_END: + temp_stoppers.append(Stoppers.sentence_end_stopper) + + self.stopper_hooks += temp_stoppers + utils.koboldai_vars.inference_config.do_core = is_core gen_settings = GenerationSettings(*(generation_settings or {})) @@ -597,13 +637,21 @@ class InferenceModel: ) time_end = round(time.time() - time_start, 2) - tokens_per_second = round(len(result.encoded[0]) / time_end, 2) + + try: + tokens_per_second = round(len(result.encoded[0]) / time_end, 2) + except ZeroDivisionError: + # Introducing KoboldAI's fastest model: ReadOnly! + tokens_per_second = 0 if not utils.koboldai_vars.quiet: logger.info( f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second." ) + for stopper in temp_stoppers: + self.stopper_hooks.remove(stopper) + return result def generate( @@ -620,3 +668,19 @@ class InferenceModel: def _post_token_gen(self, input_ids: torch.LongTensor) -> None: for hook in self.post_token_hooks: hook(self, input_ids) + + def get_supported_gen_modes(self) -> List[GenerationMode]: + """Returns a list of compatible `GenerationMode`s for the current model. + + Returns: + List[GenerationMode]: A list of compatible `GenerationMode`s. 
+ """ + ret = [GenerationMode.STANDARD] + + if self.capabilties.stopper_hooks: + ret += [ + GenerationMode.FOREVER, + GenerationMode.UNTIL_NEWLINE, + GenerationMode.UNTIL_SENTENCE_END, + ] + return ret \ No newline at end of file diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 1bf38b99..a059ebb0 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -27,6 +27,10 @@ model_backend_name = "Huggingface" model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) class model_backend(HFTorchInferenceModel): + def __init__(self) -> None: + super().__init__() + self.use_4_bit = False + def is_valid(self, model_name, model_path, menu_path): base_is_valid = super().is_valid(model_name, model_path, menu_path) path = False @@ -58,15 +62,15 @@ class model_backend(HFTorchInferenceModel): "unit": "text", "label": "Quantization", "id": "quantization", - "default": temp['quantization'] if 'quantization' in temp else 'none', + "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit', "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode", "menu_path": "Layers", - "children": [{'text': 'None', 'value':'none'},{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}], + "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}], "extra_classes": "", "refresh_model_inputs": False }) else: - logger.warning("Bitsandbytes is not installed, you can not use Huggingface models in 4-bit") + logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models") return requested_parameters def set_input_parameters(self, parameters): @@ -124,7 +128,8 @@ class model_backend(HFTorchInferenceModel): # We must disable low_cpu_mem_usage and if using a GPT-2 model # because GPT-2 is not compatible with this feature yet. 
tf_kwargs.pop("low_cpu_mem_usage", None) - + tf_kwargs.pop("quantization_config", None) + # Also, lazy loader doesn't support GPT-2 models self.lazy_load = False diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 81a33c70..b48f1d56 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -7,7 +7,7 @@ import torch import re import shutil import sys -from typing import Union +from typing import Dict, Union import utils import modeling.lazy_loader as lazy_loader @@ -82,13 +82,109 @@ def get_gptq_version(fpath): logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") return 0, False +def load_quant_offload_device_map( + load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False, +): + from gptq.offload import ( + find_layers, + llama_offload_forward, + gptneox_offload_forward, + gptj_offload_forward, + opt_offload_forward, + bigcode_offload_forward + ) + from transformers.models.llama.modeling_llama import LlamaModel + from transformers.models.opt.modeling_opt import OPTModel + from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel + from transformers.models.gptj.modeling_gptj import GPTJModel + from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel + model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias) + + m, layers, remaining = find_layers(model) + type(m).non_offload_forward = type(m).forward + + # Hook offload_forward into found model + if type(m) == LlamaModel: + type(m).forward = llama_offload_forward + elif type(m) == GPTNeoXModel: + type(m).forward = gptneox_offload_forward + elif type(m) == GPTJModel: + type(m).forward = gptj_offload_forward + elif type(m) == OPTModel: + type(m).forward = opt_offload_forward + elif type(m) == GPTBigCodeModel: + type(m).forward = bigcode_offload_forward + else: + raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader") + + layers_done = len([1 for v in device_map.values() if v != "cpu"]) + + m.cpu_device = torch.device("cpu") + m.fast_offload = layers_done > len(layers) // 2 + m.layer_count = len(layers) + m.cpu_layers = len(layers) - layers_done + m.gpu_layers = layers_done + m.offload_type = offload_type + # HACK + m.primary_gpu = list(device_map.values())[0] + + if "layers" not in dir(m): + m.layers = layers + + for i in range(len(layers)): + dev = None + for key, device in device_map.items(): + key = int(*[x for x in key.split(".") if x.isdecimal()]) + if key == i: + dev = device + break + if dev is None: + raise ValueError + layers[key].to(dev, torch.float16, False) + + for module in remaining: + module.to(m.primary_gpu) + + return model + class model_backend(HFTorchInferenceModel): def is_valid(self, model_name, model_path, menu_path): gptq_model, _, _, _, _ = load_model_gptq_settings(model_path) return bool(gptq_model) + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = super().get_requested_parameters(model_name, model_path, menu_path, parameters) + if model_name != 'customhuggingface' or "custom_model_name" in parameters: + if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self): + with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f: + temp = json.load(f) + else: 
+ temp = {} + requested_parameters.append({ + "uitype": "dropdown", + "unit": "text", + "label": "Implementation", + "id": "implementation", + "default": temp['implementation'] if 'implementation' in temp else 'occam', + "tooltip": "Which GPTQ provider to use?", + "menu_path": "Layers", + "children": [{'text': 'Occam GPTQ', 'value': 'occam'}, {'text': 'AutoGPTQ', 'value': 'AutoGPTQ'}], + "extra_classes": "", + "refresh_model_inputs": False + }) + return requested_parameters + + def set_input_parameters(self, parameters): + super().set_input_parameters(parameters) + self.implementation = parameters['implementation'] if 'implementation' in parameters else "occam" + def _load(self, save_model: bool, initial_load: bool) -> None: + try: + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM + # Make model path the same as the model name to make this consistent # with the other loading method if it isn't a known model type. This # code is not just a workaround for below, it is also used to make the @@ -98,7 +194,7 @@ class model_backend(HFTorchInferenceModel): self.init_model_config() - self.lazy_load = False + self.lazy_load = True gpulayers = self.breakmodel_config.gpu_blocks @@ -107,10 +203,6 @@ class model_backend(HFTorchInferenceModel): except (ValueError, AttributeError): self.gpu_layers_list = [utils.num_layers(self.model_config)] - tf_kwargs = { - "low_cpu_mem_usage": True, - } - # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) @@ -123,9 +215,6 @@ class model_backend(HFTorchInferenceModel): self.breakmodel_device_config(self.model_config) if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - # If we're using lazy loader, we need to figure out what the model's hidden layers are called with lazy_loader.use_lazy_load(dematerialized_modules=True): try: @@ -141,7 +230,7 @@ class model_backend(HFTorchInferenceModel): if self.get_local_model_path(): # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.model = self._get_model(self.get_local_model_path()) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) else: raise NotImplementedError("GPTQ Model downloading not implemented") @@ -161,7 +250,58 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() - def _get_model(self, location: str, tf_kwargs: Dict): + def _patch_quant(self, device_map, quant_module) -> None: + def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs): + if isinstance(module, quant_module.QuantLinear): + return + + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' 
+ attr if name != '' else attr + if name1 in names: + parts = name1.split(".") + device = None + for i in reversed(range(len(parts))): + maybe_key = ".".join(parts[:i]) + if maybe_key in device_map: + device = device_map[maybe_key] + break + + if device is None: + raise ValueError(f"No device for {name1}") + + delattr(module, attr) + + ql = quant_module.QuantLinear( + bits, + groupsize, + tmp.in_features, + tmp.out_features, + force_bias or tmp.bias is not None, + **kwargs, + ) + ql = ql.to(device) + + setattr(module, attr, ql) + + for name1, child in module.named_children(): + make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1, force_bias=force_bias) + + quant_module.make_quant = make_quant + + + def _patch_quants(self, device_map) -> None: + # Load QuantLinears on the device corresponding to the device map + + from gptq import quant_v3 + from gptq import quant_v2 + from gptq import quant_v1 + + for quant_module in [quant_v3, quant_v2, quant_v1]: + self._patch_quant(device_map, quant_module) + + + def _get_model(self, location: str): import gptq from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant @@ -169,7 +309,12 @@ class model_backend(HFTorchInferenceModel): from gptq.opt import load_quant as opt_load_quant from gptq.bigcode import load_quant as bigcode_load_quant from gptq.mpt import load_quant as mpt_load_quant - from gptq.offload import load_quant_offload + + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False @@ -181,50 +326,77 @@ class model_backend(HFTorchInferenceModel): model_type = self.get_model_type() logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") - if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "gpt_bigcode": - model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() - else: - try: - import auto_gptq - from auto_gptq import AutoGPTQForCausalLM - except ImportError: - raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") - try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM - except ImportError: - from transformers import AutoModelForCausalLM + device_map = {} - # Monkey patch in hf_bleeding_edge to avoid having to trust remote code - auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig - auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig - auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM - model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) + if self.lazy_load: + with lazy_loader.use_lazy_load(dematerialized_modules=True): + metamodel = AutoModelForCausalLM.from_config(self.model_config) + if utils.args.cpu: + device_map = {name: "cpu" for name in utils.layers_module_names} + for name in utils.get_missing_module_names( + metamodel, list(device_map.keys()) + ): + device_map[name] = "cpu" + else: + device_map = self.breakmodel_config.get_device_map( + metamodel + ) - # Patch in embeddings function - def get_input_embeddings(self): - return self.model.get_input_embeddings() + self._patch_quants(device_map) - type(model).get_input_embeddings = get_input_embeddings + with lazy_loader.use_lazy_load( + enable=self.lazy_load, + dematerialized_modules=False, + ): + if self.implementation == "occam": + try: + if model_type == "gptj": + model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) + elif model_type == "gpt_neox": + model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) + elif model_type == "llama": + model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) + elif model_type == "opt": + model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) + elif model_tseype == "mpt": + model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) + elif model_type == "gpt_bigcode": + model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half() + else: + raise RuntimeError("Model not supported by Occam's GPTQ") + except: + self.implementation = "AutoGPTQ" + if self.implementation == "AutoGPTQ": + try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM + except ImportError: + raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") - # Patch in args support.. 
- def generate(self, *args, **kwargs): - """shortcut for model.generate""" - with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): - return self.model.generate(*args, **kwargs) + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code + auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM - type(model).generate = generate + try: + model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"), device_map=device_map) + except: + model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"), device_map=device_map, disable_exllama=True) + + # Patch in embeddings function + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + type(model).get_input_embeddings = get_input_embeddings + + # Patch in args support.. + def generate(self, *args, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(*args, **kwargs) + + type(model).generate = generate return model diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 60b69476..167716d4 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -19,8 +19,12 @@ class HFInferenceModel(InferenceModel): def __init__(self) -> None: super().__init__() self.model_config = None - #self.model_name = model_name + # TODO: model_name should probably be an instantiation parameter all the + # way down the inheritance chain. 
+ self.model_name = None + + self.path = None self.hf_torch = False self.model = None self.tokenizer = None @@ -217,6 +221,11 @@ class HFInferenceModel(InferenceModel): torch.cuda.empty_cache() except: pass + + def _pre_load(self) -> None: + # HACK: Make model instantiation work without UI parameters + self.model_name = self.model_name or utils.koboldai_vars.model + return super()._pre_load() def _post_load(self) -> None: self.badwordsids = koboldai_settings.badwordsids_default diff --git a/modeling/inference_models/hf_mtj/class.py b/modeling/inference_models/hf_mtj/class.py index c0f70843..a4600465 100644 --- a/modeling/inference_models/hf_mtj/class.py +++ b/modeling/inference_models/hf_mtj/class.py @@ -133,7 +133,8 @@ class model_backend(HFInferenceModel): utils.koboldai_vars.compiling = True def mtj_stopped_compiling_callback() -> None: - print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END) + if utils.koboldai_vars.compiling: + print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END) utils.koboldai_vars.compiling = False def mtj_settings_callback() -> dict: diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 6372858f..82e60304 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -34,6 +34,7 @@ from modeling.stoppers import Stoppers from modeling.post_token_hooks import PostTokenHooks from modeling.inference_models.hf import HFInferenceModel from modeling.inference_model import ( + GenerationMode, GenerationResult, GenerationSettings, ModelCapabilities, @@ -92,7 +93,11 @@ class HFTorchInferenceModel(HFInferenceModel): self.hf_torch = True self.lazy_load = True self.low_mem = False + + # `nobreakmodel` indicates that breakmodel cannot be used, while `breakmodel` + # indicates whether breakmodel is currently being used self.nobreakmodel = False + self.breakmodel = False self.post_token_hooks = [ PostTokenHooks.stream_tokens, @@ -126,8 +131,13 @@ class HFTorchInferenceModel(HFInferenceModel): return ret def get_auxilary_device(self) -> Union[str, int, torch.device]: - return self.breakmodel_config.primary_device - + if self.breakmodel: + return self.breakmodel_config.primary_device + if self.usegpu: + return "cuda:0" + else: + return "cpu" + def _get_target_dtype(self) -> Union[torch.float16, torch.float32]: if self.breakmodel_config.primary_device == "cpu": return torch.float32 @@ -228,9 +238,6 @@ class HFTorchInferenceModel(HFInferenceModel): ) class KoboldLogitsWarperList(LogitsProcessorList): - def __init__(self): - pass - def __call__( lw_self, input_ids: torch.LongTensor, @@ -247,17 +254,14 @@ class HFTorchInferenceModel(HFInferenceModel): ), f"Scores are None; processor '{processor}' is to blame" return scores - def new_get_logits_warper( - beams: int = 1, - ) -> LogitsProcessorList: - return KoboldLogitsWarperList() - def new_sample(self, *args, **kwargs): assert kwargs.pop("logits_warper", None) is not None - kwargs["logits_warper"] = new_get_logits_warper( - beams=1, - ) - if utils.koboldai_vars.newlinemode in ["s", "ns"]: + kwargs["logits_warper"] = KoboldLogitsWarperList() + + if ( + utils.koboldai_vars.newlinemode in ["s", "ns"] + and not m_self.gen_state["allow_eos"] + ): kwargs["eos_token_id"] = -1 kwargs.setdefault("pad_token_id", 2) return new_sample.old_sample(self, *args, **kwargs) @@ -329,7 +333,7 @@ class HFTorchInferenceModel(HFInferenceModel): with torch.no_grad(): start_time = time.time() genout = self.model.generate( - gen_in, + input_ids=gen_in, 
do_sample=True, max_length=min( len(prompt_tokens) + max_new, utils.koboldai_vars.max_length @@ -608,3 +612,9 @@ class HFTorchInferenceModel(HFInferenceModel): self.breakmodel = False self.usegpu = False return + + def get_supported_gen_modes(self) -> List[GenerationMode]: + # This changes a torch patch to disallow eos as a bad word. + return super().get_supported_gen_modes() + [ + GenerationMode.UNTIL_EOS + ] \ No newline at end of file diff --git a/modeling/inference_models/readonly/class.py b/modeling/inference_models/readonly/class.py index 13c38baf..cbdb298d 100644 --- a/modeling/inference_models/readonly/class.py +++ b/modeling/inference_models/readonly/class.py @@ -1,12 +1,10 @@ from __future__ import annotations import torch -import requests import numpy as np from typing import List, Optional, Union import utils -from logger import logger from modeling.inference_model import ( GenerationResult, GenerationSettings, @@ -15,29 +13,46 @@ from modeling.inference_model import ( ) model_backend_name = "Read Only" -model_backend_type = "Read Only" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) +model_backend_type = "Read Only" # This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) -class BasicAPIException(Exception): - """To be used for errors when using the Basic API as an interface.""" + +class DummyHFTokenizerOut: + input_ids = np.array([[]]) + + +class FacadeTokenizer: + def __init__(self): + self._koboldai_header = [] + + def decode(self, _input): + return "" + + def encode(self, input_text): + return [] + + def __call__(self, *args, **kwargs) -> DummyHFTokenizerOut: + return DummyHFTokenizerOut() class model_backend(InferenceModel): def __init__(self) -> None: super().__init__() - # Do not allow API to be served over the API + # Do not allow ReadOnly to be served over the API self.capabilties = ModelCapabilities(api_host=False) - self.tokenizer = self._tokenizer() + self.tokenizer: FacadeTokenizer = None self.model = None self.model_name = "Read Only" - + def is_valid(self, model_name, model_path, menu_path): return model_name == "ReadOnly" - - def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + + def get_requested_parameters( + self, model_name, model_path, menu_path, parameters={} + ): requested_parameters = [] return requested_parameters - + def set_input_parameters(self, parameters): return @@ -46,17 +61,9 @@ class model_backend(InferenceModel): def _initialize_model(self): return - - class _tokenizer(): - def __init__(self): - self._koboldai_header = [] - def decode(self, _input): - return "" - def encode(self, input_text): - return [] def _load(self, save_model: bool = False, initial_load: bool = False) -> None: - self.tokenizer = self.tokenizer + self.tokenizer = FacadeTokenizer() self.model = None utils.koboldai_vars.noai = True @@ -72,7 +79,7 @@ class model_backend(InferenceModel): ): return GenerationResult( model=self, - out_batches=np.array([]), + out_batches=np.array([[]]), prompt=prompt_tokens, is_whole_generation=True, single_line=single_line, diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 69e0d948..eece7d2f 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -51,15 +51,12 @@ import time import zipfile import pickle import torch -import numpy as np -import collections -import _codecs import os from typing import Any, Callable, Dict, 
Optional, Tuple, Type -from torch import Tensor from torch.nn import Module from torch.storage import UntypedStorage +from modeling.pickling import RestrictedUnpickler, use_custom_unpickler from modeling.patches import LazyloadPatches # Safetensors is a dependency for the local version, TPU/Colab doesn't @@ -176,9 +173,6 @@ class TorchLazyTensor(LazyTensor): CheckpointChunkCache.key = self.key ziproot = checkpoint.namelist()[0].split("/")[0] CheckpointChunkCache.handle = checkpoint.open(f"{ziproot}/data/{self.key}", "r") - - - else: # Cache hit. Hip hip hooray! :^) # print(".", end="", flush=True) @@ -239,86 +233,11 @@ class SafetensorsLazyTensor(LazyTensor): self.checkpoint_file, tensor_key=self.key, device=self.location ) -def _patched_rebuild_from_type_v2(func, new_type, args, state): - """A patched version of torch._tensor._rebuild_from_type_v2 that - does not attempt to convert `LazyTensor`s to `torch.Tensor`s.""" - - ret = func(*args) - - # BEGIN PATCH - transformation_ok = isinstance(ret, LazyTensor) and new_type == Tensor - if type(ret) is not new_type and not transformation_ok: - # END PATCH - ret = ret.as_subclass(new_type) - - # Tensor does define __setstate__ even though it doesn't define - # __getstate__. So only use __setstate__ if it is NOT the one defined - # on Tensor - if ( - getattr(ret.__class__, "__setstate__", Tensor.__setstate__) - is not Tensor.__setstate__ - ): - ret.__setstate__(state) - else: - ret = torch._utils._set_obj_state(ret, state) - return ret - -class RestrictedUnpickler(pickle.Unpickler): - def original_persistent_load(self, saved_id): - return super().persistent_load(saved_id) - - def forced_persistent_load(self, saved_id): - if saved_id[0] != "storage": - raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'") - return self.original_persistent_load(saved_id) - - def find_class(self, module, name): - if module == "collections" and name == "OrderedDict": - return collections.OrderedDict - elif module == "torch._utils" and name == "_rebuild_tensor_v2": - return torch._utils._rebuild_tensor_v2 - elif module == "torch._tensor" and name == "_rebuild_from_type_v2": - return _patched_rebuild_from_type_v2 - elif module == "torch" and name in ( - "DoubleStorage", - "FloatStorage", - "HalfStorage", - "LongStorage", - "IntStorage", - "ShortStorage", - "CharStorage", - "ByteStorage", - "BoolStorage", - "BFloat16Storage", - "Tensor", - ): - return getattr(torch, name) - elif module == "numpy.core.multiarray" and name == "scalar": - return np.core.multiarray.scalar - elif module == "numpy" and name == "dtype": - return np.dtype - elif module == "_codecs" and name == "encode": - return _codecs.encode - else: - # Forbid everything else. - qualified_name = name if module == "__builtin__" else f"{module}.{name}" - raise pickle.UnpicklingError( - f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. 
If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}" - ) - - def load(self, *args, **kwargs): - self.original_persistent_load = getattr( - self, "persistent_load", pickle.Unpickler.persistent_load - ) - self.persistent_load = self.forced_persistent_load - return super().load(*args, **kwargs) - class _LazyUnpickler(RestrictedUnpickler): lazy_loaded_storages: Dict[str, LazyTensor] def __init__(self, *args, **kwargs): - # print(args, kwargs) self.lazy_loaded_storages = {} return super().__init__(*args, **kwargs) @@ -376,7 +295,7 @@ def patch_safetensors(callback): # (70 tensors/s -> 65 tensor/s). The memory savings probably # shouldn't be the happening, maybe there's a memory leak # somewhere in our pipeline with CPU tensors. - intermediary_device = "cuda" + intermediary_device = "cuda:0" else: intermediary_device = "cpu" @@ -409,27 +328,9 @@ def patch_safetensors(callback): return tensors transformers.modeling_utils.safe_load_file = safetensors_load + safetensors.torch.load_file = safetensors_load -@contextlib.contextmanager -def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler): - try: - old_unpickler = pickle.Unpickler - pickle.Unpickler = unpickler - - old_pickle_load = pickle.load - - def new_pickle_load(*args, **kwargs): - return pickle.Unpickler(*args, **kwargs).load() - - pickle.load = new_pickle_load - - yield - - finally: - pickle.Unpickler = old_unpickler - pickle.load = old_pickle_load - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): for hook in self._load_state_dict_pre_hooks.values(): hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) diff --git a/modeling/logits_processors.py b/modeling/logits_processors.py index 20a18026..a221b4d9 100644 --- a/modeling/logits_processors.py +++ b/modeling/logits_processors.py @@ -233,6 +233,8 @@ class PhraseBiasLogitsProcessor: token_seqs = self._get_token_sequence(phrase) variant_deltas = {} for token_seq in token_seqs: + if not token_seq: + continue bias_index = self._find_intersection(input_ids, token_seq) # Ensure completion after completion_threshold tokens @@ -267,6 +269,14 @@ class PhraseBiasLogitsProcessor: for batch in range(scores_shape[0]): for token, bias in self._get_biased_tokens(input_ids[batch]).items(): - scores[batch][token] += bias + if bias > 0 and bool(scores[batch][token].isneginf()): + # Adding bias to -inf will do NOTHING!!! So just set it for + # now. There may be more mathishly correct way to do this + # but it'll work. Also, make sure the bias is actually + # positive. Don't give a -inf token more chance by setting + # it to -0.5! 
+ scores[batch][token] = bias + else: + scores[batch][token] += bias return scores diff --git a/modeling/patches.py b/modeling/patches.py index 6e2168f2..5664ec07 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -129,15 +129,33 @@ def patch_transformers_generation() -> None: class LazyloadPatches: + class StateDictFacade(dict): + def __init__(self, state_dict): + self.update(state_dict) + + def __getitem__(self, name): + return super().__getitem__(name).materialize(map_location="cuda:0") + old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model + torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict def __enter__() -> None: transformers.modeling_utils._load_state_dict_into_meta_model = ( LazyloadPatches._load_state_dict_into_meta_model ) + torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict def __exit__(exc_type, exc_value, exc_traceback) -> None: transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict + torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict + + def _torch_load_from_state_dict(self, state_dict, *args, **kwargs): + return LazyloadPatches.torch_old_load_from_state_dict( + self, + LazyloadPatches.StateDictFacade(state_dict), + *args, + **kwargs + ) def _load_state_dict_into_meta_model( model, diff --git a/modeling/pickling.py b/modeling/pickling.py new file mode 100644 index 00000000..ccdb3b40 --- /dev/null +++ b/modeling/pickling.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import collections +import contextlib +import pickle + +import _codecs +import numpy as np +import torch +from torch import Tensor + +import modeling + + +def _patched_rebuild_from_type_v2(func, new_type, args, state): + """A patched version of torch._tensor._rebuild_from_type_v2 that + does not attempt to convert `LazyTensor`s to `torch.Tensor`s.""" + + ret = func(*args) + + # BEGIN PATCH + transformation_ok = isinstance(ret, modeling.lazy_loader.LazyTensor) and new_type == Tensor + if type(ret) is not new_type and not transformation_ok: + # END PATCH + ret = ret.as_subclass(new_type) + + # Tensor does define __setstate__ even though it doesn't define + # __getstate__. 
So only use __setstate__ if it is NOT the one defined + # on Tensor + if ( + getattr(ret.__class__, "__setstate__", Tensor.__setstate__) + is not Tensor.__setstate__ + ): + ret.__setstate__(state) + else: + ret = torch._utils._set_obj_state(ret, state) + return ret + + +class RestrictedUnpickler(pickle.Unpickler): + def original_persistent_load(self, saved_id): + return super().persistent_load(saved_id) + + def forced_persistent_load(self, saved_id): + if saved_id[0] != "storage": + raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'") + return self.original_persistent_load(saved_id) + + def find_class(self, module, name): + if module == "collections" and name == "OrderedDict": + return collections.OrderedDict + elif module == "torch._utils" and name in ( + "_rebuild_tensor_v2", + "_rebuild_meta_tensor_no_storage", + ): + return getattr(torch._utils, name) + elif module == "torch._tensor" and name == "_rebuild_from_type_v2": + return _patched_rebuild_from_type_v2 + elif module == "torch" and name in ( + "DoubleStorage", + "FloatStorage", + "HalfStorage", + "LongStorage", + "IntStorage", + "ShortStorage", + "CharStorage", + "ByteStorage", + "BoolStorage", + "BFloat16Storage", + "Tensor", + "float16", + ): + return getattr(torch, name) + elif module == "numpy.core.multiarray" and name == "scalar": + return np.core.multiarray.scalar + elif module == "numpy" and name == "dtype": + return np.dtype + elif module == "_codecs" and name == "encode": + return _codecs.encode + else: + # Forbid everything else. + qualified_name = name if module == "__builtin__" else f"{module}.{name}" + raise pickle.UnpicklingError( + f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}" + ) + + def load(self, *args, **kwargs): + self.original_persistent_load = getattr( + self, "persistent_load", pickle.Unpickler.persistent_load + ) + self.persistent_load = self.forced_persistent_load + return super().load(*args, **kwargs) + + +@contextlib.contextmanager +def use_custom_unpickler(unpickler: pickle.Unpickler = RestrictedUnpickler): + try: + old_unpickler = pickle.Unpickler + pickle.Unpickler = unpickler + + old_pickle_load = pickle.load + + def new_pickle_load(*args, **kwargs): + return pickle.Unpickler(*args, **kwargs).load() + + pickle.load = new_pickle_load + yield + + finally: + pickle.Unpickler = old_unpickler + pickle.load = old_pickle_load diff --git a/modeling/stoppers.py b/modeling/stoppers.py index 94c09e85..e4c20be6 100644 --- a/modeling/stoppers.py +++ b/modeling/stoppers.py @@ -3,15 +3,12 @@ from __future__ import annotations import torch import utils -from modeling.inference_model import ( - InferenceModel, -) - +from modeling import inference_model class Stoppers: @staticmethod def core_stopper( - model: InferenceModel, + model: inference_model.InferenceModel, input_ids: torch.LongTensor, ) -> bool: if not utils.koboldai_vars.inference_config.do_core: @@ -62,7 +59,7 @@ class Stoppers: @staticmethod def dynamic_wi_scanner( - model: InferenceModel, + model: inference_model.InferenceModel, input_ids: torch.LongTensor, ) -> bool: if not utils.koboldai_vars.inference_config.do_dynamic_wi: @@ -93,7 +90,7 @@ class Stoppers: @staticmethod def chat_mode_stopper( - model: InferenceModel, + model: inference_model.InferenceModel, input_ids: torch.LongTensor, ) -> bool: if not utils.koboldai_vars.chatmode: @@ -118,7 +115,7 @@ class Stoppers: @staticmethod def 
stop_sequence_stopper( - model: InferenceModel, + model: inference_model.InferenceModel, input_ids: torch.LongTensor, ) -> bool: @@ -126,7 +123,12 @@ class Stoppers: # null_character = model.tokenizer.encode(chr(0))[0] if "completed" not in model.gen_state: model.gen_state["completed"] = [False] * len(input_ids) - + if utils.koboldai_vars.adventure: + extra_options = [">", "\n>"] + for option in extra_options: + if option not in utils.koboldai_vars.stop_sequence: + utils.koboldai_vars.stop_sequence.append(option) + #one issue is that the stop sequence may not actual align with the end of token #if its a subsection of a longer token for stopper in utils.koboldai_vars.stop_sequence: @@ -140,19 +142,31 @@ class Stoppers: if all(model.gen_state["completed"]): utils.koboldai_vars.generated_tkns = utils.koboldai_vars.genamt del model.gen_state["completed"] + if utils.koboldai_vars.adventure: # Remove added adventure mode stop sequences + for option in extra_options: + if option in utils.koboldai_vars.stop_sequence: + utils.koboldai_vars.stop_sequence.remove(option) return True return False @staticmethod def singleline_stopper( - model: InferenceModel, + model: inference_model.InferenceModel, input_ids: torch.LongTensor, ) -> bool: - """If singleline mode is enabled, it's pointless to generate output beyond the first newline.""" + """Stop on occurrences of newlines **if singleline is enabled**.""" + # It might be better just to do this further up the line if not utils.koboldai_vars.singleline: return False + return Stoppers.newline_stopper(model, input_ids) + @staticmethod + def newline_stopper( + model: inference_model.InferenceModel, + input_ids: torch.LongTensor, + ) -> bool: + """Stop on occurrences of newlines.""" # Keep track of presence of newlines in each sequence; we cannot stop a # batch member individually, so we must wait for all of them to contain # a newline. @@ -167,3 +181,30 @@ class Stoppers: del model.gen_state["newline_in_sequence"] return True return False + + @staticmethod + def sentence_end_stopper( + model: inference_model.InferenceModel, + input_ids: torch.LongTensor, + ) -> bool: + """Stops at the end of sentences.""" + + # TODO: Make this more robust + SENTENCE_ENDS = [".", "?", "!"] + + # We need to keep track of stopping for each batch, since we can't stop + # one individually. + if "sentence_end_sequence" not in model.gen_state: + model.gen_state["sentence_end_sequence"] = [False] * len(input_ids) + + for sequence_idx, batch_sequence in enumerate(input_ids): + decoded = model.tokenizer.decode(batch_sequence[-1]) + for end in SENTENCE_ENDS: + if end in decoded: + model.gen_state["sentence_end_sequence"][sequence_idx] = True + break + + if all(model.gen_state["sentence_end_sequence"]): + del model.gen_state["sentence_end_sequence"] + return True + return False \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 3978eda3..8dc7f9a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,7 +39,6 @@ pytest-metadata==2.0.4 requests-mock==1.10.0 safetensors==0.3.1 git+https://github.com/0cc4m/hf_bleeding_edge/ ---find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4 einops peft==0.3.0 -scipy \ No newline at end of file +scipy diff --git a/static/klite.html b/static/klite.html index 4b62dc2f..33ba94f0 100644 --- a/static/klite.html +++ b/static/klite.html @@ -3,7 +3,7 @@
(HTML markup lines lost; only the "Disconnected" status text survived)
@@ -53,13 +53,13 @@ (HTML markup lines lost)
diff --git a/templates/popups.html b/templates/popups.html index 5dfa7378..c0656a22 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -70,12 +70,12 @@ (HTML markup lines lost)
diff --git a/themes/tweaks/hide-welcome-logo.css b/themes/tweaks/hide-welcome-logo.css index 5e35d101..e0964397 100644 --- a/themes/tweaks/hide-welcome-logo.css +++ b/themes/tweaks/hide-welcome-logo.css @@ -1 +1 @@ -#welcome_text { display:none; pointer-events: none } +#welcome-logo { display:none; pointer-events: none } diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index 5a5271e2..c49e27da 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -1116,10 +1116,11 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword thread_resources_env = maps.ResourceEnv(maps.Mesh(devices, ('dp', 'mp')), ()) maps.thread_resources.env = thread_resources_env if initial_load: - logger.message(f"KoboldAI has finished loading and is available at the following link for UI 1: {koboldai_vars.cloudflare_link}") - logger.message(f"KoboldAI has finished loading and is available at the following link for UI 2: {koboldai_vars.cloudflare_link}/new_ui") - logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite") - logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api") + logger.message(f"KoboldAI is still loading your model but is already available at the following link: {koboldai_vars.cloudflare_link}") + logger.message(f"KoboldAI is still loading your model but is already available at the following link for the Classic UI: {koboldai_vars.cloudflare_link}/classic") + logger.message(f"KoboldAI is still loading your model but is already available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite") + logger.message(f"KoboldAI is still loading your model but is already available at the following link for the API: [Loading Model...]") + logger.message(f"While the model loads you can use the above links to begin setting up your session; for generations you must wait until it has finished loading.") global badwords # These are the tokens that we don't want the AI to ever write @@ -1302,7 +1303,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False) except Exception as e: model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): @@ -1317,7 +1318,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False) except Exception as e: model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") else: @@
-1332,7 +1333,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False) except Exception as e: model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache")
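
For reference, here is a minimal usage sketch of the restricted-unpickling helpers this patch moves into modeling/pickling.py. The wrapper function name and the checkpoint path are illustrative assumptions, not part of the patch itself:

```python
import torch

from modeling.pickling import RestrictedUnpickler, use_custom_unpickler


def safe_torch_load(path: str):
    # While the context manager is active, pickle.Unpickler and pickle.load are
    # temporarily swapped for RestrictedUnpickler, so torch.load's pickle-based
    # path can only resolve the allow-listed classes in find_class(); anything
    # else raises pickle.UnpicklingError instead of importing arbitrary code.
    with use_custom_unpickler(RestrictedUnpickler):
        return torch.load(path, map_location="cpu")


# Hypothetical call site:
# state_dict = safe_torch_load("models/my-model/pytorch_model.bin")
```

This mirrors how aiserver.py and modeling/lazy_loader.py import these helpers elsewhere in the patch.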