diff --git a/aiserver.py b/aiserver.py
index fc0bfcda..82e9a3f5 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -784,6 +784,7 @@ parser.add_argument("--host", action='store_true', help="Optimizes KoboldAI for
 parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable")
 parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
 parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)")
+parser.add_argument("--revision", help="Specify the model revision for huggingface models (can be a git branch/tag name or a git commit hash)")
 parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.")
 parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS)
 parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS)
@@ -795,6 +796,7 @@ parser.add_argument("--colab", action='store_true', help="Optimize for Google Co
 parser.add_argument("--nobreakmodel", action='store_true', help="Disables Breakmodel support completely.")
 parser.add_argument("--unblock", action='store_true', default=False, help="Unblocks the KoboldAI port to be accessible from other machines without optimizing for remote play (It is recommended to use --host instead)")
 parser.add_argument("--quiet", action='store_true', default=False, help="If present will suppress any story related text from showing on the console")
+parser.add_argument("--no_aria2", action='store_true', default=False, help="Prevents KoboldAI from using aria2 to download huggingface models more efficiently, in case aria2 is causing you issues")
 parser.add_argument("--lowmem", action='store_true', help="Extra Low Memory loading for the GPU, slower but memory does not peak to twice the usage")
 parser.add_argument("--savemodel", action='store_true', help="Saves the model to the models folder even if --colab is used (Allows you to save models to Google Drive)")
 args: argparse.Namespace = None
@@ -805,6 +807,7 @@ else:
     args = parser.parse_args()
 
 vars.model = args.model;
+vars.revision = args.revision
 
 if args.colab:
     args.remote = True;
@@ -867,19 +870,19 @@ if(vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMe
     from transformers import AutoConfig
     if(os.path.isdir(vars.custmodpth.replace('/', '_'))):
         try:
-            model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), cache_dir="cache/")
+            model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), revision=vars.revision, cache_dir="cache")
             vars.model_type = model_config.model_type
         except ValueError as e:
             vars.model_type = "not_found"
     elif(os.path.isdir("models/{}".format(vars.custmodpth.replace('/', '_')))):
         try:
-            model_config = AutoConfig.from_pretrained("models/{}".format(vars.custmodpth.replace('/', '_')), cache_dir="cache/")
+            model_config = AutoConfig.from_pretrained("models/{}".format(vars.custmodpth.replace('/', '_')), revision=vars.revision, cache_dir="cache")
             vars.model_type = model_config.model_type
         except ValueError as e:
             vars.model_type = "not_found"
     else:
         try:
-            model_config = AutoConfig.from_pretrained(vars.custmodpth, cache_dir="cache/")
+            model_config = AutoConfig.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
             vars.model_type = model_config.model_type
         except ValueError as e:
             vars.model_type = "not_found"
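
For readers skimming the diff: a minimal sketch (not part of the change) of what the new `--revision` flag feeds into, e.g. launching with `python aiserver.py --model EleutherAI/gpt-neo-2.7B --revision main`. The model name and revision below are illustrative.

```python
# Illustrative only: how a --revision value flows into transformers.
# A revision can be a branch name, a tag, or a commit hash; None keeps
# the pre-change behaviour of loading the repo's default branch.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "EleutherAI/gpt-neo-2.7B",   # example model, stands in for vars.custmodpth
    revision="main",             # example value of vars.revision
    cache_dir="cache",
)
print(config.model_type)  # "gpt_neo"
```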
"OAI", "Go import transformers.generation_utils from transformers import __version__ as transformers_version + from transformers import PreTrainedModel + old_from_pretrained = PreTrainedModel.from_pretrained.__func__ + @classmethod + def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + if not args.no_aria2: + utils.aria2_hook(pretrained_model_name_or_path, **kwargs) + return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) + PreTrainedModel.from_pretrained = new_from_pretrained + # Lazy loader import torch_lazy_loader def get_lazy_load_callback(n_layers, convert_to_float16=True): @@ -1425,8 +1437,8 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go model_config = open(vars.custmodpth + "/config.json", "r") js = json.load(model_config) with(maybe_use_float16()): - model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/") - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache/") + model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? If so, use GPU, otherwise fall back to CPU if(vars.hascuda and vars.usegpu): @@ -1461,45 +1473,45 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go lowmem = {} if(os.path.isdir(vars.custmodpth)): try: - tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, cache_dir="cache") + tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") except Exception as e: try: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) except Exception as e: - model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache", **lowmem) + model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))): try: - tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache") + tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") except Exception as e: try: - tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache", **lowmem) + model = 
AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) except Exception as e: - model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache", **lowmem) + model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) else: try: - tokenizer = AutoTokenizer.from_pretrained(vars.model, cache_dir="cache") + tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") except Exception as e: try: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) except Exception as e: - model = GPTNeoForCausalLM.from_pretrained(vars.model, cache_dir="cache", **lowmem) + model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) if not args.colab or args.savemodel: import shutil model = model.half() - model.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) + model.save_pretrained("models/{}".format(vars.model.replace('/', '_')), max_shard_size="500MiB") tokenizer.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) shutil.rmtree("cache/") @@ -1533,8 +1545,17 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go else: from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") else: + from transformers import PreTrainedModel + old_from_pretrained = PreTrainedModel.from_pretrained.__func__ + @classmethod + def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + if not args.no_aria2: + utils.aria2_hook(pretrained_model_name_or_path, **kwargs) + return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) + PreTrainedModel.from_pretrained = new_from_pretrained + def tpumtjgetsofttokens(): soft_tokens = None if(vars.sp is None): @@ -1623,11 +1644,11 @@ else: # If we're running Colab or OAI, we still need a tokenizer. 
if(vars.model == "Colab"): from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache") loadsettings() elif(vars.model == "OAI"): from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") loadsettings() # Load the TPU backend if requested elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): @@ -1813,7 +1834,7 @@ def lua_decode(tokens): if("tokenizer" not in globals()): from transformers import GPT2TokenizerFast global tokenizer - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") return utils.decodenewlines(tokenizer.decode(tokens)) #==================================================================# @@ -1825,7 +1846,7 @@ def lua_encode(string): if("tokenizer" not in globals()): from transformers import GPT2TokenizerFast global tokenizer - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") return tokenizer.encode(utils.encodenewlines(string), max_length=int(4e9), truncation=True) #==================================================================# @@ -3081,7 +3102,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, if("tokenizer" not in globals()): from transformers import GPT2TokenizerFast global tokenizer - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") # Calculate token budget prompttkns = tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', vars.prompt)), max_length=int(2e9), truncation=True) diff --git a/colabkobold.sh b/colabkobold.sh index 1764a166..4c41675a 100644 --- a/colabkobold.sh +++ b/colabkobold.sh @@ -162,7 +162,7 @@ if [ "$init" != "skip" ]; then fi # Make sure Colab has the system dependencies - sudo apt install netbase -y + sudo apt install netbase aria2 -y npm install -g localtunnel fi @@ -186,8 +186,7 @@ fi #Download routine for Aria2c scripts if [ ! 
diff --git a/colabkobold.sh b/colabkobold.sh
index 1764a166..4c41675a 100644
--- a/colabkobold.sh
+++ b/colabkobold.sh
@@ -162,7 +162,7 @@ if [ "$init" != "skip" ]; then
     fi
 
     # Make sure Colab has the system dependencies
-    sudo apt install netbase -y
+    sudo apt install netbase aria2 -y
     npm install -g localtunnel
 fi
 
@@ -186,8 +186,7 @@ fi
 
 #Download routine for Aria2c scripts
 if [ ! -z ${aria2+x} ]; then
-    apt install aria2 -y
-    curl -L $aria2 | aria2c -c -i- -d$dloc --user-agent=KoboldAI --file-allocation=none
+    curl -L $aria2 | aria2c -x 10 -s 10 -j 10 -c -i- -d$dloc --user-agent=KoboldAI --file-allocation=none
 fi
 
 #Extract the model with 7z
revision=vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") except Exception as e: - model = GPTNeoForCausalLM.from_pretrained(vars.model, cache_dir="cache") + model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") #network.state = network.move_xmap(network.state, np.zeros(cores_per_replica)) diff --git a/utils.py b/utils.py index 1c44c27f..38066ed0 100644 --- a/utils.py +++ b/utils.py @@ -1,5 +1,11 @@ from threading import Timer import re +import shutil +import json +import subprocess +import tempfile +import requests +import os vars = None @@ -125,3 +131,83 @@ def decodenewlines(txt): if(vars.newlinemode == "s"): return txt.replace("", '\n') return txt + +#==================================================================# +# Downloads sharded huggingface checkpoints using aria2c if possible +#==================================================================# +def aria2_hook(pretrained_model_name_or_path: str, force_download=False, cache_dir=None, proxies=None, resume_download=False, local_files_only=False, use_auth_token=None, user_agent=None, revision=None, mirror=None, **kwargs): + import transformers + import transformers.modeling_utils + from huggingface_hub import HfFolder + if shutil.which("aria2c") is None: # Don't do anything if aria2 is not installed + return + if os.path.isdir(pretrained_model_name_or_path) or os.path.isfile(pretrained_model_name_or_path) or transformers.modeling_utils.is_remote_url(pretrained_model_name_or_path) or os.path.isfile(pretrained_model_name_or_path + ".index"): + return + if proxies: + print("WARNING: KoboldAI does not support using aria2 to download models from huggingface.co through a proxy. 
Disabling aria2 download mode.") + return + if use_auth_token: + if isinstance(use_auth_token, str): + token = use_auth_token + else: + token = HfFolder.get_token() + if token is None: + raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.") + _cache_dir = str(cache_dir) if cache_dir is not None else transformers.TRANSFORMERS_CACHE + sharded = False + headers = {"user-agent": transformers.file_utils.http_user_agent(user_agent)} + if use_auth_token: + headers["authorization"] = f"Bearer {use_auth_token}" + def is_cached(url): + try: + transformers.file_utils.get_from_cache(url, cache_dir=cache_dir, local_files_only=True) + except FileNotFoundError: + return False + return True + while True: # Try to get the huggingface.co URL of the model's pytorch_model.bin or pytorch_model.bin.index.json file + try: + filename = transformers.modeling_utils.WEIGHTS_INDEX_NAME if sharded else transformers.modeling_utils.WEIGHTS_NAME + except AttributeError: + return + url = transformers.file_utils.hf_bucket_url(pretrained_model_name_or_path, filename, revision=revision, mirror=mirror) + if is_cached(url) or requests.head(url, allow_redirects=True, proxies=proxies, headers=headers): + break + if sharded: + return + else: + sharded = True + if not sharded: # If the model has a pytorch_model.bin file, that's the only file to download + filenames = [transformers.modeling_utils.WEIGHTS_NAME] + else: # Otherwise download the pytorch_model.bin.index.json and then let aria2 download all the pytorch_model-#####-of-#####.bin files mentioned inside it + map_filename = transformers.file_utils.cached_path(url, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, user_agent=user_agent) + with open(map_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + urls = [transformers.file_utils.hf_bucket_url(pretrained_model_name_or_path, n, revision=revision, mirror=mirror) for n in filenames] + if not force_download: + if all(is_cached(u) for u in urls): + return + elif local_files_only: + raise FileNotFoundError("Cannot find the requested files in the cached path and outgoing traffic has been disabled. 
+            raise FileNotFoundError("Cannot find the requested files in the cached path and outgoing traffic has been disabled. To enable model look-ups and downloads online, set 'local_files_only' to False.")
+    etags = [h.get("X-Linked-Etag") or h.get("ETag") for u in urls for h in [requests.head(u, headers=headers, allow_redirects=False, proxies=proxies, timeout=10).headers]]
+    filenames = [transformers.file_utils.url_to_filename(u, t) for u, t in zip(urls, etags)]
+    if force_download:
+        for n in filenames:
+            path = os.path.join(_cache_dir, n + ".json")
+            if os.path.exists(path):
+                os.remove(path)
+    aria2_config = "\n".join(f"{u}\n out={n}" for u, n in zip(urls, filenames)).encode()
+    with tempfile.NamedTemporaryFile("w+b", delete=False) as f:
+        f.write(aria2_config)
+        f.flush()
+        p = subprocess.Popen(["aria2c", "-x", "10", "-s", "10", "-j", "10", "--disable-ipv6", "--file-allocation=none", "-d", _cache_dir, "-i", f.name, "-U", transformers.file_utils.http_user_agent(user_agent)] + (["-c"] if not force_download else []) + ([f"--header='Authorization: Bearer {token}'"] if use_auth_token else []), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        for line in p.stdout:
+            print(line.decode(), end="", flush=True)
+    path = f.name
+    try:
+        os.remove(path)
+    except OSError:
+        pass
+    for u, t, n in zip(urls, etags, filenames):
+        with open(os.path.join(_cache_dir, n + ".json"), "w") as f:
+            json.dump({"url": u, "etag": t}, f)
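
For reference, a stripped-down sketch of the download step at the end of `aria2_hook`: write an aria2 input file listing each URL with its target filename, then stream aria2c's output. The URLs and names below are placeholders; in the hook they come from the checkpoint's weight map and the transformers cache-naming scheme, and the flags mirror the ones used above.

```python
# Stand-alone sketch of the aria2c invocation pattern used by aria2_hook.
import os
import subprocess
import tempfile

downloads = {  # placeholder URLs and output names
    "https://example.com/pytorch_model-00001-of-00002.bin": "shard-00001",
    "https://example.com/pytorch_model-00002-of-00002.bin": "shard-00002",
}
cache_dir = "cache"

# aria2 input-file format: a URL on one line, indented "out=" option on the next.
aria2_config = "\n".join(f"{url}\n out={name}" for url, name in downloads.items()).encode()
with tempfile.NamedTemporaryFile("w+b", delete=False) as f:
    f.write(aria2_config)
    f.flush()
    p = subprocess.Popen(
        ["aria2c", "-x", "10", "-s", "10", "-j", "10", "--file-allocation=none",
         "-d", cache_dir, "-i", f.name, "-c"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    for line in p.stdout:          # stream aria2c's progress to the console
        print(line.decode(), end="", flush=True)
    p.wait()
os.remove(f.name)                  # the input file is only needed for this run
```

After the transfer, the hook writes a small `{"url": ..., "etag": ...}` JSON sidecar next to each downloaded file, which is how transformers' cache lookup recognises the aria2-fetched files as already cached.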