From fa443487e3d3910552c91eb536eeed2722c3722b Mon Sep 17 00:00:00 2001
From: somebody
Date: Sat, 24 Sep 2022 20:46:52 -0500
Subject: [PATCH] RWKV Work

---
 .gitignore  |   5 ++
 aiserver.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 188 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 070fa833..f49f9a7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,11 @@ flask_session
 accelerate-disk-cache
 .ipynb_checkpoints
 
+# Temporary until HF port
+!models/RWKV4
+models/RWKV4/20B_tokenizer.json
+models/RWKV4/models
+
 # Ignore PyCharm project files.
 .idea
 
diff --git a/aiserver.py b/aiserver.py
index e2dd6905..7083dd74 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -140,6 +140,7 @@ model_menu = {
         ["Untuned Fairseq Dense", "fsdlist", "", True],
         ["Untuned Bloom", "bloomlist", "", True],
         ["Untuned XGLM", "xglmlist", "", True],
+        ["Untuned RWKV-4", "rwkvlist", "", True],
         ["Untuned GPT2", "gpt2list", "", True],
         ["Online Services", "apilist", "", True],
         ["Read Only (No AI)", "ReadOnly", "", False]
@@ -244,6 +245,19 @@ model_menu = {
         ["XGLM 564M", "facebook/xglm-564M", "4GB", False],
         ["Return to Main Menu", "mainmenu", "", True],
     ],
+    'rwkvlist': [
+        ["RWKV-4 7B (GPU)", "RWKV-7B-GPU", "??GB", False],
+        ["RWKV-4 7B (CPU)", "RWKV-7B-CPU", "??GB", False],
+        ["RWKV-4 3B (GPU)", "RWKV-3B-GPU", "?GB", False],
+        ["RWKV-4 3B (CPU)", "RWKV-3B-CPU", "?GB", False],
+        ["RWKV-4 1.5B (GPU)", "RWKV-1B5-GPU", "9GB", False],
+        ["RWKV-4 1.5B (CPU)", "RWKV-1B5-CPU", "6GB", False],
+        ["RWKV-4 430M (GPU)", "RWKV-430M-GPU", "?GB", False],
+        ["RWKV-4 430M (CPU)", "RWKV-430M-CPU", "?GB", False],
+        ["RWKV-4 169M (GPU)", "RWKV-169M-GPU", "?GB", False],
+        ["RWKV-4 169M (CPU)", "RWKV-169M-CPU", "?GB", False],
+        ["Return to Main Menu", "mainmenu", "", True],
+    ],
     'apilist': [
         ["GooseAI API (requires API key)", "GooseAI", "", False],
         ["OpenAI API (requires API key)", "OAI", "", False],
@@ -1464,6 +1478,8 @@ def get_model_info(model, directory=""):
             print(":(")
             pass
         key = True
+    elif model.startswith("RWKV"):
+        pass
     elif model == 'ReadOnly':
         pass
     elif not utils.HAS_ACCELERATE and not torch.cuda.is_available():
@@ -2351,7 +2367,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
 
     # If transformers model was selected & GPU available, ask to use CPU or GPU
-    if(koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"] and not koboldai_vars.model.startswith("RWKV")):
         koboldai_vars.allowsp = True
         # Test for GPU support
@@ -2443,7 +2459,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         koboldai_vars.noai = True
 
     # Start transformers and create pipeline
-    if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if koboldai_vars.model.startswith("RWKV"):
+        _, model_class, device = koboldai_vars.model.split("-")
+        model, tokenizer = rwkv_init(
+            model_class=model_class,
+            use_gpu=(device == "GPU")
+        )
+
+        global breakmodel
+        import breakmodel
+    elif (not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
         if(not koboldai_vars.noai):
             logger.init("Transformers", status='Starting')
             for m in ("GPTJModel", "XGLMModel"):
@@ -5001,7 +5026,13 @@ def core_generate(text: list, min: int, max: int, found_entries: set):
 
             genout = result.encoded
             already_generated += len(genout[0]) - 1
-            assert already_generated <= koboldai_vars.genamt
+
+            try:
+                assert already_generated <= koboldai_vars.genamt
+            except AssertionError:
+                print("AlreadyGenerated", already_generated)
+                print("genamt", koboldai_vars.genamt)
+                raise
 
             if result.is_whole_generation:
                 break
@@ -5165,6 +5196,16 @@ def raw_generate(
         return GenerationResult(
             out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True
         )
+    elif koboldai_vars.model.startswith("RWKV"):
+        batch_encoded = rwkv_raw_generate(
+            prompt_tokens=prompt_tokens,
+            max_new=max_new,
+            batch_count=batch_count,
+            gen_settings=gen_settings
+        )
+        return GenerationResult(
+            out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True, output_includes_prompt=True
+        )
 
     # Torch HF
     batch_encoded = torch_raw_generate(
@@ -5555,6 +5596,145 @@ def api_raw_generate(
 
     genout = [obj["text"] for obj in js["results"]]
     return np.array([tokenizer.encode(x) for x in genout])
 
+
+def rwkv_raw_generate(
+    prompt_tokens: List[int],
+    max_new: int,
+    batch_count: int,
+    gen_settings: GenerationSettings,
+):
+    import types
+
+    model.clear()
+    context = list(prompt_tokens)
+
+    input_length = len(prompt_tokens)
+
+    # TODO: Not needed every run? I think this is creating that huge wait time
+    # between generations.
+    init_state = types.SimpleNamespace()
+    for i in range(input_length):
+        x = context[:i+1]
+        if i == input_length - 1:
+            init_state.out = model.run(x)
+        else:
+            model.run(x)
+    model.save(init_state)
+
+    for ni, i in enumerate(range(input_length, input_length + max_new)):
+        x = context[:i+1]
+        x = x[-model.ctx_len:]
+
+        if i == input_length:
+            out = copy.deepcopy(init_state.out)
+        else:
+            out = model.run(x)
+
+        # Don't generate EOS
+        out[0] = -9999999
+
+        char = tokenizer.sample_logits(
+            out=out,
+            x=x,
+            ctx_len=model.ctx_len,
+            temperature=gen_settings.temp,
+            top_p=gen_settings.top_p,
+        )
+        char = char.item()
+        context.append(char)
+
+        if koboldai_vars.output_streaming:
+            koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(char))])
+
+        # HACK
+        if ni > max_new:
+            break
+
+    return np.array([context])
+
+
+@dataclass
+class RWKVConfig:
+    n_layer: int
+    n_embed: int
+    ctx_len: int
+
+
+def rwkv_init(model_class: str, use_gpu: bool = False):
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    os.environ["RWKV_FLOAT_MODE"] = "bf16"
+
+    logger.info("[RWKV] RWKV support is in super-duper-uber-schmoober alpha and will ignore many options.")
+
+    device = "cpu"
+
+    if use_gpu:
+        logger.warning("[RWKV] Using GPU. This may not work out of the box and may require significant setup.")
+        device = "cuda"
+
+    os.environ["RWKV_RUN_DEVICE"] = device
+
+    TOKENIZER_PATH = "models/RWKV4/20B_tokenizer.json"
+    MODEL_DIR = "models/RWKV4/models"
+
+    model_files = os.listdir(MODEL_DIR)
+    matching_models = [f for f in model_files if f.startswith(f"RWKV-4-Pile-{model_class}")]
+    if not matching_models:
+        raise RuntimeError(f"No models of class '{model_class}' found in '{MODEL_DIR}'. Did you rename the model?")
+    model_path = os.path.join(MODEL_DIR, sorted(matching_models)[-1])
+
+    model_config = {
+        "169M": RWKVConfig(n_layer=12, n_embed=768, ctx_len=1024),
+        "430M": RWKVConfig(n_layer=24, n_embed=1024, ctx_len=1024),
+        "1B5": RWKVConfig(n_layer=24, n_embed=2048, ctx_len=1024),
+        "3B": RWKVConfig(n_layer=32, n_embed=2560, ctx_len=1024),
+        "7B": RWKVConfig(n_layer=32, n_embed=4096, ctx_len=1024),
+    }.get(model_class)
+
+    if not model_config:
+        raise RuntimeError(f"No config for model '{model_class}' found!")
+
+    if not os.path.exists(TOKENIZER_PATH):
+        raise RuntimeError(f"Can't find tokenizer at '{TOKENIZER_PATH}'. Did you download it and put it at that location?")
+
+    # Model stuff
+    from models.RWKV4.src.model_run import RWKV_RNN
+    from transformers import PreTrainedTokenizerFast
+    from torch.nn import functional as F
+
+    model = RWKV_RNN(
+        model_path.split(".")[0],
+        device,
+        "RWKV",
+        model_config.n_layer,
+        model_config.n_embed,
+        model_config.ctx_len,
+    )
+    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_PATH)
+
+    # We'll just patch tokenizer ourselves to make it easier
+    def _sample_logits(self, out, x, ctx_len, temperature, top_p):
+        last_char = int(x[-1])
+        probs = F.softmax(torch.tensor(out), dim=-1)
+        sorted_probs, s_index = torch.sort(probs, descending=True)
+
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()
+        cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])
+
+        probs[probs < cutoff] = 0
+
+        if temperature != 1.0:
+            probs = probs.pow(1.0 / temperature)
+
+        return torch.multinomial(probs, num_samples=1)[0]
+
+    tokenizer.sample_logits = _sample_logits.__get__(tokenizer, AutoTokenizer)
+
+    tokenizer._koboldai_header = []
+    tokenizer.add_bos_token = False
+    tokenizer.add_prefix_space = False
+
+    logger.info("[RWKV] Loaded :^)")
+    return model, tokenizer
 
 #==================================================================#
 #  Send text to generator and deal with output