From fa443487e3d3910552c91eb536eeed2722c3722b Mon Sep 17 00:00:00 2001
From: somebody
Date: Sat, 24 Sep 2022 20:46:52 -0500
Subject: [PATCH] RWKV Work

---
 .gitignore  |   5 ++
 aiserver.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 188 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 070fa833..f49f9a7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,11 @@ flask_session
 accelerate-disk-cache
 .ipynb_checkpoints
 
+# Temporary until HF port
+!models/RWKV4
+models/RWKV4/20B_tokenizer.json
+models/RWKV4/models
+
 # Ignore PyCharm project files.
 .idea
 
diff --git a/aiserver.py b/aiserver.py
index e2dd6905..7083dd74 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -140,6 +140,7 @@ model_menu = {
         ["Untuned Fairseq Dense", "fsdlist", "", True],
         ["Untuned Bloom", "bloomlist", "", True],
         ["Untuned XGLM", "xglmlist", "", True],
+        ["Untuned RWKV-4", "rwkvlist", "", True],
         ["Untuned GPT2", "gpt2list", "", True],
         ["Online Services", "apilist", "", True],
         ["Read Only (No AI)", "ReadOnly", "", False]
@@ -244,6 +245,19 @@ model_menu = {
         ["XGLM 564M", "facebook/xglm-564M", "4GB", False],
         ["Return to Main Menu", "mainmenu", "", True],
     ],
+    'rwkvlist': [
+        ["RWKV-4 7B (GPU)", "RWKV-7B-GPU", "??GB", False],
+        ["RWKV-4 7B (CPU)", "RWKV-7B-CPU", "??GB", False],
+        ["RWKV-4 3B (GPU)", "RWKV-3B-GPU", "?GB", False],
+        ["RWKV-4 3B (CPU)", "RWKV-3B-CPU", "?GB", False],
+        ["RWKV-4 1.5B (GPU)", "RWKV-1B5-GPU", "9GB", False],
+        ["RWKV-4 1.5B (CPU)", "RWKV-1B5-CPU", "6GB", False],
+        ["RWKV-4 430M (GPU)", "RWKV-430M-GPU", "?GB", False],
+        ["RWKV-4 430M (CPU)", "RWKV-430M-CPU", "?GB", False],
+        ["RWKV-4 169M (GPU)", "RWKV-169M-GPU", "?GB", False],
+        ["RWKV-4 169M (CPU)", "RWKV-169M-CPU", "?GB", False],
+        ["Return to Main Menu", "mainmenu", "", True],
+    ],
     'apilist': [
         ["GooseAI API (requires API key)", "GooseAI", "", False],
         ["OpenAI API (requires API key)", "OAI", "", False],
@@ -1464,6 +1478,8 @@ def get_model_info(model, directory=""):
             print(":(")
             pass
         key = True
+    elif model.startswith("RWKV"):
+        pass
     elif model == 'ReadOnly':
         pass
     elif not utils.HAS_ACCELERATE and not torch.cuda.is_available():
@@ -2351,7 +2367,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
 
     # If transformers model was selected & GPU available, ask to use CPU or GPU
-    if(koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"] and not koboldai_vars.model.startswith("RWKV")):
         koboldai_vars.allowsp = True
         # Test for GPU support
@@ -2443,7 +2459,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         koboldai_vars.noai = True
 
     # Start transformers and create pipeline
-    if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if koboldai_vars.model.startswith("RWKV"):
+        _, model_class, device = koboldai_vars.model.split("-")
+        model, tokenizer = rwkv_init(
+            model_class=model_class,
+            use_gpu=(device == "GPU")
+        )
+
+        global breakmodel
+        import breakmodel
+    elif (not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
         if(not koboldai_vars.noai):
             logger.init("Transformers", status='Starting')
             for m in ("GPTJModel", "XGLMModel"):
@@ -5001,7 +5026,13 @@ def core_generate(text: list, min: int, max: int, found_entries: set):
 
             genout = result.encoded
             already_generated += len(genout[0]) - 1
-            assert already_generated <= koboldai_vars.genamt
+
+            try:
+                assert already_generated <= koboldai_vars.genamt
+            except AssertionError:
+                print("AlreadyGenerated", already_generated)
+                print("genamt", koboldai_vars.genamt)
+                raise
 
             if result.is_whole_generation:
                 break
@@ -5165,6 +5196,16 @@ def raw_generate(
         return GenerationResult(
             out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True
         )
+    elif koboldai_vars.model.startswith("RWKV"):
+        batch_encoded = rwkv_raw_generate(
+            prompt_tokens=prompt_tokens,
+            max_new=max_new,
+            batch_count=batch_count,
+            gen_settings=gen_settings
+        )
+        return GenerationResult(
+            out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True, output_includes_prompt=True
+        )
 
     # Torch HF
     batch_encoded = torch_raw_generate(
@@ -5555,6 +5596,145 @@ def api_raw_generate(
 
     genout = [obj["text"] for obj in js["results"]]
     return np.array([tokenizer.encode(x) for x in genout])
 
+
+def rwkv_raw_generate(
+    prompt_tokens: List[int],
+    max_new: int,
+    batch_count: int,
+    gen_settings: GenerationSettings,
+):
+    import types
+
+    model.clear()
+    context = list(prompt_tokens)
+
+    input_length = len(prompt_tokens)
+
+    # TODO: Not needed every run? I think this is creating that huge wait time
+    # between generations.
+    init_state = types.SimpleNamespace()
+    for i in range(input_length):
+        x = context[:i+1]
+        if i == input_length - 1:
+            init_state.out = model.run(x)
+        else:
+            model.run(x)
+    model.save(init_state)
+
+    for ni, i in enumerate(range(input_length, input_length + max_new)):
+        x = context[:i+1]
+        x = x[-model.ctx_len:]
+
+        if i == input_length:
+            out = copy.deepcopy(init_state.out)
+        else:
+            out = model.run(x)
+
+        # Don't generate EOS
+        out[0] = -9999999
+
+        char = tokenizer.sample_logits(
+            out=out,
+            x=x,
+            ctx_len=model.ctx_len,
+            temperature=gen_settings.temp,
+            top_p=gen_settings.top_p,
+        )
+        char = char.item()
+        context.append(char)
+
+        if koboldai_vars.output_streaming:
+            koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(char))])
+
+        # HACK
+        if ni > max_new:
+            break
+
+    return np.array([context])
+
+
+@dataclass
+class RWKVConfig:
+    n_layer: int
+    n_embed: int
+    ctx_len: int
+
+
+def rwkv_init(model_class: str, use_gpu: bool = False):
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    os.environ["RWKV_FLOAT_MODE"] = "bf16"
+
+    logger.info("[RWKV] RWKV support is in super-duper-uber-schmoober alpha and will ignore many options.")
+
+    device = "cpu"
+
+    if use_gpu:
+        logger.warning("[RWKV] Using GPU. This may not work out of the box and may require significant setup.")
+        device = "cuda"
+
+    os.environ["RWKV_RUN_DEVICE"] = device
+
+    TOKENIZER_PATH = "models/RWKV4/20B_tokenizer.json"
+    MODEL_DIR = "models/RWKV4/models"
+
+    model_files = os.listdir(MODEL_DIR)
+    matching_models = [f for f in model_files if f.startswith(f"RWKV-4-Pile-{model_class}")]
+    if not matching_models:
+        raise RuntimeError(f"No models of class '{model_class}' found in '{MODEL_DIR}'. Did you rename the model?")
+    model_path = os.path.join(MODEL_DIR, sorted(matching_models)[-1])
+
+    model_config = {
+        "169M": RWKVConfig(n_layer=12, n_embed=768, ctx_len=1024),
+        "430M": RWKVConfig(n_layer=24, n_embed=1024, ctx_len=1024),
+        "1B5": RWKVConfig(n_layer=24, n_embed=2048, ctx_len=1024),
+        "3B": RWKVConfig(n_layer=32, n_embed=2560, ctx_len=1024),
+        "7B": RWKVConfig(n_layer=32, n_embed=4096, ctx_len=1024),
+    }.get(model_class)
+
+    if not model_config:
+        raise RuntimeError(f"No config for model '{model_class}' found!")
+
+    if not os.path.exists(TOKENIZER_PATH):
+        raise RuntimeError(f"Can't find tokenizer at '{TOKENIZER_PATH}'. Did you download it and put it at that location?")
+
+    # Model stuff
+    from models.RWKV4.src.model_run import RWKV_RNN
+    from transformers import PreTrainedTokenizerFast
+    from torch.nn import functional as F
+
+    model = RWKV_RNN(
+        model_path.split(".")[0],
+        device,
+        "RWKV",
+        model_config.n_layer,
+        model_config.n_embed,
+        model_config.ctx_len,
+    )
+    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_PATH)
+
+    # We'll just patch tokenizer ourselves to make it easier
+    def _sample_logits(self, out, x, ctx_len, temperature, top_p):
+        last_char = int(x[-1])
+        probs = F.softmax(torch.tensor(out), dim=-1)
+        sorted_probs, s_index = torch.sort(probs, descending=True)
+
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()
+        cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])
+
+        probs[probs < cutoff] = 0
+
+        if temperature != 1.0:
+            probs = probs.pow(1.0 / temperature)
+
+        return torch.multinomial(probs, num_samples=1)[0]
+
+    tokenizer.sample_logits = _sample_logits.__get__(tokenizer, AutoTokenizer)
+
+    tokenizer._koboldai_header = []
+    tokenizer.add_bos_token = False
+    tokenizer.add_prefix_space = False
+
+    logger.info("[RWKV] Loaded :^)")
+    return model, tokenizer
 
 #==================================================================#
 #  Send text to generator and deal with output