Mirror of https://github.com/KoboldAI/KoboldAI-Client.git, synced 2025-06-05 21:59:24 +02:00
Model: Fix TPU
@@ -330,6 +330,11 @@ class InferenceModel:
                # Real max length is handled by CoreStopper.
                bypass_hf_maxlength=utils.koboldai_vars.dynamicscan,
                is_core=True,
                tpu_dynamic_inference=utils.koboldai_vars.dynamicscan
                or (
                    not utils.koboldai_vars.nogenmod
                    and utils.koboldai_vars.has_genmod
                ),
            )
            logger.debug(
                "core_generate: run raw_generate pass {} {}s".format(
@@ -473,6 +478,7 @@ class InferenceModel:
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs,
    ) -> GenerationResult:
        """Lowest level model-agnostic generation function. To be overridden by model implementation.
@@ -501,6 +507,8 @@ class InferenceModel:
        is_core: bool = False,
        single_line: bool = False,
        found_entries: set = (),
        tpu_dynamic_inference: bool = False,
        **kwargs,
    ) -> GenerationResult:
        """A wrapper around `_raw_generate()` that handles gen_state and other stuff. Use this to generate text outside of the story.
@@ -563,6 +571,7 @@ class InferenceModel:
            batch_count=batch_count,
            gen_settings=gen_settings,
            single_line=single_line,
            tpu_dynamic_inference=tpu_dynamic_inference,
        )

        time_end = round(time.time() - time_start, 2)
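The hunks above thread the new tpu_dynamic_inference flag from core_generate through raw_generate and down into each backend's _raw_generate via **kwargs. A minimal sketch of that pass-through pattern, using a simplified stand-in class rather than the real KoboldAI signatures:

    class InferenceModelSketch:
        def raw_generate(self, prompt_tokens, max_new, tpu_dynamic_inference=False, **kwargs):
            # Wrapper: shared bookkeeping happens here, then every flag is
            # forwarded so backend-specific options reach the right override.
            return self._raw_generate(
                prompt_tokens,
                max_new=max_new,
                tpu_dynamic_inference=tpu_dynamic_inference,
                **kwargs,
            )

        def _raw_generate(self, prompt_tokens, max_new, **kwargs):
            # Backends that don't care about a flag simply ignore it via **kwargs;
            # the TPU backend reads kwargs.get("tpu_dynamic_inference", False).
            raise NotImplementedError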
@@ -35,6 +35,7 @@ class APIInferenceModel(InferenceModel):
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs
    ):
        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
@@ -29,6 +29,7 @@ class ColabInferenceModel(InferenceModel):
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs
    ):
        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
@@ -9,10 +9,16 @@ from typing import Union
from transformers import AutoModelForCausalLM, GPTNeoForCausalLM

import utils
import breakmodel
import torch_lazy_loader
import koboldai_settings

try:
    import breakmodel
except ModuleNotFoundError as e:
    # Breakmodel is only expected to work on GPU
    if not utils.koboldai_vars.use_colab_tpu:
        raise e

from modeling.inference_models.hf_torch import HFTorchInferenceModel
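The guarded import above keeps breakmodel (GPU-only layer splitting) optional: a missing module is only fatal when the TPU code path is not in use. The same pattern in isolation, with a plain use_colab_tpu variable standing in for utils.koboldai_vars.use_colab_tpu:

    use_colab_tpu = True  # stand-in for utils.koboldai_vars.use_colab_tpu

    try:
        import breakmodel  # GPU layer splitting; may be absent on TPU setups
    except ModuleNotFoundError as e:
        if not use_colab_tpu:
            # On a GPU install the module is required, so re-raise.
            raise e
        # On TPU, continue without breakmodel.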
@@ -10,7 +10,11 @@ import utils
import koboldai_settings
from logger import logger, Colors

from modeling.inference_model import ModelCapabilities
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    ModelCapabilities,
)
from modeling.inference_models.hf import HFInferenceModel

try:
@@ -257,9 +261,14 @@ class HFMTJInferenceModel(HFInferenceModel):
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs
    ) -> GenerationResult:
        soft_tokens = self.get_soft_tokens()

        dynamic_inference = kwargs.get("tpu_dynamic_inference", False)
        print(f"DYNAMIC_INFERENCE={dynamic_inference} KWARGS={kwargs}")

        if not dynamic_inference:
            genout = tpool.execute(
                tpu_mtj_backend.infer_static,
                np.uint32(prompt_tokens),
@@ -279,6 +288,21 @@ class HFMTJInferenceModel(HFInferenceModel):
                sampler_order=gen_settings.sampler_order,
            )
            genout = np.array(genout)
        else:
            genout = tpool.execute(
                tpu_mtj_backend.infer_dynamic,
                context=np.uint32(prompt_tokens),
                numseqs=batch_count,
                gen_len=max_new,
                soft_embeddings=utils.koboldai_vars.sp,
                soft_tokens=soft_tokens,
                # TODO: Fix Dynamic WI on TPU
                excluded_world_info=set(),
                use_callback=True
            )
            print(genout)
            print(type(genout))
            genout = np.array(genout)

        return GenerationResult(
            self,
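Taken together, the two HFMTJInferenceModel hunks make _raw_generate branch on the tpu_dynamic_inference kwarg: tpu_mtj_backend.infer_static stays the default, and infer_dynamic (which supports mid-generation callbacks) runs when dynamic scanning or generation modifiers are active. A hedged sketch of that dispatch, with stub functions in place of tpu_mtj_backend and tpool, which only exist inside a KoboldAI TPU runtime:

    import numpy as np

    # Stubs standing in for tpu_mtj_backend.infer_static / infer_dynamic;
    # they only echo the input shape here.
    def infer_static(context, **kwargs):
        return [context.tolist()]

    def infer_dynamic(context, numseqs=1, use_callback=True, **kwargs):
        return [context.tolist() for _ in range(numseqs)]

    def tpu_generate(prompt_tokens, batch_count=1, **kwargs):
        # Same dispatch shape as the hunk above, minus eventlet's tpool.
        dynamic_inference = kwargs.get("tpu_dynamic_inference", False)
        context = np.uint32(prompt_tokens)
        if not dynamic_inference:
            genout = infer_static(context)
        else:
            # The dynamic path can react mid-generation via callbacks
            # (e.g. for dynamic world-info scans).
            genout = infer_dynamic(context, numseqs=batch_count, use_callback=True)
        return np.array(genout)

    print(tpu_generate([1, 2, 3], batch_count=2, tpu_dynamic_inference=True))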
@@ -448,6 +448,7 @@ class HFTorchInferenceModel(HFInferenceModel):
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs
    ) -> GenerationResult:
        if not isinstance(prompt_tokens, torch.Tensor):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
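For reference, the [None] indexing in the torch hunk above adds a leading batch dimension to the prompt before generation; the token IDs below are arbitrary illustration values:

    import torch

    prompt_tokens = [464, 3290, 318]  # arbitrary example token IDs
    gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
    print(gen_in.shape)  # torch.Size([1, 3]) -- shape (batch, seq_len)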
@@ -35,6 +35,7 @@ class HordeInferenceModel(InferenceModel):
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs
    ) -> GenerationResult:
        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
@@ -28,6 +28,7 @@ class OpenAIAPIInferenceModel(InferenceModel):
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        **kwargs
    ) -> GenerationResult:
        # Taken mainly from oairequest()