from __future__ import annotations

import os
import json
import traceback

from transformers import GPT2LMHeadModel, GPT2Tokenizer

import utils
from modeling.inference_models.hf_torch import HFTorchInferenceModel


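# Inference model for custom, locally stored GPT-2 checkpoints, loaded directly
# with Hugging Face's GPT2LMHeadModel and GPT2Tokenizer classes.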
class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.lazy_load = False

        model_path = None

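        # Resolve the model directory: try the user-supplied path first, then
        # the same path under the local "models" folder.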
        for possible_config_path in [
            utils.koboldai_vars.custmodpth,
            os.path.join("models", utils.koboldai_vars.custmodpth),
        ]:
            try:
                with open(
                    os.path.join(possible_config_path, "config.json"), "r"
                ) as file:
                    self.model_config = json.load(file)
                model_path = possible_config_path
                break
            except FileNotFoundError:
                pass

        if not model_path:
            raise RuntimeError("Empty model_path!")

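        # Load the model and tokenizer from the custom path; _maybe_use_float16
        # may switch the default dtype to float16, and GPU out-of-memory errors
        # are re-raised with a clearer message.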
        with self._maybe_use_float16():
            try:
                self.model = GPT2LMHeadModel.from_pretrained(
                    utils.koboldai_vars.custmodpth,
                    revision=utils.koboldai_vars.revision,
                    cache_dir="cache",
                )
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    utils.koboldai_vars.custmodpth,
                    revision=utils.koboldai_vars.revision,
                    cache_dir="cache",
                )
            except Exception as e:
                if "out of memory" in traceback.format_exc().lower():
                    raise RuntimeError(
                        "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                    ) from e
                raise e

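        # Optionally re-save the freshly loaded weights and tokenizer to the
        # local model folder, sharded so no single file exceeds 500 MiB.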
        if save_model:
            self.model.save_pretrained(
                self.get_local_model_path(ignore_existance=True),
                max_shard_size="500MiB",
            )
            self.tokenizer.save_pretrained(
                self.get_local_model_path(ignore_existance=True)
            )

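        # Record the model's hidden size so the rest of KoboldAI knows the
        # model dimension.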
        utils.koboldai_vars.modeldim = self.get_hidden_size()

        # Is CUDA available? If so, use GPU, otherwise fall back to CPU
        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu:
            self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
        else:
            self.model = self.model.to("cpu").float()

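        # Apply KoboldAI's runtime patches to the loaded causal LM.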
        self.patch_causal_lm()