prompt_tuner.py now uses lazy loader and accelerate

vfbd 2022-08-22 19:29:20 -04:00
parent 584056b6d5
commit 3d5c83fc23
3 changed files with 424 additions and 27 deletions

View File

@@ -429,6 +429,7 @@ def emit(*args, **kwargs):
return _emit(*args, **kwargs)
except AttributeError:
return socketio.emit(*args, **kwargs)
utils.emit = emit
# marshmallow/apispec setup
from apispec import APISpec
@@ -1311,6 +1312,8 @@ def general_startup(override_args=None):
args = parser.parse_args(shlex.split(os.environ["KOBOLDAI_ARGS"]))
else:
args = parser.parse_args()
utils.args = args
if args.customsettings:
f = open (args.customsettings)
@@ -1648,7 +1651,9 @@ def patch_transformers():
if not args.no_aria2:
utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
PreTrainedModel.from_pretrained = new_from_pretrained
if(not hasattr(PreTrainedModel, "_kai_patched")):
PreTrainedModel.from_pretrained = new_from_pretrained
PreTrainedModel._kai_patched = True
if(hasattr(modeling_utils, "get_checkpoint_shard_files")):
old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs):
@@ -2490,7 +2495,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
if not args.no_aria2:
utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
PreTrainedModel.from_pretrained = new_from_pretrained
if(not hasattr(PreTrainedModel, "_kai_patched")):
PreTrainedModel.from_pretrained = new_from_pretrained
PreTrainedModel._kai_patched = True
if(hasattr(modeling_utils, "get_checkpoint_shard_files")):
old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs):
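Both hunks above wrap the from_pretrained monkey-patch in a _kai_patched guard so that patch_transformers() and load_model() can run in the same process without wrapping the already-wrapped method a second time (which would make the aria2 download hook fire twice per load). A minimal sketch of the guard pattern, using a stand-in DummyModel class instead of transformers' PreTrainedModel:

class DummyModel:
    @classmethod
    def from_pretrained(cls, name, *args, **kwargs):
        return f"loaded {name}"

def patch_once():
    # Grab the plain function behind the original classmethod.
    old_from_pretrained = DummyModel.from_pretrained.__func__

    @classmethod
    def new_from_pretrained(cls, name, *args, **kwargs):
        print(f"download hook for {name}")  # stand-in for utils.aria2_hook
        return old_from_pretrained(cls, name, *args, **kwargs)

    # Without the guard, a second call would capture the first wrapper as
    # old_from_pretrained and the hook would run twice for every load.
    if not hasattr(DummyModel, "_kai_patched"):
        DummyModel.from_pretrained = new_from_pretrained
        DummyModel._kai_patched = True

patch_once()
patch_once()  # no-op thanks to the guard
print(DummyModel.from_pretrained("gpt2"))  # hook prints once, then "loaded gpt2"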

View File

@@ -15,15 +15,249 @@ import base64
import pickle
import hashlib
import itertools
import functools
import bisect
import eventlet
import packaging
import gc
import time
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F
from torch.nn import Embedding, CrossEntropyLoss
import transformers
from transformers import AutoTokenizer, GPT2TokenizerFast, AutoConfig
from transformers import __version__ as transformers_version
from transformers import AutoTokenizer, GPT2TokenizerFast, AutoConfig, AutoModelForCausalLM, GPTNeoForCausalLM, PreTrainedModel, modeling_utils, GPTNeoModel, GPTJModel
import accelerate
import accelerate.utils
from mkultra.tuning import GPTPromptTuningMixin, GPTNeoPromptTuningLM
from mkultra.soft_prompt import SoftPrompt
from typing import List, Optional, TextIO, Union
from typing import Dict, List, Optional, TextIO, Union
try:
from transformers import XGLMModel
except:
pass
try:
from transformers.models.opt.modeling_opt import OPTDecoder
except:
pass
import breakmodel
import torch_lazy_loader
import utils
USE_BREAKMODEL = True
class Send_to_socketio(object):
def write(self, bar):
print(bar, end="")
time.sleep(0.01)
try:
if utils.emit is not None:
utils.emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", "&nbsp;")}, broadcast=True)
except:
pass
def patch_transformers_download():
global transformers
import copy, requests, tqdm, time
class Send_to_socketio(object):
def write(self, bar):
bar = bar.replace("\r", "").replace("\n", "")
if bar != "":
try:
print(bar, end="\r")
if utils.emit is not None:
utils.emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", "&nbsp;")}, broadcast=True)
eventlet.sleep(seconds=0)
except:
pass
def http_get(
url: str,
temp_file: transformers.utils.hub.BinaryIO,
proxies=None,
resume_size=0,
headers: transformers.utils.hub.Optional[transformers.utils.hub.Dict[str, str]] = None,
file_name: transformers.utils.hub.Optional[str] = None,
):
"""
Download remote file. Do not gobble up errors.
"""
headers = copy.deepcopy(headers)
if resume_size > 0:
headers["Range"] = f"bytes={resume_size}-"
r = requests.get(url, stream=True, proxies=proxies, headers=headers)
transformers.utils.hub._raise_for_status(r)
content_length = r.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
# `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()`
# and can be set using `utils.logging.enable/disable_progress_bar()`
if url[-11:] != 'config.json':
progress = tqdm.tqdm(
unit="B",
unit_scale=True,
unit_divisor=1024,
total=total,
initial=resume_size,
desc=f"Downloading {file_name}" if file_name is not None else "Downloading",
file=Send_to_socketio(),
)
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
if url[-11:] != 'config.json':
progress.update(len(chunk))
temp_file.write(chunk)
if url[-11:] != 'config.json':
progress.close()
transformers.utils.hub.http_get = http_get
def patch_transformers():
global transformers
patch_transformers_download()
old_from_pretrained = PreTrainedModel.from_pretrained.__func__
@classmethod
def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
utils.num_shards = None
utils.current_shard = 0
utils.from_pretrained_model_name = pretrained_model_name_or_path
utils.from_pretrained_index_filename = None
utils.from_pretrained_kwargs = kwargs
utils.bar = None
if utils.args is None or not utils.args.no_aria2:
utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
if(not hasattr(PreTrainedModel, "_kai_patched")):
PreTrainedModel.from_pretrained = new_from_pretrained
PreTrainedModel._kai_patched = True
if(hasattr(modeling_utils, "get_checkpoint_shard_files")):
old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs):
utils.num_shards = utils.get_num_shards(index_filename)
utils.from_pretrained_index_filename = index_filename
return old_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs)
modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files
# Some versions of transformers 4.17.0.dev0 are affected by
# https://github.com/huggingface/transformers/issues/15736
# This is a workaround for those versions of transformers.
if(transformers_version == "4.17.0.dev0"):
try:
from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding
except ImportError:
pass
else:
@torch.no_grad()
def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0):
bsz, seq_len = inputs_embeds.size()[:-1]
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
).unsqueeze(0).expand(input_shape).contiguous()
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
XGLMSinusoidalPositionalEmbedding.forward = new_forward
# Fix a bug in OPTForCausalLM where self.lm_head is the wrong size
if(packaging.version.parse("4.19.0.dev0") <= packaging.version.parse(transformers_version) < packaging.version.parse("4.20.0")):
try:
from transformers import OPTForCausalLM, OPTModel
except ImportError:
pass
else:
# This is the same as the original __init__ but with
# config.hidden_size
# replaced with
# config.word_embed_proj_dim
def new_init(self, config):
super(OPTForCausalLM, self).__init__(config)
self.model = OPTModel(config)
self.lm_head = torch.nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
self.post_init()
OPTForCausalLM.__init__ = new_init
def move_model_to_devices(model, usegpu, gpu_device):
global generator
if(not utils.HAS_ACCELERATE and not USE_BREAKMODEL):
if(usegpu):
model = model.half().to(gpu_device)
else:
model = model.to('cpu').float()
generator = model.generate
return
import breakmodel
if(utils.HAS_ACCELERATE):
import accelerate.utils
for key, value in model.state_dict().items():
target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
if(value.dtype is not target_dtype):
accelerate.utils.set_module_tensor_to_device(model, key, target_dtype)
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
device_map = {}
for name in utils.layers_module_names:
layer = int(name.rsplit(".", 1)[1])
device = ("disk" if layer < disk_blocks else "cpu") if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
device_map[name] = device
for name in utils.get_missing_module_names(model, list(device_map.keys())):
device_map[name] = breakmodel.primary_device
breakmodel.dispatch_model_ex(model, device_map, main_device=breakmodel.primary_device, offload_buffers=True, offload_dir="accelerate-disk-cache")
gc.collect()
generator = model.generate
return
model.half()
gc.collect()
if(hasattr(model, "transformer")):
model.transformer.wte.to(breakmodel.primary_device)
model.transformer.ln_f.to(breakmodel.primary_device)
if(hasattr(model, 'lm_head')):
model.lm_head.to(breakmodel.primary_device)
if(hasattr(model.transformer, 'wpe')):
model.transformer.wpe.to(breakmodel.primary_device)
elif(not hasattr(model.model, "decoder")):
model.model.embed_tokens.to(breakmodel.primary_device)
model.model.layer_norm.to(breakmodel.primary_device)
model.lm_head.to(breakmodel.primary_device)
model.model.embed_positions.to(breakmodel.primary_device)
else:
model.model.decoder.embed_tokens.to(breakmodel.primary_device)
if(model.model.decoder.project_in is not None):
model.model.decoder.project_in.to(breakmodel.primary_device)
if(model.model.decoder.project_out is not None):
model.model.decoder.project_out.to(breakmodel.primary_device)
model.model.decoder.embed_positions.to(breakmodel.primary_device)
gc.collect()
GPTNeoModel.forward = breakmodel.new_forward_neo
if("GPTJModel" in globals()):
GPTJModel.forward = breakmodel.new_forward_neo # type: ignore
if("XGLMModel" in globals()):
XGLMModel.forward = breakmodel.new_forward_xglm # type: ignore
if("OPTDecoder" in globals()):
OPTDecoder.forward = breakmodel.new_forward_opt # type: ignore
generator = model.generate
if(hasattr(model, "transformer")):
breakmodel.move_hidden_layers(model.transformer)
elif(not hasattr(model.model, "decoder")):
breakmodel.move_hidden_layers(model.model, model.model.layers)
else:
breakmodel.move_hidden_layers(model.model.decoder, model.model.decoder.layers)
_PromptTuningPreTrainedModel = Union["UniversalPromptTuningMixin", GPTPromptTuningMixin, transformers.PreTrainedModel]
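The accelerate branch of move_model_to_devices above derives its device_map from nothing but layer indices: layers that do not fit on any GPU stay in RAM (the first disk_blocks of those are offloaded to disk), and the remaining layers are mapped to GPU ordinals by bisecting the running totals of gpu_blocks. A self-contained sketch of that arithmetic, with made-up layer counts:

import bisect
import itertools

def layer_device(layer, n_layers, disk_blocks, gpu_blocks):
    # Layers not assigned to any GPU live in RAM (CPU or disk offload).
    ram_blocks = n_layers - sum(gpu_blocks)
    cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
    if layer < ram_blocks:
        # The first disk_blocks of the RAM layers go to the disk cache.
        return "disk" if layer < disk_blocks else "cpu"
    # bisect_right finds the first GPU whose cumulative block count covers
    # the layer's position among the GPU-resident layers.
    return bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)

# Hypothetical split: 28 layers, 2 on disk, 4 on the CPU, 10 on GPU 0, 12 on GPU 1.
for layer in range(28):
    print(layer, layer_device(layer, 28, disk_blocks=2, gpu_blocks=[10, 12]))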
@@ -259,16 +493,20 @@ class TrainerBase(abc.ABC):
if "quiet" not in kwargs:
kwargs["quiet"] = self.quiet
raise ConfigurationError(msg, **kwargs)
def get_hf_checkpoint_metadata(self) -> bool:
def _get_model_config(self) -> transformers.configuration_utils.PretrainedConfig:
REVISION = None
params = {}
if(os.path.isdir(self.data.ckpt_path)):
model_config = AutoConfig.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
elif(os.path.isdir("models/{}".format(self.data.ckpt_path.replace('/', '_')))):
model_config = AutoConfig.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
else:
model_config = AutoConfig.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
return model_config
def get_hf_checkpoint_metadata(self) -> bool:
params = {}
model_config = self._get_model_config()
params["tokenizer_id"] = self.data.ckpt_path
tokenizer = get_tokenizer(self.data.ckpt_path)
params["newlinemode"] = params.get(
@@ -467,7 +705,17 @@ class TrainerBase(abc.ABC):
if isinstance(output_file, str):
f.close()
def train(self):
def train(
self,
breakmodel_primary_device: Optional[Union[str, int, torch.device]] = None,
breakmodel_gpulayers: Optional[List[int]] = None,
breakmodel_disklayers = 0,
):
if breakmodel_gpulayers is None:
breakmodel_gpulayers = []
if breakmodel_primary_device is None:
breakmodel_primary_device = 0 if breakmodel_gpulayers else "cpu"
if self.data.params is not None and "max_batch_size" not in self.data.params:
self.data.params["max_batch_size"] = 2048
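The new train() signature above takes the breakmodel layout as explicit arguments and fills in its own defaults. Pulled out into a standalone helper (the helper name is illustrative, not from the commit), the defaulting works like this:

from typing import List, Optional, Union
import torch

def resolve_breakmodel_args(
    breakmodel_primary_device: Optional[Union[str, int, torch.device]] = None,
    breakmodel_gpulayers: Optional[List[int]] = None,
    breakmodel_disklayers: int = 0,
):
    # Mirrors the defaulting at the top of train(): with no GPU layers the
    # primary device falls back to the CPU.
    if breakmodel_gpulayers is None:
        breakmodel_gpulayers = []
    if breakmodel_primary_device is None:
        breakmodel_primary_device = 0 if breakmodel_gpulayers else "cpu"
    return breakmodel_primary_device, breakmodel_gpulayers, breakmodel_disklayers

print(resolve_breakmodel_args())                              # ('cpu', [], 0)
print(resolve_breakmodel_args(breakmodel_gpulayers=[20, 8]))  # (0, [20, 8], 0)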
@@ -498,30 +746,169 @@ class TrainerBase(abc.ABC):
REVISION = None
tokenizer = self.get_tokenizer()
patch_transformers()
model: _PromptTuningPreTrainedModel
if(os.path.isdir(self.data.ckpt_path)):
model_config = self._get_model_config()
n_layers = utils.num_layers(model_config)
convert_to_float16 = True
hascuda = torch.cuda.is_available()
usegpu = not breakmodel_disklayers and len(breakmodel_gpulayers) == 1 and breakmodel_gpulayers[0] == n_layers
gpu_device = breakmodel_primary_device
breakmodel.disk_blocks = breakmodel_disklayers
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
def lazy_load_callback(model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]], f, **_):
if lazy_load_callback.nested:
return
lazy_load_callback.nested = True
device_map: Dict[str, Union[str, int]] = {}
@functools.lru_cache(maxsize=None)
def get_original_key(key):
return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
for key, value in model_dict.items():
original_key = get_original_key(key)
if isinstance(value, torch_lazy_loader.LazyTensor) and not any(original_key.startswith(n) for n in utils.layers_module_names):
device_map[key] = gpu_device if hascuda and usegpu else "cpu" if not hascuda or not USE_BREAKMODEL else breakmodel.primary_device
else:
layer = int(max((n for n in utils.layers_module_names if original_key.startswith(n)), key=len).rsplit(".", 1)[1])
device = gpu_device if hascuda and usegpu else "disk" if layer < disk_blocks and layer < ram_blocks else "cpu" if not hascuda or not USE_BREAKMODEL else "shared" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
device_map[key] = device
if utils.num_shards is None or utils.current_shard == 0:
utils.offload_index = {}
if utils.HAS_ACCELERATE:
if os.path.isdir("accelerate-disk-cache"):
# Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
# (the folder doesn't contain any subfolders so os.remove will do just fine)
for filename in os.listdir("accelerate-disk-cache"):
try:
os.remove(os.path.join("accelerate-disk-cache", filename))
except OSError:
pass
os.makedirs("accelerate-disk-cache", exist_ok=True)
if utils.num_shards is not None:
num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
else:
num_tensors = len(device_map)
print(flush=True)
utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio())
with zipfile.ZipFile(f, "r") as z:
try:
last_storage_key = None
f = None
current_offset = 0
able_to_pin_layers = True
if utils.num_shards is not None:
utils.current_shard += 1
for key in sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
storage_key = model_dict[key].key
if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset:
last_storage_key = storage_key
if isinstance(f, zipfile.ZipExtFile):
f.close()
f = z.open(f"archive/data/{storage_key}")
current_offset = 0
if current_offset != model_dict[key].seek_offset:
f.read(model_dict[key].seek_offset - current_offset)
current_offset = model_dict[key].seek_offset
device = device_map[key]
size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1)
dtype = model_dict[key].dtype
nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
#print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
# if model_dict[key].dtype is torch.float32:
# fp32_model = True
if convert_to_float16 and breakmodel.primary_device != "cpu" and hascuda and (USE_BREAKMODEL or usegpu) and model_dict[key].dtype is torch.float32:
model_dict[key] = model_dict[key].to(torch.float16)
if breakmodel.primary_device == "cpu" or (not usegpu and not USE_BREAKMODEL and model_dict[key].dtype is torch.float16):
model_dict[key] = model_dict[key].to(torch.float32)
if device == "shared":
model_dict[key] = model_dict[key].to("cpu").detach_()
if able_to_pin_layers and utils.HAS_ACCELERATE:
try:
model_dict[key] = model_dict[key].pin_memory()
except:
able_to_pin_layers = False
elif device == "disk":
accelerate.utils.offload_weight(model_dict[key], get_original_key(key), "accelerate-disk-cache", index=utils.offload_index)
model_dict[key] = model_dict[key].to("meta")
else:
model_dict[key] = model_dict[key].to(device)
#print("OK", flush=True)
current_offset += nbytes
utils.bar.update(1)
finally:
if utils.num_shards is None or utils.current_shard >= utils.num_shards:
if utils.offload_index:
for name, tensor in utils.named_buffers:
if name not in utils.offload_index:
accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
utils.bar.close()
utils.bar = None
lazy_load_callback.nested = False
if isinstance(f, zipfile.ZipExtFile):
f.close()
lazy_load_callback.nested = False
# Since we're using lazy loader, we need to figure out what the model's hidden layers are called
with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True):
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
metamodel = AutoModelForCausalLM.from_config(model_config)
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
elif(os.path.isdir("models/{}".format(self.data.ckpt_path.replace('/', '_')))):
try:
model = AutoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
metamodel = GPTNeoForCausalLM.from_config(model_config)
utils.layers_module_names = utils.get_layers_module_names(metamodel)
utils.module_names = list(metamodel.state_dict().keys())
utils.named_buffers = list(metamodel.named_buffers(recurse=True))
with torch_lazy_loader.use_lazy_torch_load(callback=lazy_load_callback, dematerialized_modules=True):
if(os.path.isdir(self.data.ckpt_path)):
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
elif(os.path.isdir("models/{}".format(self.data.ckpt_path.replace('/', '_')))):
try:
model = AutoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
else:
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
if(hascuda):
if(usegpu):
model = model.half().to(gpu_device)
elif(breakmodel): # Use both RAM and VRAM (breakmodel)
move_model_to_devices(model, usegpu, gpu_device)
elif(__import__("breakmodel").disk_blocks > 0):
move_model_to_devices(model, usegpu, gpu_device)
else:
model = model.to('cpu').float()
elif(__import__("breakmodel").disk_blocks > 0):
move_model_to_devices(model, usegpu, gpu_device)
else:
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
model.to('cpu').float()
if step == 0:
soft_embeddings = self.get_initial_soft_embeddings(model)
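The lazy-loading block above works in two passes: it first instantiates the model with empty (meta) weights purely to learn the module layout, records utils.layers_module_names, utils.module_names and utils.named_buffers from that metamodel, and only then performs the real load with lazy_load_callback routing every tensor to its target device. A stripped-down sketch of the same two-pass idea, assuming accelerate is installed; the model name is only an example, and the regex matches GPT-Neo's layer naming where the commit instead uses utils.get_layers_module_names to cover several architectures:

import re
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

MODEL = "EleutherAI/gpt-neo-125M"  # example; the trainer uses self.data.ckpt_path
config = AutoConfig.from_pretrained(MODEL, cache_dir="cache")

# Pass 1: a weightless "meta" model, built only to enumerate modules and layers.
with init_empty_weights():
    metamodel = AutoModelForCausalLM.from_config(config)
module_names = list(metamodel.state_dict().keys())
layer_names = [n for n, _ in metamodel.named_modules()
               if re.fullmatch(r"transformer\.h\.\d+", n)]
print(f"{len(module_names)} tensors, {len(layer_names)} hidden layers")

# Pass 2: the real load.  In the commit this runs inside
# torch_lazy_loader.use_lazy_torch_load(callback=lazy_load_callback, ...), which
# materializes each tensor straight onto the device chosen for its module.
model = AutoModelForCausalLM.from_pretrained(MODEL, cache_dir="cache")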

View File

@@ -22,6 +22,7 @@ except ImportError:
HAS_ACCELERATE = False
vars = None
args = None
num_shards: Optional[int] = None
current_shard = 0
from_pretrained_model_name = ""
@@ -35,6 +36,8 @@ named_buffers: Optional[List[tuple]] = None
default_sampler_order = [0, 1, 2, 3, 4, 5]
emit = None
#==================================================================#
# Decorator to prevent a function's actions from being run until
# at least x seconds have passed without the function being called
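The hunk ends just before the decorator that this comment describes. A minimal sketch of such a debounce decorator, built on threading.Timer; this is an illustration of the idea, not the implementation that follows in utils.py:

import functools
import threading

def debounce(wait):
    """Delay a function until `wait` seconds pass without it being called again."""
    def decorator(fn):
        timer = None
        lock = threading.Lock()

        @functools.wraps(fn)
        def debounced(*args, **kwargs):
            nonlocal timer
            with lock:
                if timer is not None:
                    timer.cancel()  # every new call restarts the countdown
                timer = threading.Timer(wait, fn, args, kwargs)
                timer.start()
        return debounced
    return decorator

@debounce(2)
def save_settings():
    print("settings written to disk")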