From 69d942c00cfd16708f82826fcc0d50355e322c0f Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 11 May 2023 20:22:30 -0400 Subject: [PATCH] Kind of working breakmodel --- aiserver.py | 256 +----------------- koboldai_settings.py | 3 +- modeling/inference_models/generic_hf_torch.py | 7 +- modeling/inference_models/gooseai.py | 31 +++ modeling/inference_models/hf_mtj.py | 2 +- modeling/inference_models/openai.py | 168 +----------- modeling/inference_models/parents/hf.py | 35 ++- modeling/inference_models/parents/hf_torch.py | 27 +- .../parents/openai_gooseai.py | 189 +++++++++++++ static/koboldai.js | 6 + 10 files changed, 281 insertions(+), 443 deletions(-) create mode 100644 modeling/inference_models/gooseai.py create mode 100644 modeling/inference_models/parents/openai_gooseai.py diff --git a/aiserver.py b/aiserver.py index f9e60641..158a6699 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1473,7 +1473,7 @@ def general_startup(override_args=None): koboldai_vars.quiet = True if args.nobreakmodel: - koboldai_vars.nobreakmodel = True + model_loaders['generic_hf_torch'].nobreakmodel = True if args.remote: koboldai_vars.host = True; @@ -1484,6 +1484,9 @@ def general_startup(override_args=None): if args.localtunnel: koboldai_vars.host = True; + if args.lowmem: + model_loaders['generic_hf_torch'].low_mem = True + if args.host != "Disabled": # This means --host option was submitted without an argument # Enable all LAN IPs (0.0.0.0/0) @@ -1516,6 +1519,9 @@ def general_startup(override_args=None): koboldai_vars.trust_remote_code = True if args.cpu: koboldai_vars.use_colab_tpu = False + koboldai_vars.hascuda = False + koboldai_vars.usegpu = False + model_loaders['generic_hf_torch'].nobreakmodel = True koboldai_vars.smandelete = koboldai_vars.host == args.override_delete koboldai_vars.smanrename = koboldai_vars.host == args.override_rename @@ -1545,245 +1551,6 @@ def general_startup(override_args=None): socketio.start_background_task(socket_io_relay, koboldai_settings.queue, socketio) -#==================================================================# -# Load Model -#==================================================================# - -@socketio.on("get_model_info") -def get_model_info(model, directory=""): - logger.info("Selected: {}, {}".format(model, directory)) - # if the model is in the api list - disk_blocks = 0 - key = False - breakmodel = False - gpu = False - layer_count = None - key_value = "" - break_values = [] - url = False - default_url = None - models_on_url = False - multi_online_models = False - show_online_model_select=False - gpu_count = torch.cuda.device_count() - gpu_names = [] - send_horde_models = False - show_custom_model_box = False - for i in range(gpu_count): - gpu_names.append(torch.cuda.get_device_name(i)) - if model in ['Colab', 'API']: - url = True - elif model == 'CLUSTER': - models_on_url = True - show_online_model_select=True - url = True - key = True - default_url = koboldai_vars.horde_url - multi_online_models = True - key_value = koboldai_vars.horde_api_key - url = koboldai_vars.horde_url - if key_value: - send_horde_models = True - elif model in [x.name for x in model_menu['apilist']]: - show_online_model_select=True - if path.exists("settings/{}.v2_settings".format(model)): - with open("settings/{}.v2_settings".format(model), "r") as file: - # Check if API key exists - try: - js = json.load(file) - - if("apikey" in js and js["apikey"] != ""): - # API key exists, grab it and close the file - key_value = js["apikey"] - elif 'oaiapikey' in js and js['oaiapikey'] 
!= "": - key_value = js["oaiapikey"] - if model in ('GooseAI', 'OAI'): - get_oai_models({'model': model, 'key': key_value}) - except json.decoder.JSONDecodeError: - print(":(") - pass - key = True - elif "rwkv" in model.lower(): - pass - elif model == 'ReadOnly': - pass - #elif model == 'customhuggingface': - # show_custom_model_box = True - elif args.cpu: - pass - else: - layer_count = get_layer_count(model, directory=directory) - if layer_count is None: - breakmodel = False - gpu = True - else: - breakmodel = True - if model in ["NeoCustom", "GPT2Custom", "customhuggingface"]: - filename = "settings/{}.breakmodel".format(os.path.basename(os.path.normpath(directory))) - else: - filename = "settings/{}.breakmodel".format(model.replace("/", "_")) - if path.exists(filename): - with open(filename, "r") as file: - data = [x for x in file.read().split("\n")[:2] if x != ''] - if len(data) < 2: - data.append("0") - break_values, disk_blocks = data - break_values = break_values.split(",") - else: - break_values = [layer_count] - break_values = [int(x) for x in break_values if x != ''] - break_values += [0] * (gpu_count - len(break_values)) - emit('from_server', {'cmd': 'selected_model_info', 'key_value': key_value, 'key':key, 'multi_online_models': multi_online_models, 'default_url': default_url, - 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, - 'disk_break_value': disk_blocks, 'accelerate': True, - 'break_values': break_values, 'gpu_count': gpu_count, - 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, - 'show_custom_model_box': show_custom_model_box}, broadcast=True, room="UI_1") - emit('selected_model_info', {'key_value': key_value, 'key':key, - 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, 'multi_online_models': multi_online_models, 'default_url': default_url, - 'disk_break_value': disk_blocks, 'disk_break': True, - 'break_values': break_values, 'gpu_count': gpu_count, - 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, - 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, - 'show_custom_model_box': show_custom_model_box}) - if send_horde_models: - get_cluster_models({'key': key_value, 'url': default_url}) - elif key_value != "" and model in [x.name for x in model_menu['apilist']] and model != 'CLUSTER': - get_oai_models(key_value) - - - -def get_layer_count(model, directory=""): - if(model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): - if(model == "GPT2Custom"): - with open(os.path.join(directory, "config.json"), "r") as f: - model_config = json.load(f) - # Get the model_type from the config or assume a model type if it isn't present - else: - if(directory): - model = directory - from transformers import AutoConfig - if(os.path.isdir(model.replace('/', '_'))): - model_config = AutoConfig.from_pretrained(model.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") - elif(is_model_downloaded(model)): - model_config = AutoConfig.from_pretrained("models/{}".format(model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") - elif(os.path.isdir(directory)): - model_config = AutoConfig.from_pretrained(directory, revision=koboldai_vars.revision, cache_dir="cache") - elif(os.path.isdir(koboldai_vars.custmodpth.replace('/', '_'))): - model_config = AutoConfig.from_pretrained(koboldai_vars.custmodpth.replace('/', '_'), 
revision=koboldai_vars.revision, cache_dir="cache") - else: - model_config = AutoConfig.from_pretrained(model, revision=koboldai_vars.revision, cache_dir="cache") - try: - if (model_config.model_type != 'gpt2' or model_config.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not koboldai_vars.nobreakmodel: - return utils.num_layers(model_config) - else: - return None - except: - return None - else: - return None - -@socketio.on('OAI_Key_Update') -def get_oai_models(data): - key = data['key'] - model = data['model'] - koboldai_vars.oaiapikey = key - if model == 'OAI': - url = "https://api.openai.com/v1/engines" - elif model == 'GooseAI': - url = "https://api.goose.ai/v1/engines" - else: - return - - # Get list of models from OAI - logger.init("OAI Engines", status="Retrieving") - req = requests.get( - url, - headers = { - 'Authorization': 'Bearer '+key - } - ) - if(req.status_code == 200): - r = req.json() - engines = r["data"] - try: - engines = [[en["id"], "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")] for en in engines] - except: - logger.error(engines) - raise - - online_model = "" - changed=False - - #Save the key - if not path.exists("settings"): - # If the client settings file doesn't exist, create it - # Write API key to file - os.makedirs('settings', exist_ok=True) - if path.exists("settings/{}.v2_settings".format(model)): - with open("settings/{}.v2_settings".format(model), "r") as file: - js = json.load(file) - if 'online_model' in js: - online_model = js['online_model'] - if "apikey" in js: - if js['apikey'] != key: - changed=True - else: - js = {} - changed=True - - if changed: - with open("settings/{}.v2_settings".format(model), "w") as file: - js["apikey"] = key - file.write(json.dumps(js, indent=3)) - - logger.init_ok("OAI Engines", status="OK") - emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True, room="UI_1") - emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") - else: - # Something went wrong, print the message and quit since we can't initialize an engine - logger.init_err("OAI Engines", status="Failed") - logger.error(req.json()) - emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) - -@socketio.on("get_cluster_models") -def get_cluster_models(msg): - koboldai_vars.horde_api_key = msg['key'] or koboldai_vars.horde_api_key - url = msg['url'] or koboldai_vars.horde_url - koboldai_vars.horde_url = url - # Get list of models from public cluster - print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") - try: - req = requests.get(f"{url}/api/v2/status/models?type=text") - except: - logger.init_err("KAI Horde Models", status="Failed") - logger.error("Provided KoboldAI Horde URL unreachable") - emit('from_server', {'cmd': 'errmsg', 'data': "Provided KoboldAI Horde URL unreachable"}) - return - if not req.ok: - # Something went wrong, print the message and quit since we can't initialize an engine - logger.init_err("KAI Horde Models", status="Failed") - logger.error(req.json()) - emit('from_server', {'cmd': 'errmsg', 'data': req.json()}, room="UI_1") - return - - engines = req.json() - logger.debug(engines) - try: - engines = [[en["name"], en["name"]] for en in engines] - except: - logger.error(engines) - raise - logger.debug(engines) - - online_model = "" - savesettings() - - logger.init_ok("KAI Horde Models", status="OK") - - emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': 
online_model}, broadcast=True, room="UI_1") - emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") - def unload_model(): global model @@ -1845,7 +1612,6 @@ def load_model(plugin, initial_load=False): # loadmodelsettings() # loadsettings() logger.init("GPU support", status="Searching") - koboldai_vars.hascuda = torch.cuda.is_available() and not args.cpu koboldai_vars.bmsupported = ((koboldai_vars.model_type != 'gpt2') or koboldai_vars.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not koboldai_vars.nobreakmodel if(args.breakmodel is not None and args.breakmodel): logger.warning("--breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).") @@ -1861,12 +1627,7 @@ def load_model(plugin, initial_load=False): else: logger.init_warn("GPU support", status="Not Found") - if args.cpu: - koboldai_vars.usegpu = False - gpu_layers = None - disk_layers = None - koboldai_vars.breakmodel = False - elif koboldai_vars.hascuda: + if koboldai_vars.hascuda: if(koboldai_vars.bmsupported): koboldai_vars.usegpu = False koboldai_vars.breakmodel = True @@ -1879,6 +1640,7 @@ def load_model(plugin, initial_load=False): model = model_loaders[plugin] model.load(initial_load=initial_load) + logger.debug("Model Type: {}".format(koboldai_vars.model_type)) # TODO: Convert everywhere to use model.tokenizer if model: diff --git a/koboldai_settings.py b/koboldai_settings.py index d8416df2..e9562ffc 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -710,7 +710,6 @@ class model_settings(settings): self.modeldim = -1 # Embedding dimension of your model (e.g. it's 4096 for GPT-J-6B and 2560 for GPT-Neo-2.7B) self.sampler_order = [6, 0, 1, 2, 3, 4, 5] self.newlinemode = "n" - self.lazy_load = True # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage self.presets = [] # Holder for presets self.selected_preset = "" self.uid_presets = [] @@ -1236,7 +1235,7 @@ class system_settings(settings): self.corescript = "default.lua" # Filename of corescript to load self.gpu_device = 0 # Which PyTorch device to use when using pure GPU generation self.savedir = os.getcwd()+"\\stories" - self.hascuda = False # Whether torch has detected CUDA on the system + self.hascuda = torch.cuda.is_available() # Whether torch has detected CUDA on the system self.usegpu = False # Whether to launch pipeline with GPU support self.splist = [] self.spselect = "" # Temporary storage for soft prompt filename to load diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index d5cf6397..c228e2ee 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -30,7 +30,6 @@ class model_loader(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True - self.lazy_load = utils.koboldai_vars.lazy_load # Make model path the same as the model name to make this consistent # with the other loading method if it isn't a known model type. 
This @@ -69,12 +68,14 @@ class model_loader(HFTorchInferenceModel): # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors + logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) if ( self.lazy_load and utils.koboldai_vars.hascuda - and utils.koboldai_vars.breakmodel - and not utils.koboldai_vars.nobreakmodel + and self.breakmodel + and not self.nobreakmodel ): + logger.debug("loading breakmodel") self.breakmodel_device_config(self.model_config) if self.lazy_load: diff --git a/modeling/inference_models/gooseai.py b/modeling/inference_models/gooseai.py new file mode 100644 index 00000000..08d8ea06 --- /dev/null +++ b/modeling/inference_models/gooseai.py @@ -0,0 +1,31 @@ +import torch +import requests +import numpy as np +from typing import List, Optional, Union + +import utils +from logger import logger +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, +) + +from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader + + + +class OpenAIAPIError(Exception): + def __init__(self, error_type: str, error_message) -> None: + super().__init__(f"{error_type}: {error_message}") + + +class model_loader(openai_gooseai_model_loader): + """InferenceModel for interfacing with OpenAI's generation API.""" + + def __init__(self): + super().__init__() + self.url = "https://api.goose.ai/v1/engines" + + def is_valid(self, model_name, model_path, menu_path): + return model_name == "GooseAI" \ No newline at end of file diff --git a/modeling/inference_models/hf_mtj.py b/modeling/inference_models/hf_mtj.py index c99e9a05..759feb65 100644 --- a/modeling/inference_models/hf_mtj.py +++ b/modeling/inference_models/hf_mtj.py @@ -27,7 +27,7 @@ class model_loader(HFInferenceModel): #model_name: str, ) -> None: super().__init__() - + self.hf_torch = False self.model_config = None self.capabilties = ModelCapabilities( embedding_manipulation=False, diff --git a/modeling/inference_models/openai.py b/modeling/inference_models/openai.py index efbb01d3..cad2a7f2 100644 --- a/modeling/inference_models/openai.py +++ b/modeling/inference_models/openai.py @@ -11,6 +11,8 @@ from modeling.inference_model import ( InferenceModel, ) +from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader + class OpenAIAPIError(Exception): @@ -18,172 +20,12 @@ class OpenAIAPIError(Exception): super().__init__(f"{error_type}: {error_message}") -class model_loader(InferenceModel): +class model_loader(openai_gooseai_model_loader): """InferenceModel for interfacing with OpenAI's generation API.""" def __init__(self): super().__init__() - self.key = "" + self.url = "https://api.openai.com/v1/engines" def is_valid(self, model_name, model_path, menu_path): - return model_name == "OAI" or model_name == "GooseAI" - - def get_requested_parameters(self, model_name, model_path, menu_path): - self.source = model_name - requested_parameters = [] - requested_parameters.extend([{ - "uitype": "text", - "unit": "text", - "label": "Key", - "id": "key", - "default": "", - "check": {"value": "", 'check': "!="}, - "tooltip": "User Key to use when connecting to OpenAI/GooseAI.", - "menu_path": "", - "refresh_model_inputs": True, - "extra_classes": "" - }, - { - "uitype": "dropdown", - "unit": "text", - "label": "Model", - "id": "model", - 
"default": "", - "check": {"value": "", 'check': "!="}, - "tooltip": "Which model to use when running OpenAI/GooseAI.", - "menu_path": "", - "refresh_model_inputs": False, - "extra_classes": "", - 'children': self.get_oai_models(), - - }]) - return requested_parameters - - def set_input_parameters(self, parameters): - self.key = parameters['key'].strip() - self.model = parameters['model'] - - def get_oai_models(self): - if self.key == "": - return [] - if self.source == 'OAI': - url = "https://api.openai.com/v1/engines" - elif self.source == 'GooseAI': - url = "https://api.goose.ai/v1/engines" - else: - return - - # Get list of models from OAI - logger.init("OAI Engines", status="Retrieving") - req = requests.get( - url, - headers = { - 'Authorization': 'Bearer '+self.key - } - ) - if(req.status_code == 200): - r = req.json() - engines = r["data"] - try: - engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines] - except: - logger.error(engines) - raise - - online_model = "" - - - logger.init_ok("OAI Engines", status="OK") - return engines - else: - # Something went wrong, print the message and quit since we can't initialize an engine - logger.init_err("OAI Engines", status="Failed") - logger.error(req.json()) - emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) - return [] - - - def _load(self, save_model: bool, initial_load: bool) -> None: - self.tokenizer = self._get_tokenizer("gpt2") - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - seed: Optional[int] = None, - **kwargs, - ) -> GenerationResult: - - if seed is not None: - logger.warning( - "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored." - ) - - decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - utils.koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround - # as the koboldai_vars.model will always be OAI - if "GooseAI" in utils.koboldai_vars.configname: - reqdata = { - "prompt": decoded_prompt, - "max_tokens": max_new, - "temperature": gen_settings.temp, - "top_a": gen_settings.top_a, - "top_p": gen_settings.top_p, - "top_k": gen_settings.top_k, - "tfs": gen_settings.tfs, - "typical_p": gen_settings.typical, - "repetition_penalty": gen_settings.rep_pen, - "repetition_penalty_slope": gen_settings.rep_pen_slope, - "repetition_penalty_range": gen_settings.rep_pen_range, - "n": batch_count, - # TODO: Implement streaming - "stream": False, - } - else: - reqdata = { - "prompt": decoded_prompt, - "max_tokens": max_new, - "temperature": gen_settings.temp, - "top_p": gen_settings.top_p, - "frequency_penalty": gen_settings.rep_pen, - "n": batch_count, - "stream": False, - } - - req = requests.post( - utils.koboldai_vars.oaiurl, - json=reqdata, - headers={ - "Authorization": "Bearer " + utils.koboldai_vars.oaiapikey, - "Content-Type": "application/json", - }, - ) - - j = req.json() - - if not req.ok: - # Send error message to web client - if "error" in j: - error_type = j["error"]["type"] - error_message = j["error"]["message"] - else: - error_type = "Unknown" - error_message = "Unknown" - raise OpenAIAPIError(error_type, error_message) - - outputs = [out["text"] for out in j["choices"]] - return GenerationResult( - model=self, - out_batches=np.array([self.tokenizer.encode(x) for x in outputs]), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) + return model_name == "OAI" \ No newline at end of file diff --git a/modeling/inference_models/parents/hf.py b/modeling/inference_models/parents/hf.py index 1941a12e..c7a781d7 100644 --- a/modeling/inference_models/parents/hf.py +++ b/modeling/inference_models/parents/hf.py @@ -22,18 +22,19 @@ class HFInferenceModel(InferenceModel): def is_valid(self, model_name, model_path, menu_path): try: if model_path is not None and os.path.exists(model_path): - model_config = AutoConfig.from_pretrained(model_path) + self.model_config = AutoConfig.from_pretrained(model_path) elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))): - model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache") + self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache") else: - model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache") + self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache") return True except: return False def get_requested_parameters(self, model_name, model_path, menu_path): requested_parameters = [] - + if not self.hf_torch: + return [] if model_path is not None and os.path.exists(model_path): self.model_config = AutoConfig.from_pretrained(model_path) elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))): @@ -124,14 +125,20 @@ class HFInferenceModel(InferenceModel): return requested_parameters def set_input_parameters(self, parameters): - gpu_count = torch.cuda.device_count() - layers = [] - for i in range(gpu_count): - layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None) - self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' 
in parameters else None - self.layers = layers - self.disk_layers = parameters['disk_layers'] if 'disk_layers' in parameters else None - self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None + if self.hf_torch: + import breakmodel + gpu_count = torch.cuda.device_count() + layers = [] + for i in range(gpu_count): + layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None) + self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None + self.layers = layers + self.disk_layers = int(parameters['disk_layers']) if 'disk_layers' in parameters and parameters['disk_layers'].isnumeric() else 0 + breakmodel.gpu_blocks = layers + breakmodel.disk_blocks = self.disk_layers + self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None + self.model_type = self.get_model_type() + self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel self.model_name = parameters['id'] self.path = parameters['path'] if 'path' in parameters else None @@ -157,6 +164,10 @@ class HFInferenceModel(InferenceModel): torch.cuda.empty_cache() except: pass + if self.hf_torch: + breakmodel.breakmodel = True + breakmodel.gpu_blocks = [] + breakmodel.disk_blocks = 0 def _post_load(self) -> None: # These are model specific tokenizer overrides if a model has bad defaults diff --git a/modeling/inference_models/parents/hf_torch.py b/modeling/inference_models/parents/hf_torch.py index 7cc16ad5..84c60a6c 100644 --- a/modeling/inference_models/parents/hf_torch.py +++ b/modeling/inference_models/parents/hf_torch.py @@ -53,15 +53,12 @@ LOG_SAMPLER_NO_EFFECT = False class HFTorchInferenceModel(HFInferenceModel): - def __init__( - self, - #model_name: str, - #lazy_load: bool, - #low_mem: bool, - ) -> None: + def __init__(self) -> None: super().__init__() - #self.lazy_load = lazy_load - #self.low_mem = low_mem + self.hf_torch = True + self.lazy_load = True + self.low_mem = False + self.nobreakmodel = False self.post_token_hooks = [ PostTokenHooks.stream_tokens, @@ -398,7 +395,7 @@ class HFTorchInferenceModel(HFInferenceModel): Embedding._koboldai_patch_causallm_model = self.model def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): - if not utils.koboldai_vars.lazy_load: + if not self.lazy_load: return if utils.args.breakmodel_disklayers is not None: @@ -819,14 +816,14 @@ class HFTorchInferenceModel(HFInferenceModel): elif ( utils.args.breakmodel_gpulayers is not None or utils.args.breakmodel_disklayers is not None + or breakmodel.gpu_blocks != [] ): try: - if not utils.args.breakmodel_gpulayers: - breakmodel.gpu_blocks = [] - else: - breakmodel.gpu_blocks = list( - map(int, utils.args.breakmodel_gpulayers.split(",")) - ) + if breakmodel.gpu_blocks == []: + if utils.args.breakmodel_gpulayers: + breakmodel.gpu_blocks = list( + map(int, utils.args.breakmodel_gpulayers.split(",")) + ) assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() s = n_layers for i in range(len(breakmodel.gpu_blocks)): diff --git a/modeling/inference_models/parents/openai_gooseai.py b/modeling/inference_models/parents/openai_gooseai.py new file mode 100644 index 00000000..621ccbad --- /dev/null +++ b/modeling/inference_models/parents/openai_gooseai.py @@ -0,0 +1,189 @@ +import torch +import requests +import numpy as np +from typing import List, Optional, Union + +import utils +from logger import logger +from modeling.inference_model import ( + 
GenerationResult, + GenerationSettings, + InferenceModel, +) + + + +class OpenAIAPIError(Exception): + def __init__(self, error_type: str, error_message) -> None: + super().__init__(f"{error_type}: {error_message}") + + +class model_loader(InferenceModel): + """InferenceModel for interfacing with OpenAI's generation API.""" + + def __init__(self): + super().__init__() + self.key = "" + self.url = "https://api.goose.ai/v1/engines" + #if self.source == 'OAI': + # url = "https://api.openai.com/v1/engines" + #elif self.source == 'GooseAI': + # url = "https://api.goose.ai/v1/engines" + + def is_valid(self, model_name, model_path, menu_path): + return model_name == "OAI" or model_name == "GooseAI" + + def get_requested_parameters(self, model_name, model_path, menu_path): + self.source = model_name + requested_parameters = [] + requested_parameters.extend([{ + "uitype": "text", + "unit": "text", + "label": "Key", + "id": "key", + "default": "", + "check": {"value": "", 'check': "!="}, + "tooltip": "User Key to use when connecting to OpenAI/GooseAI.", + "menu_path": "", + "refresh_model_inputs": True, + "extra_classes": "" + }, + { + "uitype": "dropdown", + "unit": "text", + "label": "Model", + "id": "model", + "default": "", + "check": {"value": "", 'check': "!="}, + "tooltip": "Which model to use when running OpenAI/GooseAI.", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "", + 'children': self.get_oai_models(), + + }]) + return requested_parameters + + def set_input_parameters(self, parameters): + self.key = parameters['key'].strip() + self.model = parameters['model'] + + def get_oai_models(self): + if self.key == "": + return [] + + + # Get list of models from OAI + logger.init("OAI Engines", status="Retrieving") + req = requests.get( + self.url, + headers = { + 'Authorization': 'Bearer '+self.key + } + ) + if(req.status_code == 200): + r = req.json() + engines = r["data"] + try: + engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines] + except: + logger.error(engines) + raise + + online_model = "" + + + logger.init_ok("OAI Engines", status="OK") + return engines + else: + # Something went wrong, print the message and quit since we can't initialize an engine + logger.init_err("OAI Engines", status="Failed") + logger.error(req.json()) + emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) + return [] + + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.tokenizer = self._get_tokenizer("gpt2") + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + + if seed is not None: + logger.warning( + "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored." + ) + + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround + # as the koboldai_vars.model will always be OAI + if "GooseAI" in utils.koboldai_vars.configname: + reqdata = { + "prompt": decoded_prompt, + "max_tokens": max_new, + "temperature": gen_settings.temp, + "top_a": gen_settings.top_a, + "top_p": gen_settings.top_p, + "top_k": gen_settings.top_k, + "tfs": gen_settings.tfs, + "typical_p": gen_settings.typical, + "repetition_penalty": gen_settings.rep_pen, + "repetition_penalty_slope": gen_settings.rep_pen_slope, + "repetition_penalty_range": gen_settings.rep_pen_range, + "n": batch_count, + # TODO: Implement streaming + "stream": False, + } + else: + reqdata = { + "prompt": decoded_prompt, + "max_tokens": max_new, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "frequency_penalty": gen_settings.rep_pen, + "n": batch_count, + "stream": False, + } + + req = requests.post( + self.url, + json=reqdata, + headers={ + "Authorization": "Bearer " + self.key, + "Content-Type": "application/json", + }, + ) + + j = req.json() + + if not req.ok: + # Send error message to web client + if "error" in j: + error_type = j["error"]["type"] + error_message = j["error"]["message"] + else: + error_type = "Unknown" + error_message = "Unknown" + raise OpenAIAPIError(error_type, error_message) + + outputs = [out["text"] for out in j["choices"]] + return GenerationResult( + model=self, + out_batches=np.array([self.tokenizer.encode(x) for x in outputs]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) diff --git a/static/koboldai.js b/static/koboldai.js index 7f004ff2..ab7f7832 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1652,6 +1652,12 @@ function selected_model_info(data) { while (loadmodelsettings.firstChild) { loadmodelsettings.removeChild(loadmodelsettings.firstChild); } + //Clear out plugin selector + var model_plugin = document.getElementById('modelplugin'); + while (model_plugin.firstChild) { + model_plugin.removeChild(model_plugin.firstChild); + } + var accept = document.getElementById("btn_loadmodelaccept"); accept.disabled = false;
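
The core of this patch is pulling the OpenAI and GooseAI handling out of aiserver.py (get_model_info / get_oai_models) and into a shared parent loader in modeling/inference_models/parents/openai_gooseai.py; the concrete loaders in openai.py and gooseai.py then differ only in the engines URL and their is_valid() check, and load_model() simply dispatches to whichever entry in model_loaders matched. The sketch below condenses that pattern into a self-contained example; the class names OpenAIGooseAILoader, OpenAILoader and GooseAILoader are illustrative (each loader in the diff is literally named model_loader inside its own module), and the real classes derive from InferenceModel and report failures through the project's logger/emit helpers rather than returning silently.

import requests


class OpenAIGooseAILoader:
    """Shared behaviour for the two OpenAI-compatible "engines" endpoints."""

    def __init__(self):
        self.key = ""
        self.url = "https://api.goose.ai/v1/engines"   # children override this

    def is_valid(self, model_name, model_path, menu_path):
        return model_name in ("OAI", "GooseAI")

    def get_oai_models(self):
        # No key yet -> nothing to list (same early-out as the patched parent).
        if not self.key:
            return []
        req = requests.get(self.url, headers={"Authorization": "Bearer " + self.key})
        if req.status_code != 200:
            return []
        return [
            {"value": en["id"],
             "text": "{} ({})".format(en["id"], "Ready" if en.get("ready") else "Not Ready")}
            for en in req.json()["data"]
        ]


class OpenAILoader(OpenAIGooseAILoader):
    def __init__(self):
        super().__init__()
        self.url = "https://api.openai.com/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "OAI"


class GooseAILoader(OpenAIGooseAILoader):
    def __init__(self):
        super().__init__()
        self.url = "https://api.goose.ai/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "GooseAI"

Keeping is_valid() on each child is what lets the loader registry pick a backend from the menu name alone, which is why the old GooseAI branch inside openai.py could be deleted.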
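On the local-model side, HFInferenceModel.set_input_parameters (parents/hf.py) now writes the per-GPU layer split straight into breakmodel.gpu_blocks / breakmodel.disk_blocks and derives breakmodel eligibility from the model type itself instead of koboldai_vars. The snippet below is a rough, self-contained sketch of that parameter handling; apply_layer_parameters and the SimpleNamespace stand-in for the breakmodel module are hypothetical, and the eligibility expression is copied as written in the diff.

from types import SimpleNamespace

# Stand-in for the real breakmodel module; only the two fields the patch assigns.
breakmodel = SimpleNamespace(gpu_blocks=[], disk_blocks=0)

BREAKMODEL_TYPES = ("gpt_neo", "gptj", "xglm", "opt")


def apply_layer_parameters(parameters, gpu_count, model_type, nobreakmodel=False):
    # Per-GPU counts arrive as strings keyed "<n>_Layers"; non-numeric means unset.
    layers = []
    for i in range(gpu_count):
        raw = parameters.get("{}_Layers".format(i), "")
        layers.append(int(raw) if str(raw).isnumeric() else None)

    raw_disk = parameters.get("disk_layers", "")
    disk_layers = int(raw_disk) if str(raw_disk).isnumeric() else 0

    breakmodel.gpu_blocks = layers
    breakmodel.disk_blocks = disk_layers

    # Eligibility rule as written in the diff.
    use_breakmodel = ((model_type != "gpt2")
                      or model_type in BREAKMODEL_TYPES) and not nobreakmodel
    return layers, disk_layers, use_breakmodel


# Example: 28 layers on GPU 0, 4 on GPU 1, 2 blocks on disk for a gpt_neo model.
print(apply_layer_parameters({"0_Layers": "28", "1_Layers": "4", "disk_layers": "2"},
                             gpu_count=2, model_type="gpt_neo"))

generic_hf_torch.py then only calls breakmodel_device_config() when self.lazy_load, CUDA, self.breakmodel and not self.nobreakmodel all line up, which is exactly the condition the new debug line prints.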
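The hf_torch.py hunk also changes which layer split wins: if the UI already populated breakmodel.gpu_blocks, the --breakmodel_gpulayers command-line value is ignored, and the flag is only parsed as a fallback when the list is still empty. A minimal sketch of that precedence follows, assuming a hypothetical resolve_gpu_blocks helper and a bare layer count; the real logic lives in _get_lazy_load_callback / breakmodel_device_config and additionally accounts for disk layers and the primary device.

def resolve_gpu_blocks(ui_blocks, cli_gpulayers, n_layers, device_count):
    """UI-supplied blocks take precedence; the CLI flag is only a fallback."""
    gpu_blocks = list(ui_blocks)
    if gpu_blocks == [] and cli_gpulayers:
        gpu_blocks = list(map(int, cli_gpulayers.split(",")))
    assert len(gpu_blocks) <= device_count

    # Walk the list and never hand out more layers than remain, in the same
    # spirit as the accounting loop in the real device-config code.
    remaining = n_layers
    for i, blocks in enumerate(gpu_blocks):
        if blocks <= -1:                  # -1 conventionally means "all that is left"
            gpu_blocks[i] = remaining
            remaining = 0
        else:
            gpu_blocks[i] = min(blocks, remaining)
            remaining -= gpu_blocks[i]
    return gpu_blocks, remaining          # whatever is left stays on CPU/disk


print(resolve_gpu_blocks([],      "20,8", n_layers=32, device_count=2))  # CLI fallback
print(resolve_gpu_blocks([28, 4], "20,8", n_layers=32, device_count=2))  # UI wins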
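Finally, the request building that moved into the shared parent keeps the long-standing split between the two services: GooseAI's completions endpoint accepts KoboldAI's full sampler set, while plain OpenAI only receives temperature, top_p and a frequency penalty, and both now post to self.url with self.key instead of the old koboldai_vars fields. The helper below is a trimmed, hypothetical build_completion_payload with plain values standing in for the gen_settings object; the real _raw_generate picks the branch by checking whether "GooseAI" appears in utils.koboldai_vars.configname.

def build_completion_payload(prompt, max_new, s, batch_count, gooseai=False):
    """Payload shapes used by the patched _raw_generate (streaming is still a TODO)."""
    if gooseai:
        return {
            "prompt": prompt,
            "max_tokens": max_new,
            "temperature": s["temp"],
            "top_a": s["top_a"],
            "top_p": s["top_p"],
            "top_k": s["top_k"],
            "tfs": s["tfs"],
            "typical_p": s["typical"],
            "repetition_penalty": s["rep_pen"],
            "repetition_penalty_slope": s["rep_pen_slope"],
            "repetition_penalty_range": s["rep_pen_range"],
            "n": batch_count,
            "stream": False,
        }
    return {
        "prompt": prompt,
        "max_tokens": max_new,
        "temperature": s["temp"],
        "top_p": s["top_p"],
        "frequency_penalty": s["rep_pen"],
        "n": batch_count,
        "stream": False,
    }


settings = {"temp": 0.7, "top_a": 0.0, "top_p": 0.9, "top_k": 0, "tfs": 1.0,
            "typical": 1.0, "rep_pen": 1.1, "rep_pen_slope": 0.7, "rep_pen_range": 1024}
print(build_completion_payload("Once upon a time", 80, settings, batch_count=1, gooseai=True))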