diff --git a/.gitmodules b/.gitmodules
index 0107a8c3..c6f4b308 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,7 @@
 [submodule "KoboldAI-Horde-Bridge"]
 	path = KoboldAI-Horde-Bridge
 	url = https://github.com/db0/KoboldAI-Horde-Bridge
+[submodule "repos/gptq"]
+	path = repos/gptq
+	url = https://github.com/0cc4m/GPTQ-for-LLaMa
+	branch = a8303654c200c25577130466e5f9bc1e70fc8a50
diff --git a/README.md b/README.md
index 789b78d1..0657fa0b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,36 @@
+## This is a fork of KoboldAI that adds support for 4-bit GPTQ quantized models, including LLaMA.
+
+### Install/Use Guide
+(This guide covers both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use.)
+
+In the command prompt/command line, navigate to where you want the KoboldAI subfolder to be created.
+
+`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
+
+`cd KoboldAI`
+
+Next step: run the installer. (On Windows it will ask for subfolder mode or the B: drive option; either choice works.)
+
+[if on Windows] `install_requirements.bat` (if the window closes when it finishes, reopen a command prompt and navigate back to your KoboldAI directory)
+
+[if on Linux] `install_requirements.sh`
+
+If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder).
+
+Put your 4-bit quantized .pt or .safetensors file in that folder along with all associated .json files and tokenizer.model (the .json files and tokenizer.model should come from the Hugging Face model folder of the same model type).
+
+Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in it to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or to `4bit-<groupsize>g.pt` / `4bit-<groupsize>g.safetensors` for groupsize models (example: `4bit-128g.safetensors`).
+
+Your .pt model folder should then look like this: `4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json`. Note: the 4bit.pt file can sit in the same folder as the regular HF .bin files it was quantized from; as long as the 4-bit toggle is on, the quantized model will be loaded (the 4-bit toggle is explained below).
+
+If you haven't done so already, exit the command prompt/leave KAI's conda env (close the command-line window on Windows, run `exit` on Linux).
+
+Run `play.bat` [Windows] or `play.sh` [Linux].
+
+Switch to UI2, enable Experimental UI under the Interface tab, then load your model and make sure the 4-bit toggle is on.
+
+The 4-bit toggle appears once a model to load is selected.
+
 ## KoboldAI - Your gateway to GPT writing
 
 This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed.
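A quick way to sanity-check the folder layout described in the guide above before launching KoboldAI. This is an illustrative sketch, not part of the fork; the `check_model_folder` helper, its file list, and the example path are assumptions drawn from the README text.

```python
# Illustrative sketch (not part of this fork): verify a model folder matches the
# layout described in the install guide above before loading it in KoboldAI.
import glob
import os

EXPECTED_FILES = [  # taken from the example folder listing in the guide
    "config.json", "generation_config.json", "pytorch_model.bin.index.json",
    "special_tokens_map.json", "tokenizer.model", "tokenizer_config.json",
]

def check_model_folder(modelpath):
    """Return the 4-bit weight file found in modelpath, or raise with a hint."""
    weights = glob.glob(os.path.join(modelpath, "4bit*.safetensors")) + \
              glob.glob(os.path.join(modelpath, "4bit*.pt"))
    if not weights:
        raise FileNotFoundError(
            "No 4bit.pt/4bit.safetensors (or 4bit-<groupsize>g.*) file found; "
            "rename your quantized weights as described in the guide above.")
    missing = [f for f in EXPECTED_FILES
               if not os.path.isfile(os.path.join(modelpath, f))]
    if missing:
        print(f"Warning: files missing from the Hugging Face model folder: {missing}")
    return weights[0]

# Example (hypothetical path): check_model_folder("models/llama-13b-4bit")
```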
diff --git a/aiserver.py b/aiserver.py
index 6b243efb..7e9241f5 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -87,6 +87,18 @@
 allowed_ips = set() # empty set
 enable_whitelist = False
 
+# 4-bit dependencies
+from pathlib import Path
+import glob
+sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
+from gptj import load_quant as gptj_load_quant
+from gptneox import load_quant as gptneox_load_quant
+from llama import load_quant as llama_load_quant
+from opt import load_quant as opt_load_quant
+from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
 
@@ -1002,8 +1014,6 @@ def getmodelname():
 def get_hidden_size_from_model(model):
     return model.get_input_embeddings().embedding_dim
-
-
 #==================================================================#
 # Allow the models to override some settings
 #==================================================================#
@@ -1305,7 +1315,7 @@ def general_startup(override_args=None):
     parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI")
     parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok")
     parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel")
-    parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
+    parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
     parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable")
     parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
     parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
@@ -1436,17 +1446,14 @@
     if args.localtunnel:
         koboldai_vars.host = True;
 
-    if args.host == "":
-        koboldai_vars.host = True
-        args.unblock = True
-    if args.host:
+    if args.host != "Disabled":
         # This means --host option was submitted without an argument
         # Enable all LAN IPs (0.0.0.0/0)
+        koboldai_vars.host = True
+        args.unblock = True
         if args.host != "":
             # Check if --host option was submitted with an argument
             # Parse the supplied IP(s) and add them to the allowed IPs list
-            koboldai_vars.host = True
-            args.unblock = True
             enable_whitelist = True
             for ip_str in args.host.split(","):
                 if "/" in ip_str:
@@ -1463,6 +1470,7 @@
 
             print(f"Allowed IPs: {allowed_ips}")
 
+
     if args.cpu:
         koboldai_vars.use_colab_tpu = False
 
@@ -1594,6 +1602,7 @@ def get_model_info(model, directory=""):
                         'break_values': break_values, 'gpu_count': gpu_count,
                         'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
                         'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False,
+                        'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False,
                         'show_custom_model_box': show_custom_model_box})
     if send_horde_models:
         get_cluster_models({'key': key_value, 'url': default_url})
@@ -1767,9 +1776,59 @@
     #Reload our badwords
     koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
+
+
+def prepare_4bit_load(modelpath):
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError(f"4-bit load failed. PT-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize
 
-def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
+def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False):
     global model
     global tokenizer
     global model_config
@@ -1807,7 +1866,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             disk_layers = args.breakmodel_disklayers
         if breakmodel_args_default_to_cpu and disk_layers is None:
             disk_layers = args.breakmodel_disklayers = 0
-    
+
     unload_model()
 
     if online_model == "":
@@ -6472,7 +6531,7 @@ def UI_2_load_model(data):
     koboldai_vars.model = data['model']
     koboldai_vars.custmodpth = data['path']
     print("loading Model")
-    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
+    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit'])
 
 #==================================================================#
 # Event triggered when load story is clicked
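The groupsize handling in `prepare_4bit_load` above is driven entirely by the weight filename. The snippet below reproduces that regex in isolation (same pattern as in the diff) to show how `4bit-128g.safetensors` yields a groupsize of 128 while plain `4bit.pt` and old-format files fall back to -1; the `groupsize_from_filename` wrapper itself is only for illustration.

```python
# Minimal reproduction of the filename-based groupsize detection used in
# prepare_4bit_load above: "4bit-128g.safetensors" -> 128, plain "4bit.pt" -> -1.
import re

def groupsize_from_filename(fname):
    g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
    return int(g[0]) if g else -1

assert groupsize_from_filename("4bit-128g.safetensors") == 128
assert groupsize_from_filename("4bit.safetensors") == -1
assert groupsize_from_filename("4bit-old.pt") == -1  # old-format files fall through to the monkey patch
```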
diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index 138993dd..daa25e1f 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -31,6 +31,7 @@ dependencies:
     - flask-cors
     - lupa==1.10
     - transformers==4.28.0
+    - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
     - huggingface_hub==0.12.1
     - safetensors
     - accelerate==0.18.0
diff --git a/environments/rocm.yml b/environments/rocm.yml
index e28c86cb..35fc098d 100644
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -30,9 +30,10 @@ dependencies:
     - flask-cors
     - lupa==1.10
     - transformers==4.28.0
+    - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
     - huggingface_hub==0.12.1
     - safetensors
-    - accelerate
+    - accelerate==0.18.0
     - git+https://github.com/VE-FORBRYDERNE/mkultra
     - ansi2html
     - flask_compress
diff --git a/install_requirements.bat b/install_requirements.bat
index 2a4534c1..3b735ddf 100644
--- a/install_requirements.bat
+++ b/install_requirements.bat
@@ -48,6 +48,8 @@ umamba.exe create -r B:\python\ -n base
 umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy
 umamba.exe -r B:\ clean -a -y
 rd B:\Python\pkgs /S /Q
+call B:\python\condabin\activate
+pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
 subst B: /d
 pause
 exit
@@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base
 umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy
 umamba.exe clean -a -y
 rd miniconda3\Python\pkgs /S /Q
+call miniconda3\condabin\activate
+pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
 pause
 exit
diff --git a/install_requirements.sh b/install_requirements.sh
index 6f0e0dfd..7b5a8d5b 100755
--- a/install_requirements.sh
+++ b/install_requirements.sh
@@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -
 bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
 # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster
 bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
+
+# Install quant_cuda module for 4-bit
+bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl
 exit
 fi
 if [[ $1 = "rocm" ]]; then
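Both installers above pull in a prebuilt wheel that provides the compiled GPTQ CUDA kernels. A quick post-install check is to import that extension from inside the KoboldAI runtime environment; note the module name `quant_cuda` is assumed from the wheel filename, so this is a sketch rather than an official verification step.

```python
# Quick post-install check (run inside the KoboldAI runtime env): the GPTQ wheel
# installed above is assumed to provide a compiled extension named "quant_cuda".
try:
    import quant_cuda  # noqa: F401
    print("quant_cuda extension found; 4-bit CUDA kernels are available.")
except ImportError as err:
    print(f"quant_cuda not importable ({err}); re-run the pip install step above.")
```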
diff --git a/koboldai_settings.py b/koboldai_settings.py
index 5088f8b1..86dea169 100644
--- a/koboldai_settings.py
+++ b/koboldai_settings.py
@@ -1211,7 +1211,7 @@
                     'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
                     'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
                     'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
-                    'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states']
+                    'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states']
     settings_name = "system"
     def __init__(self, socketio, koboldai_var):
         self._socketio = socketio
@@ -1305,6 +1305,8 @@
                 elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2:
                     self.bit_8_available = True
                     break
+        # Check if repos/gptq exists for 4-bit mode
+        self.bit_4_available = os.path.isdir("repos/gptq")
 
         self.seen_messages = []
diff --git a/repos/gptq b/repos/gptq
new file mode 160000
index 00000000..50b22e2b
--- /dev/null
+++ b/repos/gptq
@@ -0,0 +1 @@
+Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228
diff --git a/requirements.txt b/requirements.txt
index 13774e1f..6407303c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ markdown
 bleach==4.1.0
 sentencepiece
 protobuf
-accelerate
+accelerate==0.18.0
 flask-session==0.4.0
 marshmallow>=3.13
 apispec-webframeworks
diff --git a/static/koboldai.js b/static/koboldai.js
index cce66f80..89ee2ea1 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -1472,6 +1472,7 @@ function show_model_menu(data) {
 	document.getElementById("modelurl").classList.add("hidden");
 	document.getElementById("use_gpu_div").classList.add("hidden");
 	document.getElementById("use_8_bit_div").classList.add("hidden");
+	document.getElementById("use_4_bit_div").classList.add("hidden");
 	document.getElementById("modellayers").classList.add("hidden");
 	document.getElementById("oaimodel").classList.add("hidden");
 	var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1646,6 +1647,14 @@
 		document.getElementById("use_8_bit").checked = false;
 	}
 	
+	//hide or unhide 4 bit mode
+	if (data.bit_4_available) {
+		document.getElementById("use_4_bit_div").classList.remove("hidden");
+	} else {
+		document.getElementById("use_4_bit_div").classList.add("hidden");
+		document.getElementById("use_4_bit").checked = false;
+	}
+	
 	//default URL loading
 	if (data.default_url != null) {
 		document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,7 @@
 	}
 	
 	accept.disabled = false;
-	
+	set_4_bit_mode(invert=false);
 }
 
 function update_gpu_layers() {
@@ -1876,7 +1885,8 @@ function load_model() {
 					'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
 					'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models,
-					'use_8_bit': document.getElementById('use_8_bit').checked};
+					'use_8_bit': document.getElementById('use_8_bit').checked,
+					'use_4_bit': document.getElementById('use_4_bit').checked};
 	socket.emit("load_model", message);
 	closePopups();
 }
@@ -3160,6 +3170,15 @@ function save_preset() {
 	closePopups();
 }
 
+function set_4_bit_mode(invert=true) {
+	bit_4_status = document.getElementById("use_4_bit").checked;
+	if (invert) {
+		bit_4_status = !bit_4_status;
+	}
+}
+
+
+
 //--------------------------------------------General UI Functions------------------------------------
 function set_ui_level(level) {
 	for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) {
@@ -7301,4 +7320,4 @@ $el("#gamescreen").addEventListener("paste", function(event) {
 		false,
 		event.clipboardData.getData("text/plain")
 	);
-});
\ No newline at end of file
+});
diff --git a/templates/popups.html b/templates/popups.html
index 44cf7cb6..804b1b9f 100644
--- a/templates/popups.html
+++ b/templates/popups.html
@@ -75,6 +75,10 @@
 				Use 8 bit mode <!-- surrounding markup lost in extraction -->
+				Use 4 bit mode <!-- new checkbox block (container id "use_4_bit_div", input id "use_4_bit"); original HTML markup lost in extraction -->
@@ -402,4 +406,4 @@
-<!-- final line of the file; markup lost in extraction, change only adds a trailing newline -->
\ No newline at end of file
+<!-- final line of the file; markup lost in extraction, change only adds a trailing newline -->
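Tying the pieces above together: `koboldai_settings.py` sets `bit_4_available` from the presence of `repos/gptq`, `get_model_info` in `aiserver.py` forwards it to the UI only when experimental features are enabled, and `selected_model_info` in `koboldai.js` hides the checkbox otherwise. That is why the README asks you to turn on the Experimental UI before the 4-bit toggle appears. A standalone illustration of that gating logic, with a hypothetical helper name:

```python
# Standalone illustration of the 4-bit toggle gating used in this diff:
# koboldai_settings.py computes bit_4_available, get_model_info() forwards it
# only when experimental features are on, and the UI hides the checkbox otherwise.
import os

def bit_4_toggle_visible(experimental_features, repo_dir="repos/gptq"):
    bit_4_available = os.path.isdir(repo_dir)                    # koboldai_settings.py check
    return bit_4_available if experimental_features else False   # get_model_info() gate

print(bit_4_toggle_visible(experimental_features=True))   # True only if repos/gptq was cloned
print(bit_4_toggle_visible(experimental_features=False))  # always False -> use_4_bit_div stays hidden
```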