From 88d4dc88b9b63a28364c3fad0171ee23eaa766fa Mon Sep 17 00:00:00 2001 From: ebolam Date: Fri, 6 Oct 2023 09:40:41 -0400 Subject: [PATCH 01/13] Enhancements to auto-memory test. Seems to be more coherent. --- aiserver.py | 70 ++++++++++++++++++++++++++------------------ koboldai_settings.py | 5 ++-- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/aiserver.py b/aiserver.py index 0f064872..3b167462 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1474,7 +1474,7 @@ def general_startup(override_args=None): parser.add_argument("--cacheonly", action='store_true', help="Does not save the model to the models folder when it has been downloaded in the cache") parser.add_argument("--customsettings", help="Preloads arguements from json file. You only need to provide the location of the json file. Use customsettings.json template file. It can be renamed if you wish so that you can store multiple configurations. Leave any settings you want as default as null. Any values you wish to set need to be in double quotation marks") parser.add_argument("--no_ui", action='store_true', default=False, help="Disables the GUI and Socket.IO server while leaving the API server running.") - parser.add_argument("--summarizer_model", action='store', default="philschmid/bart-large-cnn-samsum", help="Huggingface model to use for summarization. Defaults to sshleifer/distilbart-cnn-12-6") + parser.add_argument("--summarizer_model", action='store', default="pszemraj/led-large-book-summary", help="Huggingface model to use for summarization. Defaults to pszemraj/led-large-book-summary") parser.add_argument("--max_summary_length", action='store', default=75, help="Maximum size for summary to send to image generation") parser.add_argument("--multi_story", action='store_true', default=False, help="Allow multi-story mode (experimental)") parser.add_argument("--peft", type=str, help="Specify the path or HuggingFace ID of a Peft to load it. Not supported on TPU. 
(Experimental)") @@ -7637,16 +7637,19 @@ def get_items_locations_from_text(text): #==================================================================# def summarize(text, max_length=100, min_length=30, unload=True): from transformers import pipeline as summary_pipeline + from transformers import AutoConfig start_time = time.time() if koboldai_vars.summarizer is None: if os.path.exists("functional_models/{}".format(args.summarizer_model.replace('/', '_'))): koboldai_vars.summary_tokenizer = AutoTokenizer.from_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), cache_dir="cache") koboldai_vars.summarizer = AutoModelForSeq2SeqLM.from_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), cache_dir="cache") + koboldai_vars.summary_model_config = AutoConfig.from_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), cache_dir="cache") else: koboldai_vars.summary_tokenizer = AutoTokenizer.from_pretrained(args.summarizer_model, cache_dir="cache") koboldai_vars.summarizer = AutoModelForSeq2SeqLM.from_pretrained(args.summarizer_model, cache_dir="cache") koboldai_vars.summary_tokenizer.save_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), max_shard_size="500MiB") koboldai_vars.summarizer.save_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), max_shard_size="500MiB") + koboldai_vars.summary_model_config = AutoConfig.from_pretrained(args.summarizer_model, cache_dir="cache") #Try GPU accel if koboldai_vars.hascuda and torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_reserved(0) >= 1645778560: @@ -7660,9 +7663,27 @@ def summarize(text, max_length=100, min_length=30, unload=True): #Actual sumarization start_time = time.time() #make sure text is less than 1024 tokens, otherwise we'll crash - if len(koboldai_vars.summary_tokenizer.encode(text)) > 1000: - text = koboldai_vars.summary_tokenizer.decode(koboldai_vars.summary_tokenizer.encode(text)[:1000]) - output = tpool.execute(summarizer, text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text'] + max_tokens = koboldai_vars.summary_model_config.max_encoder_position_embeddings if hasattr(koboldai_vars.summary_model_config, 'max_encoder_position_embeddings') else 1024 + logger.info("Using max summary tokens of {}".format(max_tokens)) + if len(koboldai_vars.summary_tokenizer.encode(text)) > max_tokens: + text_list = koboldai_vars.actions.sentence_re.findall(text) + i=0 + while i <= len(text_list)-2: + if len(koboldai_vars.summary_tokenizer.encode(text_list[i] + text_list[i+1])) < max_tokens: + text_list[i] = text_list[i] + text_list[i+1] + del text_list[i+1] + else: + i+=1 + + + else: + text_list = [text] + + output = [] + logger.info("Summarizing with {} chunks of length {}".format(len(text_list), [len(koboldai_vars.summary_tokenizer.encode(x)) for x in text_list])) + for text in text_list: + output.append(tpool.execute(summarizer, text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']) + output = " ".join(output) logger.debug("Time to summarize: {}".format(time.time()-start_time)) #move model back to CPU to save precious vram torch.cuda.empty_cache() @@ -7682,40 +7703,33 @@ def summarize(text, max_length=100, min_length=30, unload=True): @socketio.on("refresh_auto_memory") @logger.catch def UI_2_refresh_auto_memory(data): + max_output_length=500 + from transformers import AutoConfig koboldai_vars.auto_memory = "Generating..." 
if koboldai_vars.summary_tokenizer is None: - if os.path.exists("models/{}".format(args.summarizer_model.replace('/', '_'))): - koboldai_vars.summary_tokenizer = AutoTokenizer.from_pretrained("models/{}".format(args.summarizer_model.replace('/', '_')), cache_dir="cache") + if os.path.exists("functional_models/{}".format(args.summarizer_model.replace('/', '_'))): + koboldai_vars.summary_tokenizer = AutoTokenizer.from_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), cache_dir="cache") + koboldai_vars.summary_model_config = AutoConfig.from_pretrained("functional_models/{}".format(args.summarizer_model.replace('/', '_')), cache_dir="cache") else: koboldai_vars.summary_tokenizer = AutoTokenizer.from_pretrained(args.summarizer_model, cache_dir="cache") - #first, let's get all of our game text and split it into sentences - sentences = [x[0] for x in koboldai_vars.actions.to_sentences()] - sentences_lengths = [len(koboldai_vars.summary_tokenizer.encode(x)) for x in sentences] + koboldai_vars.summary_model_config = AutoConfig.from_pretrained(args.summarizer_model, cache_dir="cache") + max_tokens = koboldai_vars.summary_model_config.max_encoder_position_embeddings if hasattr(koboldai_vars.summary_model_config, 'max_encoder_position_embeddings') else 1024 + + #first, let's get all of our game text + sentences = "".join([x[0] for x in koboldai_vars.actions.to_sentences()]) pass_number = 1 - while len(koboldai_vars.summary_tokenizer.encode("".join(sentences))) > 1000: - #Now let's split them into 1000 token chunks - summary_chunks = [""] - summary_chunk_lengths = [0] - for i in range(len(sentences)): - if summary_chunk_lengths[-1] + sentences_lengths[i] <= 1000: - summary_chunks[-1] += sentences[i] - summary_chunk_lengths[-1] += sentences_lengths[i] - else: - summary_chunks.append(sentences[i]) - summary_chunk_lengths.append(sentences_lengths[i]) - new_sentences = [] - i=0 - for summary_chunk in summary_chunks: - logger.debug("summarizing chunk {}".format(i)) - new_sentences.extend(re.split("(?<=[.!?])\s+", summarize(summary_chunk, unload=False))) - i+=1 + while len(koboldai_vars.summary_tokenizer.encode(sentences)) > max_tokens: + new_sentences = summarize(sentences, unload=False, max_length=max_output_length) logger.debug("Pass {}:\nSummarized to {} sentencees from {}".format(pass_number, len(new_sentences), len(sentences))) sentences = new_sentences - koboldai_vars.auto_memory += "Pass {}:\n{}\n\n".format(pass_number, "\n".join(sentences)) + koboldai_vars.auto_memory += "Pass {}:\n{}\n\n".format(pass_number, sentences) pass_number+=1 logger.debug("OK, doing final summarization") - output = summarize(" ".join(sentences)) + if len(koboldai_vars.summary_tokenizer.encode(sentences)) > max_output_length: + output = summarize(sentences, max_length=max_output_length) + else: + output = sentences koboldai_vars.auto_memory += "\n\n Final Result:\n" + output diff --git a/koboldai_settings.py b/koboldai_settings.py index 159031ea..afd7efc8 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1247,9 +1247,9 @@ class undefined_settings(settings): class system_settings(settings): local_only_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'regex_sl', 'acregex_ai', 'acregex_ui', 'comregex_ai', 'comregex_ui', - 'sp', '_horde_pid', 'inference_config', 'image_pipeline', + 'sp', '_horde_pid', 'inference_config', 'image_pipeline', 'summary_model_config', 'summarizer', 'summary_tokenizer', 'tts_model', 'rng_states', 'comregex_ai', 
'comregex_ui', 'colab_arg'] - no_save_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', + no_save_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'summary_model_config', 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'on_colab' 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', @@ -1334,6 +1334,7 @@ class system_settings(settings): self.image_pipeline = None self.summarizer = None self.summary_tokenizer = None + self.summary_model_config = {} self.keep_img_gen_in_memory = False self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False From 334eec61274909ac7085c6f880be59463487f755 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 12 Oct 2023 20:41:20 -0400 Subject: [PATCH 02/13] Fix for text streaming not scrolling properly. --- static/koboldai.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/static/koboldai.js b/static/koboldai.js index 3762be6c..016d87e1 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -3602,6 +3602,7 @@ function stream_tokens(tokens) { streamBuffer.textContent += streaming.buffer[0]; streaming.buffer = streaming.buffer.slice(1); + streamBuffer.scrollIntoView({ block: "end" }); } streaming.typeyTimeout = setTimeout(_char, 10); @@ -3621,6 +3622,7 @@ function stream_tokens(tokens) { streaming.buffer += tokens[0]; } else { streamBuffer.textContent += tokens[0]; + streamBuffer.scrollIntoView({ block: "end" }); } } From cbbcc6250e3d5dd678763973dbcad2f5c55a31c5 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 12 Oct 2023 20:42:42 -0400 Subject: [PATCH 03/13] Fix for exllama (v1 and v2) showing 2x status (0-200%) on generation --- modeling/inference_models/exllama/class.py | 3 ++- modeling/inference_models/exllamav2/class.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index f688d611..569f6d61 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -340,7 +340,8 @@ class model_backend(InferenceModel): self._post_token_gen(self.generator.sequence) - utils.koboldai_vars.generated_tkns += 1 + #This is taken care of in the core stopper class that's called below. If you're not using core stoppers then it should remain here + #utils.koboldai_vars.generated_tkns += 1 # Apply stoppers do_stop = False diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py index 15b91c8d..dd97e83f 100644 --- a/modeling/inference_models/exllamav2/class.py +++ b/modeling/inference_models/exllamav2/class.py @@ -315,7 +315,8 @@ class model_backend(InferenceModel): self._post_token_gen(self.generator.sequence_ids) - utils.koboldai_vars.generated_tkns += 1 + #This is taken care of in the core stopper class that's called below. 
If you're not using core stoppers then it should remain here + #utils.koboldai_vars.generated_tkns += 1 # Apply stoppers do_stop = False From d5dd24a3303c6f85360eff5a771d8d2c102bf779 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 12 Oct 2023 21:04:00 -0400 Subject: [PATCH 04/13] Added setting saving for exllama and exllamav2 --- modeling/inference_models/exllama/class.py | 32 +++++++++++++++++--- modeling/inference_models/exllamav2/class.py | 31 ++++++++++++++++--- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 569f6d61..b52c2d65 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -375,6 +375,12 @@ class model_backend(InferenceModel): return tokenizer def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + saved_data = {'layers': [], 'max_ctx': 2048, 'compress_emb': 1, 'ntk_alpha': 1} + if os.path.exists("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self): + with open("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_")), "r") as f: + temp = json.load(f) + for key in temp: + saved_data[key] = temp[key] requested_parameters = [] gpu_count = torch.cuda.device_count() layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None @@ -401,7 +407,7 @@ class model_backend(InferenceModel): "step": 1, "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, "check_message": "The sum of assigned layers must equal {}".format(layer_count), - "default": [layer_count if i == 0 else 0], + "default": saved_data['layers'][i] if len(saved_data['layers']) > i else layer_count if i==0 else 0, "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)), "menu_path": "Layers", "extra_classes": "", @@ -416,7 +422,7 @@ class model_backend(InferenceModel): "min": 2048, "max": 16384, "step": 512, - "default": 2048, + "default": saved_data['max_ctx'], "tooltip": "The maximum context size the model supports", "menu_path": "Configuration", "extra_classes": "", @@ -431,7 +437,7 @@ class model_backend(InferenceModel): "min": 1, "max": 8, "step": 0.25, - "default": 1, + "default": saved_data['compress_emb'], "tooltip": "If the model requires compressed embeddings, set them here", "menu_path": "Configuration", "extra_classes": "", @@ -446,7 +452,7 @@ class model_backend(InferenceModel): "min": 1, "max": 32, "step": 0.25, - "default": 1, + "default": saved_data['ntk_alpha'], "tooltip": "NTK alpha value", "menu_path": "Configuration", "extra_classes": "", @@ -491,3 +497,21 @@ class model_backend(InferenceModel): self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] self.path = parameters['path'] if 'path' in parameters else None + + def _save_settings(self): + with open( + "settings/{}.exllama.model_backend.settings".format( + self.model_name.replace("/", "_") + ), + "w", + ) as f: + json.dump( + { + "layers": self.layers if "layers" in vars(self) else [], + "max_ctx": self.model_config.max_seq_len, + "compress_emb": 
self.model_config.compress_pos_emb, + "ntk_alpha": self.model_config.alpha_value + }, + f, + indent="", + ) \ No newline at end of file diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py index dd97e83f..7757202f 100644 --- a/modeling/inference_models/exllamav2/class.py +++ b/modeling/inference_models/exllamav2/class.py @@ -351,6 +351,12 @@ class model_backend(InferenceModel): return tokenizer def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + saved_data = {'max_ctx': 2048, 'compress_emb': 1, 'ntk_alpha': 1} + if os.path.exists("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self): + with open("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_")), "r") as f: + temp = json.load(f) + for key in temp: + saved_data[key] = temp[key] requested_parameters = [] gpu_count = torch.cuda.device_count() layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None @@ -363,7 +369,7 @@ class model_backend(InferenceModel): "min": 2048, "max": 16384, "step": 512, - "default": 2048, + "default": saved_data['max_ctx'], "tooltip": "The maximum context size the model supports", "menu_path": "Configuration", "extra_classes": "", @@ -378,7 +384,7 @@ class model_backend(InferenceModel): "min": 1, "max": 8, "step": 0.25, - "default": 1, + "default": saved_data['compress_emb'], "tooltip": "If the model requires compressed embeddings, set them here", "menu_path": "Configuration", "extra_classes": "", @@ -393,7 +399,7 @@ class model_backend(InferenceModel): "min": 1, "max": 32, "step": 0.25, - "default": 1, + "default": saved_data['ntk_alpha'], "tooltip": "NTK alpha value", "menu_path": "Configuration", "extra_classes": "", @@ -420,4 +426,21 @@ class model_backend(InferenceModel): self.model_config.sdp_thd = 0 self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] - self.path = parameters['path'] if 'path' in parameters else None \ No newline at end of file + self.path = parameters['path'] if 'path' in parameters else None + + def _save_settings(self): + with open( + "settings/{}.exllamav2.model_backend.settings".format( + self.model_name.replace("/", "_") + ), + "w", + ) as f: + json.dump( + { + "max_ctx": self.model_config.max_seq_len, + "compress_emb": self.model_config.compress_pos_emb, + "ntk_alpha": self.model_config.alpha_value + }, + f, + indent="", + ) \ No newline at end of file From 79e951b56487ad66c8e0318759787b13364841e3 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 12 Oct 2023 21:22:23 -0400 Subject: [PATCH 05/13] Fix for slider bars in model load not setting correctly Fixed --quiet being enforced once set forever --- koboldai_settings.py | 2 +- static/koboldai.js | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index afd7efc8..53c993ff 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1251,7 +1251,7 @@ class system_settings(settings): 'summarizer', 'summary_tokenizer', 'tts_model', 'rng_states', 'comregex_ai', 'comregex_ui', 'colab_arg'] no_save_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 
'summary_model_config', 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', - 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'on_colab' + 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'on_colab', 'quiet', 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', 'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states', 'comregex_ai', 'comregex_ui', 'git_repository', 'git_branch', 'colab_arg'] settings_name = "system" diff --git a/static/koboldai.js b/static/koboldai.js index 016d87e1..bb6b558a 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -2008,11 +2008,12 @@ function selected_model_info(sent_data) { slider_number.onchange = function() { document.getElementById(this.id.replace("_text", "")).value = this.value;}; var slider = new_setting.querySelector('#blank_model_settings_slider'); - slider.value = item['default']; slider.min = item['min']; slider.max = item['max']; + slider.step = item["step"]; slider.setAttribute("data_type", item['unit']); slider.id = loader + "|" + item['id'] + "_value"; + slider.value = item['default']; if ('check' in item) { slider.check_data = item['check']; slider_number.check_data = item['check']; From c1a96593fdcb7103da6734baab0f5d0c79a9f652 Mon Sep 17 00:00:00 2001 From: ebolam Date: Fri, 13 Oct 2023 12:32:33 -0400 Subject: [PATCH 06/13] Prevent model loading status from going over 100% --- modeling/patches.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modeling/patches.py b/modeling/patches.py index 5664ec07..01db7b20 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -220,6 +220,7 @@ class LazyloadPatches: # BEGIN PATCH utils.bar = tqdm(total=len(state_dict), desc="Loading model tensors", file=utils.UIProgressBarFile(), position=1) utils.koboldai_vars.total_layers = len(state_dict) + utils.koboldai_vars.loaded_layers = 0 for param_name, param in sorted( state_dict.items(), From 21e6d848106f28aade54d9c6a73813f1334cfa09 Mon Sep 17 00:00:00 2001 From: somebody Date: Fri, 13 Oct 2023 12:33:00 -0500 Subject: [PATCH 07/13] Add checkpoint tracking for loading With the index of checkpoint files here (and with the total_size in the index json) we could probably have a cleaner per-byte loading bar in the future, but let's not break anything for now. 
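
For reference, a rough sketch of what reading that byte total could look like
(illustrative only, not wired up in this commit; it assumes the same
pytorch_model.bin.index.json layout that the loader below already reads
weight_map from, where the index also carries metadata.total_size):

    import json, os

    def get_checkpoint_total_size(model_dir):
        # The sharded-checkpoint index stores an overall byte count under
        # metadata.total_size; fall back to 0 if the index or key is missing.
        try:
            with open(os.path.join(model_dir, "pytorch_model.bin.index.json")) as f:
                index = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return 0
        return index.get("metadata", {}).get("total_size", 0)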
--- koboldai_settings.py | 7 +++++-- modeling/lazy_loader.py | 25 ++++++++++++++++++++++++- modeling/patches.py | 1 + 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index 53c993ff..8d644b9d 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -687,8 +687,9 @@ class settings(object): class model_settings(settings): local_only_variables = ['apikey', 'default_preset'] - no_save_variables = ['modelconfig', 'custmodpth', 'generated_tkns', - 'loaded_layers', 'total_layers', 'total_download_chunks', 'downloaded_chunks', 'presets', 'default_preset', + no_save_variables = ['modelconfig', 'custmodpth', 'generated_tkns', + 'loaded_layers', 'total_layers', 'loaded_checkpoints', 'total_checkpoints', + 'total_download_chunks', 'downloaded_chunks', 'presets', 'default_preset', 'welcome', 'welcome_default', 'simple_randomness', 'simple_creativity', 'simple_repitition', 'badwordsids', 'uid_presets', 'model', 'model_type', 'lazy_load', 'fp32_model', 'modeldim', 'horde_wait_time', 'horde_queue_position', 'horde_queue_size', 'newlinemode', 'tqdm_progress', 'tqdm_rem_time', '_tqdm'] settings_name = "model" @@ -705,6 +706,8 @@ class model_settings(settings): self.generated_tkns = 0 # If using a backend that supports Lua generation modifiers, how many tokens have already been generated, otherwise 0 self.loaded_layers = 0 # Used in UI 2 to show model loading progress self.total_layers = 0 # Same as above + self.loaded_checkpoints = 0 + self.total_checkpoints = 0 self.total_download_chunks = 0 # tracks how much of the model has downloaded for the UI 2 self.downloaded_chunks = 0 #as above self._tqdm = tqdm.tqdm(total=self.genamt, file=self.ignore_tqdm()) # tqdm agent for generating tokens. This will allow us to calculate the remaining time diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index eece7d2f..f4ce6fb7 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -52,7 +52,8 @@ import zipfile import pickle import torch import os -from typing import Any, Callable, Dict, Optional, Tuple, Type +import json +from typing import Any, Callable, Dict, List, Optional, Tuple, Type from torch.nn import Module from torch.storage import UntypedStorage @@ -398,6 +399,18 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss if input_name not in self._modules and input_name not in local_state: unexpected_keys.append(key) +def get_sharded_torch_checkpoints(dir: str) -> List[str]: + try: + with open(os.path.join(dir, "pytorch_model.bin.index.json")) as file: + j = json.load(file) + except FileNotFoundError: + return [] + + try: + return list(set(j["weight_map"].values())) + except KeyError: + return [] + @contextlib.contextmanager def use_lazy_load( enable=True, @@ -410,6 +423,8 @@ def use_lazy_load( return begin_time = time.time() + utils.koboldai_vars.total_checkpoints = 0 + utils.koboldai_vars.loaded_checkpoints = 0 try: LazyloadPatches.__enter__() @@ -421,6 +436,14 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): + if not utils.koboldai_vars.total_checkpoints: + checkpoints = get_sharded_torch_checkpoints(os.path.dirname(f)) + # `checkpoints` may be empty if there is an error--return 1 in + # this case. Either there was no checkpoint index file (most + # common case), or there was a compatibility issue while reading + # it. 
+ utils.koboldai_vars.total_checkpoints = len(checkpoints) or 1 + model_dict = old_torch_load( f=f, map_location=map_location, diff --git a/modeling/patches.py b/modeling/patches.py index 01db7b20..35bc9eec 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -326,6 +326,7 @@ class LazyloadPatches: fp16_statistics=fp16_statistics, ) + utils.koboldai_vars.loaded_checkpoints += 1 return error_msgs, offload_index, state_dict_index From 2319d1d87b39ea68404ccd667793879bf2106076 Mon Sep 17 00:00:00 2001 From: ebolam Date: Fri, 13 Oct 2023 14:17:03 -0400 Subject: [PATCH 08/13] Model load progress bar fixed for multi-checkpoint models --- koboldai_settings.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index 8d644b9d..9b6277d0 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -707,7 +707,7 @@ class model_settings(settings): self.loaded_layers = 0 # Used in UI 2 to show model loading progress self.total_layers = 0 # Same as above self.loaded_checkpoints = 0 - self.total_checkpoints = 0 + self.total_checkpoints = 1 self.total_download_chunks = 0 # tracks how much of the model has downloaded for the UI 2 self.downloaded_chunks = 0 #as above self._tqdm = tqdm.tqdm(total=self.genamt, file=self.ignore_tqdm()) # tqdm agent for generating tokens. This will allow us to calculate the remaining time @@ -832,13 +832,22 @@ class model_settings(settings): #Setup TQDP for model loading elif name == "loaded_layers" and '_tqdm' in self.__dict__: if value == 0: - self._tqdm.reset(total=self.total_layers) + self._tqdm.reset(total=self.total_layers if self.total_checkpoints == 1 else 1000) self.tqdm_progress = 0 else: - self._tqdm.update(1) - self.tqdm_progress = int(float(self.loaded_layers)/float(self.total_layers)*100) + if self.total_checkpoints == 1: + self._tqdm.update(1) + elif self.total_layers != 0 and self.total_checkpoints != 0: + proper_progress = (self.loaded_checkpoints + value/self.total_layers)/self.total_checkpoints*1000 + self._tqdm.update(proper_progress - self._tqdm.n) + + self.tqdm_progress = int(float(self._tqdm.n)/float(self._tqdm.total)*100) + if self._tqdm.format_dict['rate'] is not None: - self.tqdm_rem_time = str(datetime.timedelta(seconds=int(float(self.total_layers-self.loaded_layers)/self._tqdm.format_dict['rate']))) + elapsed = self._tqdm.format_dict["elapsed"] + rate = self._tqdm.format_dict["rate"] + remaining = (self._tqdm.total - self._tqdm.n) / rate if rate and self._tqdm.total else 0 + self.tqdm_rem_time = str(datetime.timedelta(seconds=remaining)) #Setup TQDP for model downloading elif name == "total_download_chunks" and '_tqdm' in self.__dict__: self._tqdm.reset(total=value) From 7545754c3d2ff254035f849dd82edd6e2a2c1072 Mon Sep 17 00:00:00 2001 From: ebolam Date: Fri, 13 Oct 2023 20:21:07 -0400 Subject: [PATCH 09/13] Allow clear on images to actually clear past images. 
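
The clear_generated_image event now carries the action_id of the image being
cleared: when it is present, the stored picture and prompt for that action are
removed via actions.clear_picture(); when it is absent, the current picture is
cleared as before. get_story_image likewise returns {'img': ..., 'action_id': ...}
so the client can tag the rendered image with its action_id and send it back
when clearing.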
--- aiserver.py | 11 ++++++++--- koboldai_settings.py | 6 ++++++ static/koboldai.js | 8 +++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 6962a378..558987ad 100644 --- a/aiserver.py +++ b/aiserver.py @@ -7585,8 +7585,11 @@ def text2img_api(prompt, art_guide="") -> Image.Image: @socketio.on("clear_generated_image") @logger.catch def UI2_clear_generated_image(data): - koboldai_vars.picture = "" - koboldai_vars.picture_prompt = "" + if 'action_id' in data and data['action_id'] is not None: + koboldai_vars.actions.clear_picture(data['action_id']) + else: + koboldai_vars.picture = "" + koboldai_vars.picture_prompt = "" #==================================================================# # Retrieve previous images @@ -7599,7 +7602,9 @@ def UI_2_get_story_image(data): print(filename) if filename is not None: with open(filename, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") + return {'img': base64.b64encode(image_file.read()).decode("utf-8"), 'action_id': action_id} + else: + return {'img': None, 'action_id': action_id} #@logger.catch def get_items_locations_from_text(text): diff --git a/koboldai_settings.py b/koboldai_settings.py index 9b6277d0..da4f839a 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -2244,6 +2244,12 @@ class KoboldStoryRegister(object): self.actions[action_id]['picture_filename'] = filename self.actions[action_id]['picture_prompt'] = prompt + def clear_picture(self, action_id): + action_id = int(action_id) + if action_id in self.actions: + del self.actions[action_id]['picture_filename'] + del self.actions[action_id]['picture_prompt'] + def get_picture(self, action_id): if action_id == -1: if self.story_settings.prompt_picture_filename == "": diff --git a/static/koboldai.js b/static/koboldai.js index bb6b558a..b15796ab 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -3877,12 +3877,13 @@ function change_image(data) { $el("#image-loading").classList.add("hidden"); - if (data != undefined) { + if (data.img != undefined) { var image = new Image(); - image.src = 'data:image/png;base64,'+data; + image.src = 'data:image/png;base64,'+data.img; image.classList.add("action_image"); image.setAttribute("context-menu", "generated-image"); image.addEventListener("click", imgGenView); + image.setAttribute('action_id', data.action_id); image_area.appendChild(image); } } @@ -7398,11 +7399,12 @@ function imgGenDownload() { function imgGenClear() { const image = $el(".action_image"); if (!image) return; + action_id = image.getAttribute('action_id'); image.remove(); const container = $el("#action\\ image"); container.removeAttribute("tooltip"); - socket.emit("clear_generated_image", {}); + socket.emit("clear_generated_image", {'action_id': action_id}); } function imgGenRetry() { From 86b60afaead064a4d4283a88b372f25a83b45373 Mon Sep 17 00:00:00 2001 From: ebolam Date: Sat, 14 Oct 2023 10:21:03 -0400 Subject: [PATCH 10/13] Missed a pip install for subfolder --- install_tortiose_tts.bat | 1 + 1 file changed, 1 insertion(+) diff --git a/install_tortiose_tts.bat b/install_tortiose_tts.bat index 3baf7583..43964bfc 100644 --- a/install_tortiose_tts.bat +++ b/install_tortiose_tts.bat @@ -21,6 +21,7 @@ ECHO Runtime launching in subfolder mode call miniconda3\condabin\activate pip install git+https://github.com/neonbjb/tortoise-tts progressbar inflect librosa rotary-embedding-torch unidecode lazy_loader llvmlite numba joblib decorator audioread msgpack pooch scikit-learn soundfile soxr 
platformdirs threadpoolctl pydantic-core annotated-types pydantic --no-dependencies pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 +pip install -r requirements.txt --no-dependencies cmd /k pause exit From 782f3149984d442d941bfd9ba48c3dd2b8dbe27f Mon Sep 17 00:00:00 2001 From: ebolam Date: Sat, 14 Oct 2023 10:21:53 -0400 Subject: [PATCH 11/13] merge caused two packages to be on one line --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d6ac290..3450a803 100644 --- a/requirements.txt +++ b/requirements.txt @@ -50,4 +50,5 @@ windows-curses; sys_platform == 'win32' pynvml flash_attn==2.3.0 xformers==0.0.21 -exllamav2==0.0.4omegaconf +exllamav2==0.0.4 +omegaconf From f06069cc797cc4344d8b3398488c7d8f27383aa6 Mon Sep 17 00:00:00 2001 From: ebolam Date: Sat, 14 Oct 2023 11:06:03 -0400 Subject: [PATCH 12/13] Fix for tortoise tts install script --- install_tortiose_tts.bat | 9 ++++++--- install_tortiose_tts.sh | 26 +++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/install_tortiose_tts.bat b/install_tortiose_tts.bat index 43964bfc..d89855d5 100644 --- a/install_tortiose_tts.bat +++ b/install_tortiose_tts.bat @@ -21,7 +21,8 @@ ECHO Runtime launching in subfolder mode call miniconda3\condabin\activate pip install git+https://github.com/neonbjb/tortoise-tts progressbar inflect librosa rotary-embedding-torch unidecode lazy_loader llvmlite numba joblib decorator audioread msgpack pooch scikit-learn soundfile soxr platformdirs threadpoolctl pydantic-core annotated-types pydantic --no-dependencies pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 -pip install -r requirements.txt --no-dependencies +REM pip install -r requirements.txt --no-dependencies +umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy cmd /k pause exit @@ -33,7 +34,8 @@ subst K: miniconda3 >nul call K:\python\condabin\activate pip install git+https://github.com/neonbjb/tortoise-tts progressbar inflect librosa rotary-embedding-torch unidecode lazy_loader llvmlite numba joblib decorator audioread msgpack pooch scikit-learn soundfile soxr platformdirs threadpoolctl pydantic-core annotated-types pydantic --no-dependencies pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 -pip install -r requirements.txt --no-dependencies +REM pip install -r requirements.txt --no-dependencies +umamba.exe install --no-shortcuts -r K:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy cmd /k pause exit @@ -45,7 +47,8 @@ subst B: miniconda3 >nul call B:\python\condabin\activate pip install git+https://github.com/neonbjb/tortoise-tts progressbar inflect librosa rotary-embedding-torch unidecode lazy_loader llvmlite numba joblib decorator audioread msgpack pooch scikit-learn soundfile soxr platformdirs threadpoolctl pydantic-core annotated-types pydantic --no-dependencies pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 -pip install -r requirements.txt --no-dependencies +REM pip install -r requirements.txt --no-dependencies +umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy cmd /k pause exit \ No newline at end of file diff --git a/install_tortiose_tts.sh b/install_tortiose_tts.sh index 1a978ab4..46f51e7b 100755 --- a/install_tortiose_tts.sh +++ b/install_tortiose_tts.sh @@ -1,4 +1,28 @@ #!/bin/bash bin/micromamba 
run -r runtime -n koboldai pip install git+https://github.com/neonbjb/tortoise-tts OmegaConf deepspeed bin/micromamba run -r runtime -n koboldai pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 -bin/micromamba run -r runtime -n koboldai pip install -r requirements.txt --no-dependencies + +export PYTHONNOUSERSITE=1 +git submodule update --init --recursive +if [[ $1 = "cuda" || $1 = "CUDA" ]]; then +wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba +bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y +# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster +bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y +exit +fi +if [[ $1 = "rocm" || $1 = "ROCM" ]]; then +wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba +bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y +# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster +bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y +exit +fi +if [[ $1 = "ipex" || $1 = "IPEX" ]]; then +wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba +bin/micromamba create -f environments/ipex.yml -r runtime -n koboldai-ipex -y +# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster +bin/micromamba create -f environments/ipex.yml -r runtime -n koboldai-ipex -y +exit +fi +echo Please specify either CUDA or ROCM or IPEX From e946da7d0b6d868d007f183faf08240aa10f2b78 Mon Sep 17 00:00:00 2001 From: ebolam Date: Sat, 14 Oct 2023 11:28:35 -0400 Subject: [PATCH 13/13] Removed tortoise tts install scripts --- install_tortiose_tts.bat | 54 ---------------------------------------- install_tortiose_tts.sh | 28 --------------------- 2 files changed, 82 deletions(-) delete mode 100644 install_tortiose_tts.bat delete mode 100755 install_tortiose_tts.sh diff --git a/install_tortiose_tts.bat b/install_tortiose_tts.bat deleted file mode 100644 index d89855d5..00000000 --- a/install_tortiose_tts.bat +++ /dev/null @@ -1,54 +0,0 @@ -@echo off -cd /D %~dp0 - -:Isolation -call conda deactivate 2>NUL -set Path=%windir%\system32;%windir%;C:\Windows\System32\Wbem;%windir%\System32\WindowsPowerShell\v1.0\;%windir%\System32\OpenSSH\ -SET CONDA_SHLVL= -SET PYTHONNOUSERSITE=1 -SET PYTHONPATH= - -rmdir /S /Q flask_session 2>NUL - -TITLE KoboldAI - Server -SET /P M=nul -subst K: miniconda3 >nul -call K:\python\condabin\activate -pip install git+https://github.com/neonbjb/tortoise-tts progressbar inflect librosa rotary-embedding-torch unidecode lazy_loader llvmlite numba joblib decorator audioread msgpack pooch scikit-learn soundfile soxr platformdirs threadpoolctl pydantic-core annotated-types pydantic --no-dependencies -pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 -REM pip install -r requirements.txt --no-dependencies -umamba.exe install --no-shortcuts -r K:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy -cmd /k -pause -exit - -:drivemap_B -ECHO Runtime launching in B: drive mode -subst /D B: >nul -subst B: miniconda3 >nul -call B:\python\condabin\activate -pip install git+https://github.com/neonbjb/tortoise-tts progressbar inflect librosa rotary-embedding-torch unidecode 
lazy_loader llvmlite numba joblib decorator audioread msgpack pooch scikit-learn soundfile soxr platformdirs threadpoolctl pydantic-core annotated-types pydantic --no-dependencies -pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 -REM pip install -r requirements.txt --no-dependencies -umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy -cmd /k -pause -exit \ No newline at end of file diff --git a/install_tortiose_tts.sh b/install_tortiose_tts.sh deleted file mode 100755 index 46f51e7b..00000000 --- a/install_tortiose_tts.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -bin/micromamba run -r runtime -n koboldai pip install git+https://github.com/neonbjb/tortoise-tts OmegaConf deepspeed -bin/micromamba run -r runtime -n koboldai pip install torchaudio --index-url https://download.pytorch.org/whl/cu118 - -export PYTHONNOUSERSITE=1 -git submodule update --init --recursive -if [[ $1 = "cuda" || $1 = "CUDA" ]]; then -wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba -bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y -# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y -exit -fi -if [[ $1 = "rocm" || $1 = "ROCM" ]]; then -wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba -bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y -# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y -exit -fi -if [[ $1 = "ipex" || $1 = "IPEX" ]]; then -wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba -bin/micromamba create -f environments/ipex.yml -r runtime -n koboldai-ipex -y -# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/ipex.yml -r runtime -n koboldai-ipex -y -exit -fi -echo Please specify either CUDA or ROCM or IPEX