diff --git a/aiserver.py b/aiserver.py index 21689acd..d872c2fc 100644 --- a/aiserver.py +++ b/aiserver.py @@ -66,6 +66,7 @@ import lupa # KoboldAI import fileops import gensettings +import breakmodel from utils import debounce import utils import koboldai_settings @@ -80,6 +81,7 @@ except: from transformers import GenerationMixin +from model import GenericHFTorchInferenceModel, CustomGPT2HFTorchInferenceModel # Text2img import base64 from PIL import Image @@ -327,23 +329,6 @@ model_menu = { ] } - - -class Send_to_socketio(object): - def write(self, bar): - bar = bar.replace("\r", "").replace("\n", "").replace(chr(0), "") - if bar != "" and [ord(num) for num in bar] != [27, 91, 65]: #No idea why we're getting the 27, 1, 65 character set, just killing to so we can move on - #logger.info(bar) - print('\r' + bar, end='') - time.sleep(0.01) - try: - socketio.emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", " ")}, broadcast=True, room="UI_1") - except: - pass - - def flush(self): - pass - @dataclass class ImportBuffer: # Singleton!!! @@ -969,214 +954,7 @@ def getmodelname(): def get_hidden_size_from_model(model): return model.get_input_embeddings().embedding_dim -#==================================================================# -# Breakmodel configuration functions -#==================================================================# -def device_list(n_layers, primary=None, selected=None): - device_count = torch.cuda.device_count() - if(device_count < 2): - primary = None - gpu_blocks = breakmodel.gpu_blocks + (device_count - len(breakmodel.gpu_blocks))*[0] - print(f"{colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{colors.END}") - for i in range(device_count): - name = torch.cuda.get_device_name(i) - if(len(name) > 47): - name = "..." + name[-44:] - row_color = colors.END - sep_color = colors.YELLOW - print(f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}") - row_color = colors.END - sep_color = colors.YELLOW - if(utils.HAS_ACCELERATE): - print(f"{row_color}{colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){colors.END}") - print(f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}") -def device_config(config): - global breakmodel, generator - import breakmodel - n_layers = utils.num_layers(config) - if args.cpu: - breakmodel.gpu_blocks = [0]*n_layers - return - elif(args.breakmodel_gpulayers is not None or (utils.HAS_ACCELERATE and args.breakmodel_disklayers is not None)): - try: - if(not args.breakmodel_gpulayers): - breakmodel.gpu_blocks = [] - else: - breakmodel.gpu_blocks = list(map(int, args.breakmodel_gpulayers.split(','))) - assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() - s = n_layers - for i in range(len(breakmodel.gpu_blocks)): - if(breakmodel.gpu_blocks[i] <= -1): - breakmodel.gpu_blocks[i] = s - break - else: - s -= breakmodel.gpu_blocks[i] - assert sum(breakmodel.gpu_blocks) <= n_layers - n_layers -= sum(breakmodel.gpu_blocks) - if(args.breakmodel_disklayers is not None): - assert args.breakmodel_disklayers <= n_layers - breakmodel.disk_blocks = args.breakmodel_disklayers - n_layers -= args.breakmodel_disklayers - except: - logger.warning("--breakmodel_gpulayers is malformatted. 
Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0.") - breakmodel.gpu_blocks = [n_layers] - n_layers = 0 - elif(args.breakmodel_layers is not None): - breakmodel.gpu_blocks = [n_layers - max(0, min(n_layers, args.breakmodel_layers))] - n_layers -= sum(breakmodel.gpu_blocks) - elif(args.model is not None): - logger.info("Breakmodel not specified, assuming GPU 0") - breakmodel.gpu_blocks = [n_layers] - n_layers = 0 - else: - device_count = torch.cuda.device_count() - if(device_count > 1): - print(colors.CYAN + "\nPlease select one of your GPUs to be your primary GPU.") - print("VRAM usage in your primary GPU will be higher than for your other ones.") - print("It is recommended you make your fastest GPU your primary GPU.") - device_list(n_layers) - while(True): - primaryselect = input("device ID> ") - if(primaryselect.isnumeric() and 0 <= int(primaryselect) < device_count): - breakmodel.primary_device = int(primaryselect) - break - else: - print(f"{colors.RED}Please enter an integer between 0 and {device_count-1}.{colors.END}") - else: - breakmodel.primary_device = 0 - - print(colors.PURPLE + "\nIf you don't have enough VRAM to run the model on a single GPU") - print("you can split the model between your CPU and your GPU(s), or between") - print("multiple GPUs if you have more than one.") - print("By putting more 'layers' on a GPU or CPU, more computations will be") - print("done on that device and more VRAM or RAM will be required on that device") - print("(roughly proportional to number of layers).") - print("It should be noted that GPUs are orders of magnitude faster than the CPU.") - print(f"This model has{colors.YELLOW} {n_layers} {colors.PURPLE}layers.{colors.END}\n") - - for i in range(device_count): - device_list(n_layers, primary=breakmodel.primary_device, selected=i) - print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n") - while(True): - layerselect = input("# of layers> ") - if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers): - layerselect = int(layerselect) - layerselect = n_layers if layerselect == -1 else layerselect - breakmodel.gpu_blocks.append(layerselect) - n_layers -= layerselect - break - else: - print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") - if(n_layers == 0): - break - - if(utils.HAS_ACCELERATE and n_layers > 0): - device_list(n_layers, primary=breakmodel.primary_device, selected=-1) - print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n") - while(True): - layerselect = input("# of layers> ") - if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers): - layerselect = int(layerselect) - layerselect = n_layers if layerselect == -1 else layerselect - breakmodel.disk_blocks = layerselect - n_layers -= layerselect - break - else: - print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") - - logger.init_ok("Final device configuration:", status="Info") - device_list(n_layers, primary=breakmodel.primary_device) - - # If all layers are on the same device, use the old GPU generation mode - while(len(breakmodel.gpu_blocks) and 
breakmodel.gpu_blocks[-1] == 0): - breakmodel.gpu_blocks.pop() - if(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (-1, utils.num_layers(config))): - koboldai_vars.breakmodel = False - koboldai_vars.usegpu = True - koboldai_vars.gpu_device = len(breakmodel.gpu_blocks)-1 - return - - if(not breakmodel.gpu_blocks): - logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") - import breakmodel - breakmodel.primary_device = "cpu" - koboldai_vars.breakmodel = False - koboldai_vars.usegpu = False - return - -def move_model_to_devices(model): - global generator - - if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): - if(koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) - else: - model = model.to('cpu').float() - generator = model.generate - return - - import breakmodel - - if(utils.HAS_ACCELERATE): - import accelerate.utils - for key, value in model.state_dict().items(): - target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - if(value.dtype is not target_dtype): - accelerate.utils.set_module_tensor_to_device(model, key, target_dtype) - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - device_map = {} - for name in utils.layers_module_names: - layer = int(name.rsplit(".", 1)[1]) - device = ("disk" if layer < disk_blocks else "cpu") if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - device_map[name] = device - for name in utils.get_missing_module_names(model, list(device_map.keys())): - device_map[name] = breakmodel.primary_device - breakmodel.dispatch_model_ex(model, device_map, main_device=breakmodel.primary_device, offload_buffers=True, offload_dir="accelerate-disk-cache") - gc.collect() - generator = model.generate - return - - model.half() - gc.collect() - - if(hasattr(model, "transformer")): - model.transformer.wte.to(breakmodel.primary_device) - model.transformer.ln_f.to(breakmodel.primary_device) - if(hasattr(model, 'lm_head')): - model.lm_head.to(breakmodel.primary_device) - if(hasattr(model.transformer, 'wpe')): - model.transformer.wpe.to(breakmodel.primary_device) - elif(not hasattr(model.model, "decoder")): - model.model.embed_tokens.to(breakmodel.primary_device) - model.model.layer_norm.to(breakmodel.primary_device) - model.lm_head.to(breakmodel.primary_device) - model.model.embed_positions.to(breakmodel.primary_device) - else: - model.model.decoder.embed_tokens.to(breakmodel.primary_device) - if(model.model.decoder.project_in is not None): - model.model.decoder.project_in.to(breakmodel.primary_device) - if(model.model.decoder.project_out is not None): - model.model.decoder.project_out.to(breakmodel.primary_device) - model.model.decoder.embed_positions.to(breakmodel.primary_device) - gc.collect() - GPTNeoModel.forward = breakmodel.new_forward_neo - if("GPTJModel" in globals()): - GPTJModel.forward = breakmodel.new_forward_neo # type: ignore - if("XGLMModel" in globals()): - XGLMModel.forward = breakmodel.new_forward_xglm # type: ignore - if("OPTDecoder" in globals()): - OPTDecoder.forward = breakmodel.new_forward_opt # type: ignore - generator = model.generate - if(hasattr(model, "transformer")): - breakmodel.move_hidden_layers(model.transformer) - elif(not hasattr(model.model, "decoder")): - breakmodel.move_hidden_layers(model.model, model.model.layers) - else: - 
breakmodel.move_hidden_layers(model.model.decoder, model.model.decoder.layers) #==================================================================# # Allow the models to override some settings @@ -1962,33 +1740,6 @@ def get_cluster_models(msg): emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True) -# Function to patch transformers to use our soft prompt -def patch_causallm(model): - from torch.nn import Embedding - if(getattr(Embedding, "_koboldai_patch_causallm_model", None)): - Embedding._koboldai_patch_causallm_model = model - return model - old_embedding_call = Embedding.__call__ - def new_embedding_call(self, input_ids, *args, **kwargs): - if(Embedding._koboldai_patch_causallm_model.get_input_embeddings() is not self): - return old_embedding_call(self, input_ids, *args, **kwargs) - assert input_ids is not None - if(koboldai_vars.sp is not None): - shifted_input_ids = input_ids - model.config.vocab_size - input_ids.clamp_(max=model.config.vocab_size-1) - inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs) - if(koboldai_vars.sp is not None): - koboldai_vars.sp = koboldai_vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) - inputs_embeds = torch.where( - (shifted_input_ids >= 0)[..., None], - koboldai_vars.sp[shifted_input_ids.clamp(min=0)], - inputs_embeds, - ) - return inputs_embeds - Embedding.__call__ = new_embedding_call - Embedding._koboldai_patch_causallm_model = model - return model - def patch_transformers_download(): global transformers import copy, requests, tqdm, time @@ -2751,44 +2502,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If transformers model was selected & GPU available, ask to use CPU or GPU - if(koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"] and not koboldai_vars.model.startswith("RWKV")): - koboldai_vars.allowsp = True - # Test for GPU support - - # Make model path the same as the model name to make this consistent with the other loading method if it isn't a known model type - # This code is not just a workaround for below, it is also used to make the behavior consistent with other loading methods - Henk717 - if(not koboldai_vars.model in ["NeoCustom", "GPT2Custom"]): - koboldai_vars.custmodpth = koboldai_vars.model - elif(koboldai_vars.model == "NeoCustom"): - koboldai_vars.model = os.path.basename(os.path.normpath(koboldai_vars.custmodpth)) - - # Get the model_type from the config or assume a model type if it isn't present - from transformers import AutoConfig - if(os.path.isdir(koboldai_vars.custmodpth.replace('/', '_'))): - try: - model_config = AutoConfig.from_pretrained(koboldai_vars.custmodpth.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") - koboldai_vars.model_type = model_config.model_type - except ValueError as e: - koboldai_vars.model_type = "not_found" - elif(os.path.isdir("models/{}".format(koboldai_vars.custmodpth.replace('/', '_')))): - try: - model_config = AutoConfig.from_pretrained("models/{}".format(koboldai_vars.custmodpth.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") - koboldai_vars.model_type = model_config.model_type - except ValueError as e: - koboldai_vars.model_type = "not_found" - else: - try: - model_config = AutoConfig.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - koboldai_vars.model_type = model_config.model_type - 
except ValueError as e: - koboldai_vars.model_type = "not_found" - if(koboldai_vars.model_type == "not_found" and koboldai_vars.model == "NeoCustom"): - koboldai_vars.model_type = "gpt_neo" - elif(koboldai_vars.model_type == "not_found" and koboldai_vars.model == "GPT2Custom"): - koboldai_vars.model_type = "gpt2" - elif(koboldai_vars.model_type == "not_found"): - logger.warning("No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") - koboldai_vars.model_type = "gpt_neo" + # if(koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"] and not koboldai_vars.model.startswith("RWKV")): if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): loadmodelsettings() @@ -2893,363 +2607,30 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass - # Lazy loader - import torch_lazy_loader - def get_lazy_load_callback(n_layers, convert_to_float16=True): - if not koboldai_vars.lazy_load: - return - - from tqdm.auto import tqdm - - global breakmodel - import breakmodel - - if utils.HAS_ACCELERATE: - import accelerate.utils - - if args.breakmodel_disklayers is not None: - breakmodel.disk_blocks = args.breakmodel_disklayers - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - def lazy_load_callback(model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]], f, **_): - if lazy_load_callback.nested: - return - lazy_load_callback.nested = True - - device_map: Dict[str, Union[str, int]] = {} - - @functools.lru_cache(maxsize=None) - def get_original_key(key): - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - - for key, value in model_dict.items(): - original_key = get_original_key(key) - if isinstance(value, torch_lazy_loader.LazyTensor) and not any(original_key.startswith(n) for n in utils.layers_module_names): - device_map[key] = koboldai_vars.gpu_device if koboldai_vars.hascuda and koboldai_vars.usegpu else "cpu" if not koboldai_vars.hascuda or not koboldai_vars.breakmodel else breakmodel.primary_device - else: - layer = int(max((n for n in utils.layers_module_names if original_key.startswith(n)), key=len).rsplit(".", 1)[1]) - device = koboldai_vars.gpu_device if koboldai_vars.hascuda and koboldai_vars.usegpu else "disk" if layer < disk_blocks and layer < ram_blocks else "cpu" if not koboldai_vars.hascuda or not koboldai_vars.breakmodel else "shared" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - device_map[key] = device - - if utils.num_shards is None or utils.current_shard == 0: - utils.offload_index = {} - if utils.HAS_ACCELERATE: - if os.path.isdir("accelerate-disk-cache"): - # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder - # (the folder doesn't contain any subfolders so os.remove will do just fine) - for filename in os.listdir("accelerate-disk-cache"): - try: - os.remove(os.path.join("accelerate-disk-cache", filename)) - except OSError: - pass - os.makedirs("accelerate-disk-cache", exist_ok=True) - if utils.num_shards is 
not None: - num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) - else: - num_tensors = len(device_map) - print(flush=True) - koboldai_vars.status_message = "Loading model" - koboldai_vars.total_layers = num_tensors - koboldai_vars.loaded_layers = 0 - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio()) - - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0] - f = None - current_offset = 0 - able_to_pin_layers = True - if utils.num_shards is not None: - utils.current_shard += 1 - for key in sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): - storage_key = model_dict[key].key - if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset: - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - try: - f = z.open(f"archive/data/{storage_key}") - except: - f = z.open(f"{zipfolder}/data/{storage_key}") - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset - device = device_map[key] - size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1) - dtype = model_dict[key].dtype - nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) - #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) - model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): - model_dict[key] = model_dict[key].to(torch.float32) - if device == "shared": - model_dict[key] = model_dict[key].to("cpu").detach_() - if able_to_pin_layers and utils.HAS_ACCELERATE: - try: - model_dict[key] = model_dict[key].pin_memory() - except: - able_to_pin_layers = False - elif device == "disk": - accelerate.utils.offload_weight(model_dict[key], get_original_key(key), "accelerate-disk-cache", index=utils.offload_index) - model_dict[key] = model_dict[key].to("meta") - else: - model_dict[key] = model_dict[key].to(device) - #print("OK", flush=True) - current_offset += nbytes - utils.bar.update(1) - koboldai_vars.loaded_layers += 1 - finally: - if utils.num_shards is None or utils.current_shard >= utils.num_shards: - if utils.offload_index: - for name, tensor in utils.named_buffers: - dtype = tensor.dtype - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if 
name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) - accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") - utils.bar.close() - utils.bar = None - koboldai_vars.status_message = "" - lazy_load_callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() - - lazy_load_callback.nested = False - return lazy_load_callback - - - def maybe_low_cpu_mem_usage() -> Dict[str, Any]: - if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")): - logger.warning(f"Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.") - return {} - return {"low_cpu_mem_usage": True} - - @contextlib.contextmanager - def maybe_use_float16(always_use=False): - if(always_use or (koboldai_vars.hascuda and args.lowmem and (koboldai_vars.usegpu or koboldai_vars.breakmodel))): - original_dtype = torch.get_default_dtype() - torch.set_default_dtype(torch.float16) - yield True - torch.set_default_dtype(original_dtype) - else: - yield False - - # If custom GPT2 model was chosen - if(koboldai_vars.model_type == "gpt2"): - koboldai_vars.lazy_load = False - if os.path.exists(koboldai_vars.custmodpth): - model_config = json.load(open(koboldai_vars.custmodpth + "/config.json", "r")) - elif os.path.exists(os.path.join("models/", koboldai_vars.custmodpth)): - config_path = os.path.join("models/", koboldai_vars.custmodpth) - config_path = os.path.join(config_path, "config.json").replace("\\", "//") - model_config = json.load(open(config_path, "r")) - with(maybe_use_float16()): - try: - if os.path.exists(koboldai_vars.custmodpth): - model = GPT2LMHeadModel.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - elif os.path.exists(os.path.join("models/", koboldai_vars.custmodpth)): - model = GPT2LMHeadModel.from_pretrained(os.path.join("models/", koboldai_vars.custmodpth), revision=koboldai_vars.revision, cache_dir="cache") - tokenizer = GPT2Tokenizer.from_pretrained(os.path.join("models/", koboldai_vars.custmodpth), revision=koboldai_vars.revision, cache_dir="cache") - else: - model = GPT2LMHeadModel.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - raise e - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") - tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) - koboldai_vars.modeldim = get_hidden_size_from_model(model) - # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU - if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) - generator = model.generate - else: - model = model.to('cpu').float() - generator = model.generate - patch_causallm(model) - # Use the Generic implementation + if koboldai_vars.model_type == "gpt2": + model = CustomGPT2HFTorchInferenceModel( + koboldai_vars.model, + low_mem=args.lowmem + ) + model._load( + save_model=not (args.colab or args.cacheonly) or args.savemodel + ) else: - lowmem = maybe_low_cpu_mem_usage() - # We must disable low_cpu_mem_usage (by setting lowmem to {}) if - # using a GPT-2 model because GPT-2 is not compatible with this - # feature yet - if(koboldai_vars.model_type == "gpt2"): - lowmem = {} - koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models - - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: - device_config(model_config) - - # Download model from Huggingface if it does not exist, otherwise load locally - - #If we specify a model and it's in the root directory, we need to move it to the models directory (legacy folder structure to new) - if os.path.isdir(koboldai_vars.model.replace('/', '_')): - import shutil - shutil.move(koboldai_vars.model.replace('/', '_'), "models/{}".format(koboldai_vars.model.replace('/', '_'))) - if(koboldai_vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): - try: - metamodel = AutoModelForCausalLM.from_config(model_config) - except Exception as e: - metamodel = GPTNeoForCausalLM.from_config(model_config) - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True): - if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - lowmem = {} - if(os.path.isdir(koboldai_vars.custmodpth)): - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, 
revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): - try: - tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - else: - old_rebuild_tensor = torch._utils._rebuild_tensor - def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): - if(not isinstance(storage, torch_lazy_loader.LazyTensor)): - dtype = storage.dtype - else: - dtype = storage.storage_type.dtype - if(not isinstance(dtype, torch.dtype)): - dtype = storage.storage_type(0).dtype - if(dtype is torch.float32 and len(shape) >= 2): - koboldai_vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) - torch._utils._rebuild_tensor = new_rebuild_tensor - - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - - torch._utils._rebuild_tensor = old_rebuild_tensor - - if not (args.colab or args.cacheonly) or args.savemodel: - import shutil - tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) - if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case - model = model.half() - model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") - else: # For 
fp16 models, we can just copy the model files directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - import huggingface_hub - legacy = packaging.version.parse(transformers_version) < packaging.version.parse("4.22.0.dev0") - # Save the config.json - shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(koboldai_vars.model, transformers.configuration_utils.CONFIG_NAME, revision=koboldai_vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(koboldai_vars.model.replace('/', '_')), transformers.configuration_utils.CONFIG_NAME)) - if(utils.num_shards is None): - # Save the pytorch_model.bin of an unsharded model - try: - shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(koboldai_vars.model, transformers.modeling_utils.WEIGHTS_NAME, revision=koboldai_vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(koboldai_vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_NAME)) - except: - shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(koboldai_vars.model, "model.safetensors", revision=koboldai_vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(koboldai_vars.model.replace('/', '_')), "model.safetensors")) - else: - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move(os.path.realpath(utils.from_pretrained_index_filename), os.path.join("models/{}".format(koboldai_vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_INDEX_NAME)) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: - shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(koboldai_vars.model, filename, revision=koboldai_vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(koboldai_vars.model.replace('/', '_')), filename)) - shutil.rmtree("cache/") - - if(koboldai_vars.badwordsids is koboldai_settings.badwordsids_default and koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")): - koboldai_vars.badwordsids = [[v] for k, v in tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]") if koboldai_vars.newlinemode != "s" or str(k) != ""] - - patch_causallm(model) - - if(koboldai_vars.hascuda): - if(koboldai_vars.usegpu): - koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(koboldai_vars.gpu_device) - generator = model.generate - elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) - koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load): - device_config(model.config) - move_model_to_devices(model) - elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) - koboldai_vars.modeldim = get_hidden_size_from_model(model) - generator = model.generate - else: - model = model.to('cpu').float() - koboldai_vars.modeldim = get_hidden_size_from_model(model) - generator = model.generate - elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) - koboldai_vars.modeldim = get_hidden_size_from_model(model) - generator = model.generate - else: - model.to('cpu').float() - koboldai_vars.modeldim = 
get_hidden_size_from_model(model) - generator = model.generate + model = GenericHFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) + model._load( + save_model=not (args.colab or args.cacheonly) or args.savemodel + ) + # TODO: Convert everywhere to use model.tokenizer + tokenizer = model.tokenizer + # Suppress Author's Note by flagging square brackets (Old implementation) #vocab = tokenizer.get_vocab() #vocab_keys = vocab.keys() @@ -5492,7 +4873,7 @@ def core_generate(text: list, _min: int, _max: int, found_entries: set, is_core: found_entries = found_entries or set() if model: - model.kai_scanner_excluded_world_info = found_entries + model.model.kai_scanner_excluded_world_info = found_entries koboldai_vars._prompt = koboldai_vars.prompt @@ -5833,7 +5214,8 @@ def torch_raw_generate( with torch.no_grad(): start_time = time.time() - genout = generator( + # HACK: raw_generate functions should be in the model itself + genout = model.model.generate( gen_in, do_sample=True, max_length=min(len(prompt_tokens) + max_new, koboldai_vars.max_length), diff --git a/model.py b/model.py new file mode 100644 index 00000000..3acb458d --- /dev/null +++ b/model.py @@ -0,0 +1,950 @@ +# TODO: +# - Intertwine stoppers and streaming and such +# - Add raw_generate functions to this +# - Support TPU +# - Support APIs +# - Support RWKV + +import bisect +import gc +import shutil +import contextlib +import functools +import itertools +import json +import os +import traceback +import zipfile +import utils +import breakmodel + +import torch +from torch.nn import Embedding + +from tqdm.auto import tqdm +from logger import logger +import torch_lazy_loader +from typing import Dict, List, Optional, Union +from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification, AutoConfig + +# Previously under condition HAS_ACCELERATE, but I'm quite sure accelerate +# is now a dependency.
+import accelerate.utils + +import koboldai_settings + +class InferenceModel: + def __init__(self) -> None: + self.gen_config = {} + self.token_gen_hooks = [] + + def generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new_tokens: int, + do_streaming: bool = False, + do_dynamic_wi: bool = False, + single_line: bool = False, + batch_count: int = 1, + ) -> torch.Tensor: + raise NotImplementedError("generate() was not overridden") + + def _post_token_gen(self, input_ids: torch.LongTensor) -> None: + for hook in self.token_gen_hooks: + hook(input_ids) + + +class HFTorchInferenceModel(InferenceModel): + def __init__( + self, + model_name: str, + lazy_load: bool, + low_mem: bool, + ) -> None: + super().__init__() + + self.model_name = model_name + self.lazy_load = lazy_load + self.low_mem = low_mem + + self.model = None + self.tokenizer = None + self.model_config = None + + def generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new_tokens: int, + do_streaming: bool = False, + do_dynamic_wi: bool = False, + single_line: bool = False, + batch_count: int = 1, + ) -> torch.Tensor: + # TODO: actual generation is not implemented here yet; the gen_config bookkeeping below is unreachable until this stub is filled in. + raise NotImplementedError("generate() is not implemented for HFTorchInferenceModel yet") + + self.gen_config = { + "do_streaming": do_streaming, + "do_dynamic_wi": do_dynamic_wi, + "stop_at_genamt": do_dynamic_wi, + } + + def _get_model(self, location: str, tf_kwargs: Dict): + try: + return AutoModelForCausalLM.from_pretrained( + location, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + **tf_kwargs + ) + except Exception as e: + if "out of memory" in traceback.format_exc().lower(): + raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") + return GPTNeoForCausalLM.from_pretrained( + location, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + **tf_kwargs + ) + + def _get_tokenizer(self, location: str): + std_kwargs = {"revision": utils.koboldai_vars.revision, "cache_dir": "cache"} + + suppliers = [ + # Fast tokenizer disabled by default as per HF docs: + # > Note: Make sure to pass use_fast=False when loading + # OPT’s tokenizer with AutoTokenizer to get the correct + # tokenizer. + lambda: AutoTokenizer.from_pretrained(location, use_fast=False, **std_kwargs), + lambda: AutoTokenizer.from_pretrained(location, **std_kwargs), + + # Fallback to GPT2Tokenizer + lambda: GPT2Tokenizer.from_pretrained(location, **std_kwargs), + lambda: GPT2Tokenizer.from_pretrained("gpt2", **std_kwargs), + ] + + for i, try_get_tokenizer in enumerate(suppliers): + try: + return try_get_tokenizer() + except Exception as e: + # If every attempt fails, raise the error from the last fallback + if i == len(suppliers) - 1: + raise e + + def get_local_model_path( + self, + legacy: bool = False, + ignore_existance: bool = False + ) -> Optional[str]: + """ + Returns a string of the model's path locally, or None if it is not downloaded. + If ignore_existance is true, it will always return a path.
+ """ + + basename = utils.koboldai_vars.model.replace("/", "_") + if legacy: + ret = basename + else: + ret = os.path.join("models", basename) + + if os.path.isdir(ret) or ignore_existance: + return ret + return None + + + def get_hidden_size(self) -> int: + return self.model.get_input_embeddings().embedding_dim + + + def _move_to_devices(self) -> None: + if not utils.koboldai_vars.breakmodel: + if utils.koboldai_vars.usegpu: + self.model = self.model.half().to(utils.koboldai_vars.gpu_device) + else: + self.model = self.model.to('cpu').float() + return + + for key, value in self.model.state_dict().items(): + target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 + if value.dtype is not target_dtype: + accelerate.utils.set_module_tensor_to_device(self.model, key, target_dtype) + + disk_blocks = breakmodel.disk_blocks + gpu_blocks = breakmodel.gpu_blocks + ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks) + cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) + device_map = {} + + for name in utils.layers_module_names: + layer = int(name.rsplit(".", 1)[1]) + device = ("disk" if layer < disk_blocks else "cpu") if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + device_map[name] = device + + for name in utils.get_missing_module_names(self.model, list(device_map.keys())): + device_map[name] = breakmodel.primary_device + + breakmodel.dispatch_model_ex( + self.model, + device_map, + main_device=breakmodel.primary_device, + offload_buffers=True, + offload_dir="accelerate-disk-cache" + ) + + gc.collect() + return + + # == Old non-accelerate stuff + # model.half() + # gc.collect() + + # if(hasattr(model, "transformer")): + # model.transformer.wte.to(breakmodel.primary_device) + # model.transformer.ln_f.to(breakmodel.primary_device) + # if(hasattr(model, 'lm_head')): + # model.lm_head.to(breakmodel.primary_device) + # if(hasattr(model.transformer, 'wpe')): + # model.transformer.wpe.to(breakmodel.primary_device) + # elif(not hasattr(model.model, "decoder")): + # model.model.embed_tokens.to(breakmodel.primary_device) + # model.model.layer_norm.to(breakmodel.primary_device) + # model.lm_head.to(breakmodel.primary_device) + # model.model.embed_positions.to(breakmodel.primary_device) + # else: + # model.model.decoder.embed_tokens.to(breakmodel.primary_device) + # if(model.model.decoder.project_in is not None): + # model.model.decoder.project_in.to(breakmodel.primary_device) + # if(model.model.decoder.project_out is not None): + # model.model.decoder.project_out.to(breakmodel.primary_device) + # model.model.decoder.embed_positions.to(breakmodel.primary_device) + # gc.collect() + # GPTNeoModel.forward = breakmodel.new_forward_neo + # if("GPTJModel" in globals()): + # GPTJModel.forward = breakmodel.new_forward_neo # type: ignore + # if("XGLMModel" in globals()): + # XGLMModel.forward = breakmodel.new_forward_xglm # type: ignore + # if("OPTDecoder" in globals()): + # OPTDecoder.forward = breakmodel.new_forward_opt # type: ignore + # generator = model.generate + # if(hasattr(model, "transformer")): + # breakmodel.move_hidden_layers(model.transformer) + # elif(not hasattr(model.model, "decoder")): + # breakmodel.move_hidden_layers(model.model, model.model.layers) + # else: + # breakmodel.move_hidden_layers(model.model.decoder, model.model.decoder.layers) + + # Function to patch transformers to use our soft prompt + def patch_embedding(self) -> None: + if getattr(Embedding, "_koboldai_patch_causallm_model", None): + 
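+ # Embedding.__call__ only needs to be patched once per process; if the hook is already installed, just retarget it at the newly loaded model so soft prompts keep working.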
Embedding._koboldai_patch_causallm_model = self.model + return + + old_embedding_call = Embedding.__call__ + + kai_model = self + def new_embedding_call(self, input_ids, *args, **kwargs): + # Don't touch embeddings for models other than the core inference model (that's us!) + if Embedding._koboldai_patch_causallm_model.get_input_embeddings() is not self: + return old_embedding_call(self, input_ids, *args, **kwargs) + + assert input_ids is not None + + if utils.koboldai_vars.sp is not None: + shifted_input_ids = input_ids - kai_model.model.config.vocab_size + + input_ids.clamp_(max=kai_model.model.config.vocab_size - 1) + inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs) + + if utils.koboldai_vars.sp is not None: + utils.koboldai_vars.sp = utils.koboldai_vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) + inputs_embeds = torch.where( + (shifted_input_ids >= 0)[..., None], + utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)], + inputs_embeds, + ) + + return inputs_embeds + + Embedding.__call__ = new_embedding_call + Embedding._koboldai_patch_causallm_model = self.model + + + def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): + if not self.lazy_load: + return + + if utils.args.breakmodel_disklayers is not None: + breakmodel.disk_blocks = utils.args.breakmodel_disklayers + + disk_blocks = breakmodel.disk_blocks + gpu_blocks = breakmodel.gpu_blocks + ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) + cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) + + def lazy_load_callback( + model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]], + f, + **_, + ): + if lazy_load_callback.nested: + return + lazy_load_callback.nested = True + + device_map: Dict[str, Union[str, int]] = {} + + @functools.lru_cache(maxsize=None) + def get_original_key(key): + return max( + ( + original_key + for original_key in utils.module_names + if original_key.endswith(key) + ), + key=len, + ) + + for key, value in model_dict.items(): + original_key = get_original_key(key) + if isinstance(value, torch_lazy_loader.LazyTensor) and not any( + original_key.startswith(n) for n in utils.layers_module_names + ): + device_map[key] = ( + utils.koboldai_vars.gpu_device + if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu + else "cpu" + if not utils.koboldai_vars.hascuda or not utils.koboldai_vars.breakmodel + else breakmodel.primary_device + ) + else: + layer = int( + max( + ( + n + for n in utils.layers_module_names + if original_key.startswith(n) + ), + key=len, + ).rsplit(".", 1)[1] + ) + device = ( + utils.koboldai_vars.gpu_device + if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu + else "disk" + if layer < disk_blocks and layer < ram_blocks + else "cpu" + if not utils.koboldai_vars.hascuda or not utils.koboldai_vars.breakmodel + else "shared" + if layer < ram_blocks + else bisect.bisect_right( + cumulative_gpu_blocks, layer - ram_blocks + ) + ) + device_map[key] = device + + if utils.num_shards is None or utils.current_shard == 0: + utils.offload_index = {} + if utils.HAS_ACCELERATE: + if os.path.isdir("accelerate-disk-cache"): + # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder + # (the folder doesn't contain any subfolders so os.remove will do just fine) + for filename in os.listdir("accelerate-disk-cache"): + try: + os.remove( + os.path.join("accelerate-disk-cache", filename) + ) + except OSError: + pass + 
os.makedirs("accelerate-disk-cache", exist_ok=True) + if utils.num_shards is not None: + num_tensors = len( + utils.get_sharded_checkpoint_num_tensors( + utils.from_pretrained_model_name, + utils.from_pretrained_index_filename, + **utils.from_pretrained_kwargs, + ) + ) + else: + num_tensors = len(device_map) + print(flush=True) + utils.koboldai_vars.status_message = "Loading model" + utils.koboldai_vars.total_layers = num_tensors + utils.koboldai_vars.loaded_layers = 0 + utils.bar = tqdm( + total=num_tensors, + desc="Loading model tensors", + file=utils.UIProgressBarFile(), + ) + + with zipfile.ZipFile(f, "r") as z: + try: + last_storage_key = None + zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0] + f = None + current_offset = 0 + able_to_pin_layers = True + if utils.num_shards is not None: + utils.current_shard += 1 + for key in sorted( + device_map.keys(), + key=lambda k: (model_dict[k].key, model_dict[k].seek_offset), + ): + storage_key = model_dict[key].key + if ( + storage_key != last_storage_key + or model_dict[key].seek_offset < current_offset + ): + last_storage_key = storage_key + if isinstance(f, zipfile.ZipExtFile): + f.close() + try: + f = z.open(f"archive/data/{storage_key}") + except: + f = z.open(f"{zipfolder}/data/{storage_key}") + current_offset = 0 + if current_offset != model_dict[key].seek_offset: + f.read(model_dict[key].seek_offset - current_offset) + current_offset = model_dict[key].seek_offset + device = device_map[key] + size = functools.reduce( + lambda x, y: x * y, model_dict[key].shape, 1 + ) + dtype = model_dict[key].dtype + nbytes = ( + size + if dtype is torch.bool + else size + * ( + ( + torch.finfo + if dtype.is_floating_point + else torch.iinfo + )(dtype).bits + >> 3 + ) + ) + # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) + model_dict[key] = model_dict[key].materialize( + f, map_location="cpu" + ) + if model_dict[key].dtype is torch.float32: + utils.koboldai_vars.fp32_model = True + if ( + convert_to_float16 + and breakmodel.primary_device != "cpu" + and utils.koboldai_vars.hascuda + and (utils.koboldai_vars.breakmodel or utils.koboldai_vars.usegpu) + and model_dict[key].dtype is torch.float32 + ): + model_dict[key] = model_dict[key].to(torch.float16) + if breakmodel.primary_device == "cpu" or ( + not utils.koboldai_vars.usegpu + and not utils.koboldai_vars.breakmodel + and model_dict[key].dtype is torch.float16 + ): + model_dict[key] = model_dict[key].to(torch.float32) + if device == "shared": + model_dict[key] = model_dict[key].to("cpu").detach_() + if able_to_pin_layers and utils.HAS_ACCELERATE: + try: + model_dict[key] = model_dict[key].pin_memory() + except: + able_to_pin_layers = False + elif device == "disk": + accelerate.utils.offload_weight( + model_dict[key], + get_original_key(key), + "accelerate-disk-cache", + index=utils.offload_index, + ) + model_dict[key] = model_dict[key].to("meta") + else: + model_dict[key] = model_dict[key].to(device) + # print("OK", flush=True) + current_offset += nbytes + utils.bar.update(1) + utils.koboldai_vars.loaded_layers += 1 + finally: + if ( + utils.num_shards is None + or utils.current_shard >= utils.num_shards + ): + if utils.offload_index: + for name, tensor in utils.named_buffers: + dtype = tensor.dtype + if ( + convert_to_float16 + and breakmodel.primary_device != "cpu" + and utils.koboldai_vars.hascuda + and ( + utils.koboldai_vars.breakmodel or utils.koboldai_vars.usegpu + ) + ): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or ( + not utils.koboldai_vars.usegpu + and not utils.koboldai_vars.breakmodel + ): + dtype = torch.float32 + if ( + name in model_dict + and model_dict[name].dtype is not dtype + ): + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not in utils.offload_index: + accelerate.utils.offload_weight( + tensor, + name, + "accelerate-disk-cache", + index=utils.offload_index, + ) + accelerate.utils.save_offload_index( + utils.offload_index, "accelerate-disk-cache" + ) + utils.bar.close() + utils.bar = None + utils.koboldai_vars.status_message = "" + lazy_load_callback.nested = False + if isinstance(f, zipfile.ZipExtFile): + f.close() + + lazy_load_callback.nested = False + return lazy_load_callback + + @contextlib.contextmanager + def _maybe_use_float16(self, always_use: bool = False): + if always_use or (utils.koboldai_vars.hascuda and self.low_mem and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)): + original_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + yield True + torch.set_default_dtype(original_dtype) + else: + yield False + + def breakmodel_device_list(self, n_layers, primary=None, selected=None): + # TODO: Find a better place for this or rework this + + # HACK: Tttttttterrrible structure_hack + class colors: + PURPLE = '\033[95m' + BLUE = '\033[94m' + CYAN = '\033[96m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + END = '\033[0m' + UNDERLINE = '\033[4m' + + device_count = torch.cuda.device_count() + if(device_count < 2): + primary = None + gpu_blocks = breakmodel.gpu_blocks + (device_count - len(breakmodel.gpu_blocks))*[0] + print(f"{colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{colors.END}") + for i in range(device_count): + name = torch.cuda.get_device_name(i) 
+ if(len(name) > 47): + name = "..." + name[-44:] + row_color = colors.END + sep_color = colors.YELLOW + print(f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}") + row_color = colors.END + sep_color = colors.YELLOW + if(utils.HAS_ACCELERATE): + print(f"{row_color}{colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){colors.END}") + print(f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}") + + def breakmodel_device_config(self, config): + # TODO: Find a better place for this or rework this + + # HACK: Tttttttterrrible structure_hack + class colors: + PURPLE = '\033[95m' + BLUE = '\033[94m' + CYAN = '\033[96m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + END = '\033[0m' + UNDERLINE = '\033[4m' + + global breakmodel, generator + import breakmodel + n_layers = utils.num_layers(config) + + if utils.args.cpu: + breakmodel.gpu_blocks = [0]*n_layers + return + + elif(utils.args.breakmodel_gpulayers is not None or (utils.HAS_ACCELERATE and utils.args.breakmodel_disklayers is not None)): + try: + if(not utils.args.breakmodel_gpulayers): + breakmodel.gpu_blocks = [] + else: + breakmodel.gpu_blocks = list(map(int, utils.args.breakmodel_gpulayers.split(','))) + assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() + s = n_layers + for i in range(len(breakmodel.gpu_blocks)): + if(breakmodel.gpu_blocks[i] <= -1): + breakmodel.gpu_blocks[i] = s + break + else: + s -= breakmodel.gpu_blocks[i] + assert sum(breakmodel.gpu_blocks) <= n_layers + n_layers -= sum(breakmodel.gpu_blocks) + if(utils.args.breakmodel_disklayers is not None): + assert utils.args.breakmodel_disklayers <= n_layers + breakmodel.disk_blocks = utils.args.breakmodel_disklayers + n_layers -= utils.args.breakmodel_disklayers + except: + logger.warning("--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. 
Defaulting to all layers on device 0.") + breakmodel.gpu_blocks = [n_layers] + n_layers = 0 + elif(utils.args.breakmodel_layers is not None): + breakmodel.gpu_blocks = [n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))] + n_layers -= sum(breakmodel.gpu_blocks) + elif(utils.args.model is not None): + logger.info("Breakmodel not specified, assuming GPU 0") + breakmodel.gpu_blocks = [n_layers] + n_layers = 0 + else: + device_count = torch.cuda.device_count() + if(device_count > 1): + print(colors.CYAN + "\nPlease select one of your GPUs to be your primary GPU.") + print("VRAM usage in your primary GPU will be higher than for your other ones.") + print("It is recommended you make your fastest GPU your primary GPU.") + self.breakmodel_device_list(n_layers) + while(True): + primaryselect = input("device ID> ") + if(primaryselect.isnumeric() and 0 <= int(primaryselect) < device_count): + breakmodel.primary_device = int(primaryselect) + break + else: + print(f"{colors.RED}Please enter an integer between 0 and {device_count-1}.{colors.END}") + else: + breakmodel.primary_device = 0 + + print(colors.PURPLE + "\nIf you don't have enough VRAM to run the model on a single GPU") + print("you can split the model between your CPU and your GPU(s), or between") + print("multiple GPUs if you have more than one.") + print("By putting more 'layers' on a GPU or CPU, more computations will be") + print("done on that device and more VRAM or RAM will be required on that device") + print("(roughly proportional to number of layers).") + print("It should be noted that GPUs are orders of magnitude faster than the CPU.") + print(f"This model has{colors.YELLOW} {n_layers} {colors.PURPLE}layers.{colors.END}\n") + + for i in range(device_count): + self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device, selected=i) + print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n") + while(True): + layerselect = input("# of layers> ") + if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers): + layerselect = int(layerselect) + layerselect = n_layers if layerselect == -1 else layerselect + breakmodel.gpu_blocks.append(layerselect) + n_layers -= layerselect + break + else: + print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") + if(n_layers == 0): + break + + if(utils.HAS_ACCELERATE and n_layers > 0): + self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device, selected=-1) + print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n") + while(True): + layerselect = input("# of layers> ") + if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers): + layerselect = int(layerselect) + layerselect = n_layers if layerselect == -1 else layerselect + breakmodel.disk_blocks = layerselect + n_layers -= layerselect + break + else: + print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") + + logger.init_ok("Final device configuration:", status="Info") + self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device) + + # If all layers are on the same device, use the old GPU generation mode + 
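+ # Trailing GPU entries with zero layers are stripped first, so the check below looks at the last GPU that actually received layers.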
+        while(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0):
+            breakmodel.gpu_blocks.pop()
+        if(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (-1, utils.num_layers(config))):
+            utils.koboldai_vars.breakmodel = False
+            utils.koboldai_vars.usegpu = True
+            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks)-1
+            return
+
+        if(not breakmodel.gpu_blocks):
+            logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
+            import breakmodel
+            breakmodel.primary_device = "cpu"
+            utils.koboldai_vars.breakmodel = False
+            utils.koboldai_vars.usegpu = False
+            return
+
+
+class GenericHFTorchInferenceModel(HFTorchInferenceModel):
+    def _load(self, save_model: bool) -> None:
+        utils.koboldai_vars.allowsp = True
+
+        # Make model path the same as the model name to make this consistent
+        # with the other loading method if it isn't a known model type. This
+        # code is not just a workaround for below, it is also used to make the
+        # behavior consistent with other loading methods - Henk717
+        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
+        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model
+
+        if utils.koboldai_vars.model == "NeoCustom":
+            utils.koboldai_vars.model = os.path.basename(os.path.normpath(utils.koboldai_vars.custmodpth))
+
+        # If we specify a model and it's in the root directory, we need to move
+        # it to the models directory (legacy folder structure to new)
+        if self.get_local_model_path(legacy=True):
+            shutil.move(
+                self.get_local_model_path(legacy=True, ignore_existance=True),
+                self.get_local_model_path(ignore_existance=True)
+            )
+
+        # Get the model_type from the config or assume a model type if it isn't present
+        try:
+            model_config = AutoConfig.from_pretrained(self.get_local_model_path() or utils.koboldai_vars.model, revision=utils.koboldai_vars.revision, cache_dir="cache")
+            utils.koboldai_vars.model_type = model_config.model_type
+        except ValueError as e:
+            utils.koboldai_vars.model_type = {
+                "NeoCustom": "gpt_neo",
+                "GPT2Custom": "gpt2",
+            }.get(utils.koboldai_vars.model)
+
+            if not utils.koboldai_vars.model_type:
+                logger.warning("No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)")
+                utils.koboldai_vars.model_type = "gpt_neo"
+
+        tf_kwargs = {
+            "low_cpu_mem_usage": True,
+        }
+
+        if utils.koboldai_vars.model_type == "gpt2":
+            # We must disable low_cpu_mem_usage if using a GPT-2 model
+            # because GPT-2 is not compatible with this feature yet.
+            tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            # Also, lazy loader doesn't support GPT-2 models
+            utils.koboldai_vars.lazy_load = False
+
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        if utils.koboldai_vars.lazy_load and utils.koboldai_vars.hascuda and utils.koboldai_vars.breakmodel and not utils.koboldai_vars.nobreakmodel:
+            self.breakmodel_device_config(model_config)
+
+        if utils.koboldai_vars.lazy_load:
+            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+            with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True):
+                try:
+                    metamodel = AutoModelForCausalLM.from_config(model_config)
+                except Exception as e:
+                    metamodel = GPTNeoForCausalLM.from_config(model_config)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+
+        # Download model from Huggingface if it does not exist, otherwise load locally
+        with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(
+            enable=utils.koboldai_vars.lazy_load,
+            callback=self._get_lazy_load_callback(utils.num_layers(model_config)) if utils.koboldai_vars.lazy_load else None,
+            dematerialized_modules=True
+        ):
+            if utils.koboldai_vars.lazy_load:
+                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+                tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+
+            if self.get_local_model_path():
+                # Model is stored locally, load it.
+                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            else:
+                # Model not stored locally, we need to download it.
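+                # _get_model() is given the model ID rather than a local path here, so
+                # the checkpoint is pulled from Hugging Face into the local "cache"
+                # directory; when save_model is set, the downloaded files are re-saved
+                # (or moved) into the local models folder further below.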
+
+                # _rebuild_tensor patch for casting dtype and supporting LazyTensors
+                old_rebuild_tensor = torch._utils._rebuild_tensor
+                def new_rebuild_tensor(
+                    storage: Union[torch_lazy_loader.LazyTensor, torch.Storage],
+                    storage_offset,
+                    shape,
+                    stride
+                ):
+                    if not isinstance(storage, torch_lazy_loader.LazyTensor):
+                        dtype = storage.dtype
+                    else:
+                        dtype = storage.storage_type.dtype
+                        if not isinstance(dtype, torch.dtype):
+                            dtype = storage.storage_type(0).dtype
+                    if dtype is torch.float32 and len(shape) >= 2:
+                        utils.koboldai_vars.fp32_model = True
+                    return old_rebuild_tensor(storage, storage_offset, shape, stride)
+
+                torch._utils._rebuild_tensor = new_rebuild_tensor
+                self.model = self._get_model(utils.koboldai_vars.model, tf_kwargs)
+                torch._utils._rebuild_tensor = old_rebuild_tensor
+
+                if save_model:
+                    self.tokenizer.save_pretrained(self.get_local_model_path(ignore_existance=True))
+
+                    if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks:
+                        # Use save_pretrained to convert fp32 models to fp16,
+                        # unless we are using disk cache because save_pretrained
+                        # is not supported in that case
+                        self.model = self.model.half()
+                        self.model.save_pretrained(self.get_local_model_path(ignore_existance=True), max_shard_size="500MiB")
+
+                    else:
+                        # For fp16 models, we can just copy the model files directly
+                        import transformers.configuration_utils
+                        import transformers.modeling_utils
+                        import transformers.file_utils
+                        import huggingface_hub
+
+                        legacy = packaging.version.parse(transformers_version) < packaging.version.parse("4.22.0.dev0")
+                        # Save the config.json
+                        shutil.move(
+                            os.path.realpath(huggingface_hub.hf_hub_download(
+                                utils.koboldai_vars.model,
+                                transformers.configuration_utils.CONFIG_NAME,
+                                revision=utils.koboldai_vars.revision,
+                                cache_dir="cache",
+                                local_files_only=True,
+                                legacy_cache_layout=legacy
+                            )),
+                            os.path.join(
+                                self.get_local_model_path(ignore_existance=True),
+                                transformers.configuration_utils.CONFIG_NAME
+                            )
+                        )
+
+                        if utils.num_shards is None:
+                            # Save the pytorch_model.bin or model.safetensors of an unsharded model
+                            for possible_weight_name in [transformers.modeling_utils.WEIGHTS_NAME, "model.safetensors"]:
+                                try:
+                                    shutil.move(
+                                        os.path.realpath(huggingface_hub.hf_hub_download(
+                                            utils.koboldai_vars.model,
+                                            possible_weight_name,
+                                            revision=utils.koboldai_vars.revision,
+                                            cache_dir="cache",
+                                            local_files_only=True,
+                                            legacy_cache_layout=legacy
+                                        )),
+                                        os.path.join(
+                                            self.get_local_model_path(ignore_existance=True),
+                                            possible_weight_name,
+                                        )
+                                    )
+                                except Exception as e:
+                                    if possible_weight_name == "model.safetensors":
+                                        raise e
+                        else:
+                            # Handle saving sharded models
+
+                            with open(utils.from_pretrained_index_filename) as f:
+                                map_data = json.load(f)
+                            filenames = set(map_data["weight_map"].values())
+                            # Save the pytorch_model.bin.index.json of a sharded model
+                            shutil.move(
+                                os.path.realpath(utils.from_pretrained_index_filename),
+                                os.path.join(
+                                    self.get_local_model_path(ignore_existance=True),
+                                    transformers.modeling_utils.WEIGHTS_INDEX_NAME
+                                )
+                            )
+                            # Then save the pytorch_model-#####-of-#####.bin files
+                            for filename in filenames:
+                                shutil.move(
+                                    os.path.realpath(huggingface_hub.hf_hub_download(
+                                        utils.koboldai_vars.model,
+                                        filename,
+                                        revision=utils.koboldai_vars.revision,
+                                        cache_dir="cache",
+                                        local_files_only=True,
+                                        legacy_cache_layout=legacy
+                                    )),
+                                    os.path.join(
+                                        self.get_local_model_path(ignore_existance=True),
+                                        filename
+                                    )
+                                )
+                        shutil.rmtree("cache/")
+
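+        # If the default bad-words list is still in use and this isn't a GPT-2/Neo/J
+        # model, ban every vocab token that contains angle or square brackets, keeping
+        # </s> when it doubles as the newline token (newlinemode "s").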
+        if utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj"):
+            utils.koboldai_vars.badwordsids = [[v] for k, v in self.tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]") if utils.koboldai_vars.newlinemode != "s" or str(k) != "</s>"]
+
+        self.patch_embedding()
+
+        if utils.koboldai_vars.hascuda:
+            if utils.koboldai_vars.usegpu:
+                # Use just VRAM
+                self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
+            elif utils.koboldai_vars.breakmodel:
+                # Use both RAM and VRAM (breakmodel)
+                if not utils.koboldai_vars.lazy_load:
+                    self.breakmodel_device_config(self.model.config)
+                self._move_to_devices()
+            elif breakmodel.disk_blocks > 0:
+                # Use disk
+                self._move_to_devices()
+            else:
+                # Use CPU
+                self.model = self.model.to('cpu').float()
+        elif breakmodel.disk_blocks > 0:
+            self._move_to_devices()
+        else:
+            self.model = self.model.to('cpu').float()
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+
+class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
+    def _load(self, save_model: bool) -> None:
+        utils.koboldai_vars.lazy_load = False
+
+        model_path = None
+
+        for possible_config_path in [
+            utils.koboldai_vars.custmodpth,
+            os.path.join("models", utils.koboldai_vars.custmodpth)
+        ]:
+            try:
+                with open(os.path.join(possible_config_path, "config.json"), "r") as file:
+                    # Unused?
+                    self.model_config = json.load(file)
+                model_path = possible_config_path
+                break
+            except FileNotFoundError:
+                pass
+
+        if not model_path:
+            raise RuntimeError("Empty model_path!")
+
+        with self._maybe_use_float16():
+            try:
+                self.model = GPT2LMHeadModel.from_pretrained(utils.koboldai_vars.custmodpth, revision=utils.koboldai_vars.revision, cache_dir="cache")
+                self.tokenizer = GPT2Tokenizer.from_pretrained(utils.koboldai_vars.custmodpth, revision=utils.koboldai_vars.revision, cache_dir="cache")
+            except Exception as e:
+                if "out of memory" in traceback.format_exc().lower():
+                    raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
+                raise e
+
+        if save_model:
+            self.model.save_pretrained(self.get_local_model_path(ignore_existance=True), max_shard_size="500MiB")
+            self.tokenizer.save_pretrained(self.get_local_model_path(ignore_existance=True))
+
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+        # Is CUDA available? If so, use GPU, otherwise fall back to CPU
+        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu:
+            self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
+        else:
+            self.model = self.model.to("cpu").float()
+
+        self.patch_causal_lm()
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 87ee77a5..7808729c 100644
--- a/utils.py
+++ b/utils.py
@@ -633,4 +633,20 @@ def get_missing_module_names(model: PreTrainedModel, names: List[str]) -> List[s
             else:
                 recurse(c[1], head=name + ".")
     recurse(model)
-    return missing_names
\ No newline at end of file
+    return missing_names
+
+class UIProgressBarFile(object):
+    """Write TQDM progress to the UI."""
+    def write(self, bar):
+        bar = bar.replace("\r", "").replace("\n", "").replace(chr(0), "")
+        if bar != "" and [ord(num) for num in bar] != [27, 91, 65]:  # 27, 91, 65 is the ANSI cursor-up escape (ESC[A); just killing it so we can move on
+            #logger.info(bar)
+            print('\r' + bar, end='')
+            time.sleep(0.01)
+            try:
+                emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", "&nbsp;")}, broadcast=True, room="UI_1")
+            except:
+                pass
+
+    def flush(self):
+        pass
\ No newline at end of file