Merge pull request #156 from VE-FORBRYDERNE/accelerate

Accelerate disk cache support
henk717
2022-06-21 00:31:50 +02:00
committed by GitHub
6 changed files with 237 additions and 44 deletions

.gitignore (vendored)

@@ -15,6 +15,7 @@ bin
 __pycache__
 *.log
 cache
+accelerate-disk-cache
 userscripts
 !userscripts/examples
 !userscripts/kaipreset_*.lua

aiserver.py

@@ -507,14 +507,19 @@ def device_list(n_layers, primary=None, selected=None):
print(f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}") print(f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}")
row_color = colors.END row_color = colors.END
sep_color = colors.YELLOW sep_color = colors.YELLOW
if(utils.HAS_ACCELERATE):
print(f"{row_color}{colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){colors.END}")
print(f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}") print(f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}")
def device_config(config): def device_config(config):
global breakmodel, generator global breakmodel, generator
import breakmodel import breakmodel
n_layers = utils.num_layers(config) n_layers = utils.num_layers(config)
if(args.breakmodel_gpulayers is not None): if(args.breakmodel_gpulayers is not None or (utils.HAS_ACCELERATE and args.breakmodel_disklayers is not None)):
try: try:
if(not args.breakmodel_gpulayers):
breakmodel.gpu_blocks = []
else:
breakmodel.gpu_blocks = list(map(int, args.breakmodel_gpulayers.split(','))) breakmodel.gpu_blocks = list(map(int, args.breakmodel_gpulayers.split(',')))
assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
s = n_layers s = n_layers
@@ -526,6 +531,10 @@ def device_config(config):
                     s -= breakmodel.gpu_blocks[i]
             assert sum(breakmodel.gpu_blocks) <= n_layers
             n_layers -= sum(breakmodel.gpu_blocks)
+            if(args.breakmodel_disklayers is not None):
+                assert args.breakmodel_disklayers <= n_layers
+                breakmodel.disk_blocks = args.breakmodel_disklayers
+                n_layers -= args.breakmodel_disklayers
         except:
             print("WARNING: --breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0.", file=sys.stderr)
             breakmodel.gpu_blocks = [n_layers]
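Annotation (not part of the diff): a minimal sketch of the layer-budget arithmetic the block above performs; the model size and flag values are invented.

# Hypothetical layer split for a 32-layer model, mirroring the parsing above.
n_layers = 32
gpu_blocks = list(map(int, "8,9,11".split(",")))   # --breakmodel_gpulayers 8,9,11
disk_blocks = 2                                     # --breakmodel_disklayers 2

assert sum(gpu_blocks) <= n_layers
n_layers -= sum(gpu_blocks)    # 4 layers left after the GPUs
assert disk_blocks <= n_layers
n_layers -= disk_blocks        # the remaining 2 layers stay in CPU RAM

print(gpu_blocks, disk_blocks, n_layers)   # [8, 9, 11] 2 2
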
@@ -579,6 +588,20 @@ def device_config(config):
             if(n_layers == 0):
                 break
 
+        if(utils.HAS_ACCELERATE and n_layers > 0):
+            device_list(n_layers, primary=breakmodel.primary_device, selected=-1)
+            print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n")
+            while(True):
+                layerselect = input("# of layers> ")
+                if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers):
+                    layerselect = int(layerselect)
+                    layerselect = n_layers if layerselect == -1 else layerselect
+                    breakmodel.disk_blocks = layerselect
+                    n_layers -= layerselect
+                    break
+                else:
+                    print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}")
+
     print(colors.PURPLE + "\nFinal device configuration:")
     device_list(n_layers)
@@ -593,6 +616,8 @@ def device_config(config):
     if(not breakmodel.gpu_blocks):
         print("Nothing assigned to a GPU, reverting to CPU only mode")
+        import breakmodel
+        breakmodel.primary_device = "cpu"
         vars.breakmodel = False
         vars.usegpu = False
         return
@@ -600,7 +625,7 @@ def device_config(config):
 def move_model_to_devices(model):
     global generator
 
-    if(not vars.breakmodel):
+    if(not utils.HAS_ACCELERATE and not vars.breakmodel):
         if(vars.usegpu):
             model = model.half().to(vars.gpu_device)
         else:
@@ -608,26 +633,27 @@ def move_model_to_devices(model):
         generator = model.generate
         return
 
-    model.half()
-    gc.collect()
-
     if(utils.HAS_ACCELERATE):
-        import accelerate
+        import breakmodel
+        disk_blocks = breakmodel.disk_blocks
         gpu_blocks = breakmodel.gpu_blocks
-        ram_blocks = len(vars.layers_module_names) - sum(gpu_blocks)
+        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
         cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
         device_map = {}
-        for name in vars.layers_module_names:
+        for name in utils.layers_module_names:
             layer = int(name.rsplit(".", 1)[1])
-            device = "cpu" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+            device = ("disk" if layer < disk_blocks else "cpu") if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
             device_map[name] = device
         for name in utils.get_missing_module_names(model, list(device_map.keys())):
             device_map[name] = breakmodel.primary_device
-        accelerate.dispatch_model(model, device_map, main_device=breakmodel.primary_device)
+        breakmodel.dispatch_model_ex(model, device_map, main_device=breakmodel.primary_device, offload_buffers=True, offload_dir="accelerate-disk-cache")
         gc.collect()
         generator = model.generate
         return
 
+    model.half()
+    gc.collect()
+
     if(hasattr(model, "transformer")):
         model.transformer.wte.to(breakmodel.primary_device)
         model.transformer.ln_f.to(breakmodel.primary_device)
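Annotation (not part of the diff): with disk_blocks set, the device_map built above mixes the string devices "disk" and "cpu" with GPU indices. A made-up example for an 8-layer model with disk_blocks=2, ram_blocks=4 and gpu_blocks=[4]; the module names are illustrative only.

# Hypothetical device_map: layers 0-1 offloaded to disk, 2-3 kept in CPU RAM,
# 4-7 placed on GPU 0; modules outside the layer list fall back to the primary device.
device_map = {
    "transformer.h.0": "disk",
    "transformer.h.1": "disk",
    "transformer.h.2": "cpu",
    "transformer.h.3": "cpu",
    "transformer.h.4": 0,
    "transformer.h.5": 0,
    "transformer.h.6": 0,
    "transformer.h.7": 0,
    "transformer.wte": 0,
    "transformer.ln_f": 0,
}
print(device_map)
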
@@ -978,6 +1004,7 @@ def general_startup(override_args=None):
parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS) parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS)
parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS) parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS)
parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --beakmodel_gpulayers 8,9,11") parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --beakmodel_gpulayers 8,9,11")
parser.add_argument("--breakmodel_disklayers", type=int, help="If using a model that supports hybrid generation, this is the number of layers to put in disk cache.")
parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.") parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.")
parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.") parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.")
parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.") parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.")
@@ -1083,6 +1110,7 @@ def tpumtjgetsofttokens():
 def get_model_info(model, directory=""):
     # if the model is in the api list
+    disk_blocks = 0
     key = False
     breakmodel = False
     gpu = False
@@ -1109,7 +1137,7 @@ def get_model_info(model, directory=""):
         pass
     elif model == 'Colab':
         url = True
-    elif not torch.cuda.is_available():
+    elif not utils.HAS_ACCELERATE and not torch.cuda.is_available():
         pass
     else:
         layer_count = get_layer_count(model, directory=directory)
@@ -1119,7 +1147,11 @@ def get_model_info(model, directory=""):
             breakmodel = True
             if path.exists("settings/{}.breakmodel".format(model.replace("/", "_"))):
                 with open("settings/{}.breakmodel".format(model.replace("/", "_")), "r") as file:
-                    break_values = file.read().split(",")
+                    data = file.read().split("\n")[:2]
+                    if len(data) < 2:
+                        data.append("0")
+                    break_values, disk_blocks = data
+                    break_values = break_values.split(",")
             else:
                 break_values = [layer_count]
             break_values += [0] * (gpu_count - len(break_values))
@@ -1129,6 +1161,7 @@ def get_model_info(model, directory=""):
     #                     'url': url, 'gpu_names': gpu_names}))
     emit('from_server', {'cmd': 'selected_model_info', 'key_value': key_value, 'key':key,
                          'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel,
+                         'disk_break_value': disk_blocks, 'accelerate': utils.HAS_ACCELERATE,
                          'break_values': break_values, 'gpu_count': gpu_count,
                          'url': url, 'gpu_names': gpu_names}, broadcast=True)
     if key_value != "":
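Annotation (not part of the diff): as read above, and as written in the get_message handler further down, the per-model settings/<model>.breakmodel file now appears to hold two lines: a comma-separated GPU layer list and a disk layer count. A rough sketch of that round trip, with invented values and a made-up model name.

# Hypothetical round trip of the two-line .breakmodel settings format.
import os

os.makedirs("settings", exist_ok=True)
path = "settings/EleutherAI_gpt-neo-2.7B.breakmodel"   # example model name

with open(path, "w") as f:
    f.write("8,9,11" + "\n" + "4")   # GPU layers per device, then disk layers

with open(path, "r") as f:
    data = f.read().split("\n")[:2]
if len(data) < 2:                    # old single-line files default to 0 disk layers
    data.append("0")
gpu_layers, disk_layers = data
print(gpu_layers.split(","), int(disk_layers))   # ['8', '9', '11'] 4
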
@@ -1470,13 +1503,15 @@ def patch_transformers():
         return stopping_criteria
     transformers.generation_utils.GenerationMixin._get_stopping_criteria = new_get_stopping_criteria
 
-def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model=""):
+def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model=""):
     global model
     global generator
     global torch
     global model_config
     global GPT2TokenizerFast
     global tokenizer
+    if not utils.HAS_ACCELERATE:
+        disk_layers = None
     vars.noai = False
     if not initial_load:
         set_aibusy(True)
@@ -1486,6 +1521,8 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
             time.sleep(0.1)
     if gpu_layers is not None:
         args.breakmodel_gpulayers = gpu_layers
+    if disk_layers is not None:
+        args.breakmodel_disklayers = int(disk_layers)
 
     #We need to wipe out the existing model and refresh the cuda cache
     model = None
@@ -1579,10 +1616,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr) print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr)
if(args.breakmodel_layers is not None): if(args.breakmodel_layers is not None):
print("WARNING: --breakmodel_layers is deprecated. Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr) print("WARNING: --breakmodel_layers is deprecated. Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr)
if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers): if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers and (not utils.HAS_ACCELERATE or not args.breakmodel_disklayers)):
print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr) print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr)
vars.bmsupported = False vars.bmsupported = False
if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None)): if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None or args.breakmodel_disklayers is not None)):
print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr) print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr)
if(vars.hascuda): if(vars.hascuda):
print("{0}FOUND!{1}".format(colors.GREEN, colors.END)) print("{0}FOUND!{1}".format(colors.GREEN, colors.END))
@@ -1593,13 +1630,13 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         if(vars.hascuda):
             genselected = True
             vars.usegpu = True
-            vars.breakmodel = False
+            vars.breakmodel = utils.HAS_ACCELERATE
         if(vars.bmsupported):
             vars.usegpu = False
             vars.breakmodel = True
         if(args.cpu):
             vars.usegpu = False
-            vars.breakmodel = False
+            vars.breakmodel = utils.HAS_ACCELERATE
     elif(vars.hascuda):
         if(vars.bmsupported):
             genselected = True
@@ -1621,7 +1658,7 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                 vars.usegpu = True
                 genselected = True
             else:
-                vars.breakmodel = False
+                vars.breakmodel = utils.HAS_ACCELERATE
                 vars.usegpu = False
                 genselected = True
@@ -1661,12 +1698,19 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         from tqdm.auto import tqdm
 
-        if "breakmodel" in globals():
+        global breakmodel
+        import breakmodel
+
+        if utils.HAS_ACCELERATE:
+            import accelerate.utils
+
+        if args.breakmodel_disklayers is not None:
+            breakmodel.disk_blocks = args.breakmodel_disklayers
+
+        disk_blocks = breakmodel.disk_blocks
         gpu_blocks = breakmodel.gpu_blocks
         ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
         cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
-        else:
-            ram_blocks = gpu_blocks = cumulative_gpu_blocks = None
 
         def lazy_load_callback(model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]], f, **_):
             if lazy_load_callback.nested:
@@ -1675,15 +1719,31 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
             device_map: Dict[str, Union[str, int]] = {}
 
+            @functools.lru_cache(maxsize=None)
+            def get_original_key(key):
+                return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
+
             for key, value in model_dict.items():
-                if isinstance(value, torch_lazy_loader.LazyTensor) and not any(key.startswith(n) or key.startswith(n.split(".", 1)[1]) for n in vars.layers_module_names):
+                original_key = get_original_key(key)
+
+                if isinstance(value, torch_lazy_loader.LazyTensor) and not any(original_key.startswith(n) for n in utils.layers_module_names):
                     device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel else breakmodel.primary_device
                 else:
-                    layer = int(max((n for n in vars.layers_module_names if key.startswith(n) or key.startswith(n.split(".", 1)[1])), key=len).rsplit(".", 1)[1])
-                    device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel else "shared" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+                    layer = int(max((n for n in utils.layers_module_names if original_key.startswith(n)), key=len).rsplit(".", 1)[1])
+                    device = vars.gpu_device if vars.hascuda and vars.usegpu else "disk" if layer < disk_blocks and layer < ram_blocks else "cpu" if not vars.hascuda or not vars.breakmodel else "shared" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
                     device_map[key] = device
 
             if utils.num_shards is None or utils.current_shard == 0:
+                utils.offload_index = {}
+                if utils.HAS_ACCELERATE:
+                    if os.path.isdir("accelerate-disk-cache"):
+                        # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
+                        # (the folder doesn't contain any subfolders so os.remove will do just fine)
+                        for filename in os.listdir("accelerate-disk-cache"):
+                            try:
+                                os.remove(os.path.join("accelerate-disk-cache", filename))
+                            except OSError:
+                                pass
+                    os.makedirs("accelerate-disk-cache", exist_ok=True)
+
                 if utils.num_shards is not None:
                     num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
                 else:
@@ -1714,13 +1774,13 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                 size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1)
                 dtype = model_dict[key].dtype
                 nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
-                #print(f"Transferring <{key}> to {'(CPU)' if device == 'cpu' else '[device ' + str(device) + ']'} ... ", end="", flush=True)
+                #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                 model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
                 if model_dict[key].dtype is torch.float32:
                     vars.fp32_model = True
-                if convert_to_float16 and vars.hascuda and (vars.breakmodel or vars.usegpu) and model_dict[key].dtype is torch.float32:
+                if convert_to_float16 and breakmodel.primary_device != "cpu" and vars.hascuda and (vars.breakmodel or vars.usegpu) and model_dict[key].dtype is torch.float32:
                     model_dict[key] = model_dict[key].to(torch.float16)
-                if not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16:
+                if breakmodel.primary_device == "cpu" or (not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16):
                     model_dict[key] = model_dict[key].to(torch.float32)
                 if device == "shared":
                     model_dict[key] = model_dict[key].to("cpu").detach_()
@@ -1729,6 +1789,9 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                             model_dict[key] = model_dict[key].pin_memory()
                         except:
                             able_to_pin_layers = False
+                elif device == "disk":
+                    accelerate.utils.offload_weight(model_dict[key], get_original_key(key), "accelerate-disk-cache", index=utils.offload_index)
+                    model_dict[key] = model_dict[key].to("meta")
                 else:
                     model_dict[key] = model_dict[key].to(device)
                 #print("OK", flush=True)
@@ -1736,6 +1799,11 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                 utils.bar.update(1)
         finally:
             if utils.num_shards is None or utils.current_shard >= utils.num_shards:
+                if utils.offload_index:
+                    for name, tensor in utils.named_buffers:
+                        if name not in utils.offload_index:
+                            accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
+                    accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
                 utils.bar.close()
                 utils.bar = None
             lazy_load_callback.nested = False
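Annotation (not part of the diff): the offload index maps parameter names to metadata about the .dat files written into accelerate-disk-cache, and is persisted as index.json so accelerate can lazily reload the weights. A rough sketch of that accumulate-and-save pattern, assuming accelerate's offload_weight/save_offload_index behave as they are used above; the tensor names and shapes are invented.

# Rough sketch of building and persisting an offload index with accelerate.
import os
import torch
import accelerate.utils

os.makedirs("accelerate-disk-cache", exist_ok=True)
offload_index = {}
weights = {
    "transformer.h.0.attn.k_proj.weight": torch.zeros(64, 64),
    "transformer.h.0.attn.v_proj.weight": torch.zeros(64, 64),
}
for name, tensor in weights.items():
    # Writes one file per tensor into the folder and records its dtype/shape in the index.
    accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=offload_index)

# Persists the index (accelerate-disk-cache/index.json) for later lazy loading.
accelerate.utils.save_offload_index(offload_index, "accelerate-disk-cache")
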
@@ -1811,7 +1879,7 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
     # If we're using torch_lazy_loader, we need to get breakmodel config
     # early so that it knows where to load the individual model tensors
-    if(vars.lazy_load and vars.hascuda and vars.breakmodel):
+    if(utils.HAS_ACCELERATE or vars.lazy_load and vars.hascuda and vars.breakmodel):
         device_config(model_config)
 
     # Download model from Huggingface if it does not exist, otherwise load locally
@@ -1827,7 +1895,9 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                     metamodel = AutoModelForCausalLM.from_config(model_config)
                 except Exception as e:
                     metamodel = GPTNeoForCausalLM.from_config(model_config)
-                vars.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
         with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if vars.lazy_load else None, dematerialized_modules=True):
             if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                 lowmem = {}
@@ -1939,10 +2009,18 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                 if(not vars.lazy_load):
                     device_config(model.config)
                 move_model_to_devices(model)
+            elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0):
+                move_model_to_devices(model)
+                vars.modeldim = get_hidden_size_from_model(model)
+                generator = model.generate
             else:
                 model = model.to('cpu').float()
                 vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
+        elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0):
+            move_model_to_devices(model)
+            vars.modeldim = get_hidden_size_from_model(model)
+            generator = model.generate
         else:
             model.to('cpu').float()
             vars.modeldim = get_hidden_size_from_model(model)
@@ -3143,16 +3221,22 @@ def get_message(msg):
         if not os.path.exists("settings/"):
             os.mkdir("settings")
         changed = True
+        if not utils.HAS_ACCELERATE:
+            msg['disk_layers'] = "0"
         if os.path.exists("settings/" + vars.model.replace('/', '_') + ".breakmodel"):
             with open("settings/" + vars.model.replace('/', '_') + ".breakmodel", "r") as file:
-                if file.read() == msg['gpu_layers']:
+                data = file.read().split('\n')[:2]
+                if len(data) < 2:
+                    data.append("0")
+                gpu_layers, disk_layers = data
+                if gpu_layers == msg['gpu_layers'] and disk_layers == msg['disk_layers']:
                     changed = False
         if changed:
             f = open("settings/" + vars.model.replace('/', '_') + ".breakmodel", "w")
-            f.write(msg['gpu_layers'])
+            f.write(msg['gpu_layers'] + '\n' + msg['disk_layers'])
             f.close()
         vars.colaburl = msg['url'] + "/request"
-        load_model(use_gpu=msg['use_gpu'], gpu_layers=msg['gpu_layers'], online_model=msg['online_model'])
+        load_model(use_gpu=msg['use_gpu'], gpu_layers=msg['gpu_layers'], disk_layers=msg['disk_layers'], online_model=msg['online_model'])
     elif(msg['cmd'] == 'show_model'):
         print("Model Name: {}".format(getmodelname()))
         emit('from_server', {'cmd': 'show_model_name', 'data': getmodelname()}, broadcast=True)

breakmodel.py

@@ -4,7 +4,7 @@ https://github.com/arrmansa/Basic-UI-for-GPT-J-6B-with-low-vram/blob/main/GPT-J-
 The ORIGINAL version of the patch is released under the Apache License 2.0
 Copyright 2021 arrmansa
 Copyright 2021 finetuneanon
-Copyright 2018 The Hugging Face team
+Copyright 2018, 2022 The Hugging Face team
 
 Apache License
@@ -216,11 +216,13 @@ from torch import nn
 import torch.cuda.comm
 import copy
 import gc
+import os
 import sys
 import itertools
 import bisect
 import random
-from typing import Optional
+import utils
+from typing import Dict, List, Optional, Union
 
 from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPastAndCrossAttentions
@@ -230,7 +232,100 @@ logger = logging.get_logger(__name__)
 breakmodel = True
 gpu_blocks = []
-primary_device = 0
+disk_blocks = 0
+primary_device = 0 if torch.cuda.device_count() > 0 else "cpu"
+
+
+if utils.HAS_ACCELERATE:
+    from accelerate.hooks import attach_align_device_hook_on_blocks
+    from accelerate.utils import OffloadedWeightsLoader, check_device_map, extract_submodules_state_dict, offload_state_dict
+    from accelerate import dispatch_model
+
+    def dispatch_model_ex(
+        model: nn.Module,
+        device_map: Dict[str, Union[str, int, torch.device]],
+        main_device: Optional[torch.device] = None,
+        state_dict: Optional[Dict[str, torch.Tensor]] = None,
+        offload_dir: Union[str, os.PathLike] = None,
+        offload_buffers: bool = False,
+        **kwargs,
+    ):
+        """
+        This is a modified version of
+        https://github.com/huggingface/accelerate/blob/eeaba598f455fbd2c48661d7e816d3ff25ab050b/src/accelerate/big_modeling.py#L130
+        that still works when the main device is the CPU.
+
+        Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on
+        the CPU or even the disk.
+
+        Args:
+            model (`torch.nn.Module`):
+                The model to dispatch.
+            device_map (`Dict[str, Union[str, int, torch.device]]`):
+                A dictionary mapping module names in the models `state_dict` to the device they should go to. Note that
+                `"disk"` is accepted even if it's not a proper value for `torch.device`.
+            main_device (`str`, `int` or `torch.device`, *optional*):
+                The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or
+                `"disk"`.
+            state_dict (`Dict[str, torch.Tensor]`, *optional*):
+                The state dict of the part of the model that will be kept on CPU.
+            offload_dir (`str` or `os.PathLike`):
+                The folder in which to offload the model weights (or where the model weights are already offloaded).
+            offload_buffers (`bool`, *optional*, defaults to `False`):
+                Whether or not to offload the buffers with the model parameters.
+            preload_module_classes (`List[str]`, *optional*):
+                A list of classes whose instances should load all their weights (even in the submodules) at the beginning
+                of the forward. This should only be used for classes that have submodules which are registered but not
+                called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
+                `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
+        """
+        if main_device != "cpu":
+            return dispatch_model(model, device_map, main_device, state_dict, offload_dir=offload_dir, offload_buffers=offload_buffers, **kwargs)
+
+        # Error early if the device map is incomplete.
+        check_device_map(model, device_map)
+
+        offload_devices = ["cpu", "disk"] if main_device != "cpu" else ["disk"]
+
+        if main_device is None:
+            main_device = [d for d in device_map.values() if d not in offload_devices][0]
+
+        cpu_modules = [name for name, device in device_map.items() if device == "cpu"] if main_device != "cpu" else []
+        if state_dict is None and len(cpu_modules) > 0:
+            state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules)
+
+        disk_modules = [name for name, device in device_map.items() if device == "disk"]
+        if offload_dir is None and len(disk_modules) > 0:
+            raise ValueError(
+                "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules "
+                f"need to be offloaded: {', '.join(disk_modules)}."
+            )
+        if len(disk_modules) > 0 and (
+            not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json"))
+        ):
+            disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules)
+            offload_state_dict(offload_dir, disk_state_dict)
+
+        execution_device = {
+            name: main_device if device in offload_devices else device for name, device in device_map.items()
+        }
+        offload = {name: device in offload_devices for name, device in device_map.items()}
+        save_folder = offload_dir if len(disk_modules) > 0 else None
+        if state_dict is not None or save_folder is not None:
+            weights_map = OffloadedWeightsLoader(state_dict=state_dict, save_folder=save_folder)
+        else:
+            weights_map = None
+
+        attach_align_device_hook_on_blocks(
+            model,
+            execution_device=execution_device,
+            offload=offload,
+            offload_buffers=offload_buffers,
+            weights_map=weights_map,
+            **kwargs,
+        )
+        model.hf_device_map = device_map
+        return model
+
+
 # Copied from transformers.models.bart.modeling_bart._expand_mask
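Annotation (not part of the diff): a rough, untested usage sketch of dispatch_model_ex, mirroring the call shape aiserver.py uses. It assumes this file is importable as `breakmodel` inside the KoboldAI source tree with accelerate installed; the toy model and device_map are invented.

# Hypothetical caller of dispatch_model_ex.
import torch.nn as nn
import breakmodel

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16))
device_map = {
    "0": "disk",   # first block offloaded to the disk cache
    "1": "cpu",    # second block kept in CPU RAM
}
breakmodel.dispatch_model_ex(
    model,
    device_map,
    main_device=breakmodel.primary_device,
    offload_buffers=True,
    offload_dir="accelerate-disk-cache",
)
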

static/application.js

@@ -1966,6 +1966,10 @@ function update_gpu_layers() {
 		gpu_layers += parseInt($("#gpu_layers"+i)[0].value);
 		$("#gpu_layers_box_"+i)[0].value=$("#gpu_layers"+i)[0].value;
 	}
+	if ($("#disk_layers").length > 0) {
+		gpu_layers += parseInt($("#disk_layers")[0].value);
+		$("#disk_layers_box")[0].value=$("#disk_layers")[0].value;
+	}
 	if (gpu_layers > parseInt(document.getElementById("gpu_layers_max").innerHTML)) {
 		disableButtons([load_model_accept]);
 		$("#gpu_layers_current").html("<span style='color: red'>"+gpu_layers+"/"+ document.getElementById("gpu_layers_max").innerHTML +"</span>");
@@ -2609,6 +2613,10 @@ $(document).ready(function(){
 				html += 'onblur=\'$("#gpu_layers'+i+'")[0].value=$("#gpu_layers_box_'+i+'")[0].value;update_gpu_layers();\'>';
 				html += "<input type='range' class='form-range airange' min='0' max='"+msg.layer_count+"' step='1' value='"+msg.break_values[i]+"' id='gpu_layers"+i+"' onchange='update_gpu_layers();'>";
 			}
+			html += "Disk cache: ";
+			html += '<input inputmode="numeric" id="disk_layers_box" class="justifyright flex-push-right model_layers" value="'+msg.disk_break_value+'" ';
+			html += 'onblur=\'$("#disk_layers")[0].value=$("#disk_layers_box")[0].value;update_gpu_layers();\'>';
+			html += "<input type='range' class='form-range airange' min='0' max='"+msg.layer_count+"' step='1' value='"+msg.disk_break_value+"' id='disk_layers' onchange='update_gpu_layers();'>";
 			$("#model_layer_bars").html(html);
 			$("#gpu_layers_max").html(msg.layer_count);
 			$("#gpu_count")[0].value = msg.gpu_count;
@@ -2925,7 +2933,8 @@ $(document).ready(function(){
 				gpu_layers += $("#gpu_layers"+i)[0].value + ",";
 			}
 		}
-		message = {'cmd': 'load_model', 'use_gpu': $('#use_gpu')[0].checked, 'key': $('#modelkey')[0].value, 'gpu_layers': gpu_layers.slice(0, -1), 'url': $('#modelurl')[0].value, 'online_model': $('#oaimodel')[0].value};
+		var disk_layers = $("#disk_layers").length > 0 ? $("#disk_layers")[0].value : 0;
+		message = {'cmd': 'load_model', 'use_gpu': $('#use_gpu')[0].checked, 'key': $('#modelkey')[0].value, 'gpu_layers': gpu_layers.slice(0, -1), 'disk_layers': disk_layers, 'url': $('#modelurl')[0].value, 'online_model': $('#oaimodel')[0].value};
 		socket.send(message);
 		loadmodelcontent.html("");
 		hideLoadModelPopup();

templates/index.html

@@ -17,7 +17,7 @@
<script src="static/bootstrap.min.js"></script> <script src="static/bootstrap.min.js"></script>
<script src="static/bootstrap-toggle.min.js"></script> <script src="static/bootstrap-toggle.min.js"></script>
<script src="static/rangy-core.min.js"></script> <script src="static/rangy-core.min.js"></script>
<script src="static/application.js?ver=1.18.1b"></script> <script src="static/application.js?ver=1.18.1c"></script>
<script src="static/favicon.js"></script> <script src="static/favicon.js"></script>
{% if flaskwebgui %} {% if flaskwebgui %}
<script src="static/flask_web_gui.js"></script> <script src="static/flask_web_gui.js"></script>
@@ -304,9 +304,9 @@
<div class='settingitem' style="width:100%"> <div class='settingitem' style="width:100%">
<div class='settinglabel'> <div class='settinglabel'>
<div class="justifyleft"> <div class="justifyleft">
GPU Layers GPU/Disk Layers
<span class="helpicon">? <span class="helpicon">?
<span class="helptext">Number of layers to assign to the GPU</span> <span class="helptext">Number of layers to assign to GPUs and to disk cache. Remaining layers will be put into CPU RAM.</span>
</span> </span>
</div> </div>
<div class="justifyright" id="gpu_layers_current">0</div> <div class="justifyright" id="gpu_layers_current">0</div>

utils.py

@@ -29,6 +29,10 @@ from_pretrained_index_filename: Optional[str] = None
 from_pretrained_kwargs = {}
 bar = None
 
+layers_module_names: Optional[List[str]] = None
+module_names: Optional[List[str]] = None
+named_buffers: Optional[List[tuple]] = None
+
 default_sampler_order = [0, 1, 2, 3, 4, 5]
 
 #==================================================================#