Lazy loader no longer requires map file except when loading to TPU

2025-06-05 21:59:24 +02:00 · 2022-06-16 18:45:11 -04:00
parent b0a01962ab
commit 5253cdcb36
2 changed files with 32 additions and 18 deletions
--- a/aiserver.py
+++ b/aiserver.py
@@ -1652,18 +1652,14 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="

                    device_map = {}

-                    for _key, spec in lazy_load_spec.get("layer_weights", {}).items():
-                        for layer in range(n_layers):
-                            key = _key.format(layer=layer)
-                            if key not in model_dict:
-                                continue
+                    for key, value in model_dict.items():
+                        if isinstance(value, torch_lazy_loader.LazyTensor) and not any(key.startswith(n) or key.startswith(n.split(".", 1)[1]) for n in vars.layer_param_names):
+                            device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu"
+                        else:
+                            layer = int(next(n for n in vars.layer_param_names if key.startswith(n) or key.startswith(n.split(".", 1)[1])).rsplit(".", 1)[1])
                            device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
                            device_map[key] = device

-                    for key, value in model_dict.items():
-                        if isinstance(value, torch_lazy_loader.LazyTensor) and key not in device_map:
-                            device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu"
-
                    if utils.num_shards is None or utils.current_shard == 0:
                        if utils.num_shards is not None:
                            num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
@@ -1717,15 +1713,6 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                lazy_load_callback.nested = False
                return lazy_load_callback

-            lazy_load_config_path = os.path.join("maps", vars.model_type + ".json")
-            if(vars.lazy_load and "model_config" in globals() and os.path.isfile(lazy_load_config_path)):
-                with open(lazy_load_config_path) as f:
-                    lazy_load_spec = json.load(f)
-
-            else:
-                vars.lazy_load = False
-
-            

            def get_hidden_size_from_model(model):
                try:
@@ -1800,6 +1787,13 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                    import shutil
                    shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_')))
                print("\n", flush=True)
+                if(vars.lazy_load):  # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+                    with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True):
+                        try:
+                            metamodel = AutoModelForCausalLM.from_config(model_config)
+                        except Exception as e:
+                            metamodel = GPTNeoForCausalLM.from_config(model_config)
+                        vars.layer_param_names = utils.get_layer_param_names(metamodel)
                with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if vars.lazy_load else None, dematerialized_modules=True):
                    if(vars.lazy_load):  # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                        lowmem = {}