Disable low_cpu_mem_usage when using GPT-2

Attempting to use transformers 4.11.0's experimental `low_cpu_mem_usage` feature with GPT-2 models usually results in output that repeats a single token over and over or is otherwise incoherent.
2025-06-05 21:59:24 +02:00 · 2021-12-20 19:54:19 -05:00
parent 7b56940ed7
commit caef3b7460
1 changed file with 10 additions and 3 deletions
--- a/aiserver.py
+++ b/aiserver.py
@@ -846,7 +846,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
            model_config = open(vars.custmodpth + "/config.json", "r")
            js   = json.load(model_config)
            with(maybe_use_float16()):
-                model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
+                model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/")
            tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache/")
            vars.modeldim = get_hidden_size_from_model(model)
            # Is CUDA available? If so, use GPU, otherwise fall back to CPU
@@ -858,17 +858,24 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
                generator = model.generate
        # If base HuggingFace model was chosen
        else:
+            lowmem = maybe_low_cpu_mem_usage()
+            # We must disable low_cpu_mem_usage (by setting lowmem to {}) if
+            # using a GPT-2 model because GPT-2 is not compatible with this
+            # feature yet
+            if("/" not in vars.model and vars.model.lower().startswith("gpt2")):
+                lowmem = {}
+
            # Is CUDA available? If so, use GPU, otherwise fall back to CPU
            
            if(os.path.isdir(vars.model.replace('/', '_'))):
               with(maybe_use_float16()):
                   tokenizer = GPT2TokenizerFast.from_pretrained(vars.model.replace('/', '_'), cache_dir="cache/")
-                   model = AutoModelForCausalLM.from_pretrained(vars.model.replace('/', '_'), cache_dir="cache/", **maybe_low_cpu_mem_usage())
+                   model = AutoModelForCausalLM.from_pretrained(vars.model.replace('/', '_'), cache_dir="cache/", **lowmem)
            else:
                print("Model does not exist locally, attempting to download from Huggingface...")
                tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, cache_dir="cache/")
                with(maybe_use_float16()):
-                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
+                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **lowmem)
                model = model.half()
                import shutil
                shutil.rmtree("cache/")