Mirror of https://github.com/KoboldAI/KoboldAI-Client.git

	Load model directly in fp16 if using GPU or breakmodel
Author: Gnome Ann
Changed files: aiserver.py (15 changed lines)

@@ -15,6 +15,7 @@ import json
 import collections
 import zipfile
 import packaging
+import contextlib
 from typing import Any, Union, Dict, Set, List
 
 import requests
@@ -710,10 +711,21 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
                 return {}
             return {"low_cpu_mem_usage": True}
         
+        @contextlib.contextmanager
+        def maybe_use_float16(always_use=False):
+            if(always_use or (vars.hascuda and (vars.usegpu or vars.breakmodel))):
+                original_dtype = torch.get_default_dtype()
+                torch.set_default_dtype(torch.float16)
+                yield True
+                torch.set_default_dtype(original_dtype)
+            else:
+                yield False
+
         # If custom GPT Neo model was chosen
         if(vars.model == "NeoCustom"):
             model_config = open(vars.custmodpth + "/config.json", "r")
             js   = json.load(model_config)
+            with(maybe_use_float16()):
                 if("model_type" in js):
                     model     = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                 else:
@@ -735,6 +747,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
         elif(vars.model == "GPT2Custom"):
             model_config = open(vars.custmodpth + "/config.json", "r")
             js   = json.load(model_config)
+            with(maybe_use_float16()):
                 model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
             tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
             vars.modeldim = get_hidden_size_from_model(model)
@@ -750,11 +763,13 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
             tokenizer = GPT2Tokenizer.from_pretrained(vars.model, cache_dir="cache/")
             if(vars.hascuda):
                 if(vars.usegpu):
+                    with(maybe_use_float16()):
                         model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                     vars.modeldim = get_hidden_size_from_model(model)
                     model = model.half().to(0)
                     generator = model.generate
                 elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
+                    with(maybe_use_float16()):
                         model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                     vars.modeldim = get_hidden_size_from_model(model)
                     device_config(model)
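
For readers who want to see the pattern in isolation, the sketch below is a minimal, self-contained version of what this commit does; it is not code from the repository. A contextlib-based context manager switches PyTorch's default floating-point dtype to float16 for the duration of the from_pretrained() call, so the parameters are materialized in fp16 while the checkpoint is being loaded instead of being loaded in fp32 and converted with .half() afterwards. The use_float16 flag and the "gpt2" checkpoint are illustrative placeholders for vars.hascuda/vars.usegpu/vars.breakmodel and KoboldAI's actual model paths, and the try/finally is an extra safety measure that the commit itself does not include.

import contextlib

import torch
from transformers import AutoModelForCausalLM


@contextlib.contextmanager
def maybe_use_float16(use_float16=True):
    # Make float16 the default floating-point dtype while the block runs, so the
    # model skeleton (and therefore its parameters) is created in fp16, then
    # restore the previous default afterwards.
    if use_float16:
        original_dtype = torch.get_default_dtype()
        torch.set_default_dtype(torch.float16)
        try:
            yield True
        finally:
            torch.set_default_dtype(original_dtype)
    else:
        yield False


if __name__ == "__main__":
    # "gpt2" is only a small placeholder checkpoint; KoboldAI passes its own
    # model names/paths along with cache_dir and the low_cpu_mem_usage kwargs.
    with maybe_use_float16(use_float16=True):
        model = AutoModelForCausalLM.from_pretrained("gpt2")
    # On the transformers versions this commit targeted, the parameters come
    # out as torch.float16; newer releases also accept torch_dtype=torch.float16
    # as an explicit from_pretrained() argument for the same effect.
    print(next(model.parameters()).dtype)

The point of loading directly in fp16 is peak memory: a full fp32 copy of the weights no longer has to sit in RAM before model.half() runs, which roughly halves the memory needed during startup (a 2.7B-parameter GPT-Neo checkpoint is on the order of 10.8 GB of parameters in fp32 versus about 5.4 GB in fp16).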