Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Maybe works now...?
@@ -71,6 +71,9 @@ class model_backend(HFTorchInferenceModel):
             )
 
+        if self.lazy_load:
+            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+            tf_kwargs.pop("low_cpu_mem_usage", None)
 
         # If we're using lazy loader, we need to figure out what the model's hidden layers are called
         with lazy_loader.use_lazy_load(dematerialized_modules=True):
             try:
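The `use_lazy_load(dematerialized_modules=True)` context above lets the backend instantiate the architecture without allocating real weights, so it can discover the names of the model's hidden-layer modules first. A rough standalone analogue of that idea, using PyTorch's meta device rather than KoboldAI's own loader ("gpt2" is only an illustrative model id):

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("gpt2")

    # Build the module tree on the meta device: module names and parameter
    # shapes exist, but no weight memory is allocated - similar in spirit
    # to dematerialized_modules=True in KoboldAI's lazy_loader.
    with torch.device("meta"):
        metamodel = AutoModelForCausalLM.from_config(config)

    # The backend can now inspect layer names without paying for the weights.
    layer_names = [name for name, _ in metamodel.named_modules() if name]
    print(layer_names[:5])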
@@ -83,145 +86,92 @@ class model_backend(HFTorchInferenceModel):
                 self.lazy_load = False
 
         # Download model from Huggingface if it does not exist, otherwise load locally
-        with self._maybe_use_float16(), lazy_loader.use_lazy_load(
-            enable=self.lazy_load,
-            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if self.lazy_load
-            else None,
-            dematerialized_modules=True,
-        ):
-            if self.lazy_load:
-                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
-                tf_kwargs.pop("low_cpu_mem_usage", None)
-            if self.get_local_model_path():
-                # Model is stored locally, load it.
-                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
-            else:
-                # Model not stored locally, we need to download it.
+        if self.get_local_model_path():
+            # Model is stored locally, load it.
+            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        else:
+            # Model not stored locally, we need to download it.
 
-                # _rebuild_tensor patch for casting dtype and supporting LazyTensors
-                old_rebuild_tensor = torch._utils._rebuild_tensor
-
-                def new_rebuild_tensor(
-                    storage: Union[lazy_loader.LazyTensor, torch.Storage],
-                    storage_offset,
-                    shape,
-                    stride,
-                ):
-                    if not isinstance(storage, lazy_loader.LazyTensor):
-                        dtype = storage.dtype
-                    else:
-                        dtype = storage.storage_type.dtype
-                        if not isinstance(dtype, torch.dtype):
-                            dtype = storage.storage_type(0).dtype
-                    if dtype is torch.float32 and len(shape) >= 2:
-                        utils.koboldai_vars.fp32_model = True
-                    return old_rebuild_tensor(storage, storage_offset, shape, stride)
-
-                torch._utils._rebuild_tensor = new_rebuild_tensor
-                self.model = self._get_model(self.model_name, tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.model_name)
-                torch._utils._rebuild_tensor = old_rebuild_tensor
+            # _rebuild_tensor patch for casting dtype and supporting LazyTensors
+            old_rebuild_tensor = torch._utils._rebuild_tensor
+
+            def new_rebuild_tensor(
+                storage: Union[lazy_loader.LazyTensor, torch.Storage],
+                storage_offset,
+                shape,
+                stride,
+            ):
+                if not isinstance(storage, lazy_loader.LazyTensor):
+                    dtype = storage.dtype
+                else:
+                    dtype = storage.storage_type.dtype
+                    if not isinstance(dtype, torch.dtype):
+                        dtype = storage.storage_type(0).dtype
+                if dtype is torch.float32 and len(shape) >= 2:
+                    utils.koboldai_vars.fp32_model = True
+                return old_rebuild_tensor(storage, storage_offset, shape, stride)
+
+            torch._utils._rebuild_tensor = new_rebuild_tensor
+            self.model = self._get_model(self.model_name, tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.model_name)
+            torch._utils._rebuild_tensor = old_rebuild_tensor
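The block above temporarily swaps out `torch._utils._rebuild_tensor`, the low-level helper that `torch.load` routes every deserialized tensor through, so the loader can notice whether the checkpoint contains fp32 weight matrices. A minimal standalone sketch of the same monkey-patch pattern; `DummyVars` is a hypothetical stand-in for `utils.koboldai_vars`, and on recent PyTorch the storage argument exposes `.dtype` directly (the diff additionally handles older storage types and LazyTensors):

    import torch

    # Hypothetical stand-in for utils.koboldai_vars; only the flag matters here.
    class DummyVars:
        fp32_model = False

    dummy_vars = DummyVars()

    old_rebuild_tensor = torch._utils._rebuild_tensor

    def new_rebuild_tensor(storage, storage_offset, shape, stride):
        # torch.load() passes every tensor it deserializes through this helper,
        # so dtypes can be observed without touching the checkpoint format.
        if storage.dtype is torch.float32 and len(shape) >= 2:
            dummy_vars.fp32_model = True  # a 2D+ fp32 tensor looks like a weight matrix
        return old_rebuild_tensor(storage, storage_offset, shape, stride)

    torch._utils._rebuild_tensor = new_rebuild_tensor
    try:
        # state_dict = torch.load("pytorch_model.bin")  # would trip the probe
        pass
    finally:
        # Always restore the original, as the diff does right after loading.
        torch._utils._rebuild_tensor = old_rebuild_tensor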
-                if save_model:
-                    self.tokenizer.save_pretrained(
-                        self.get_local_model_path(ignore_existance=True)
-                    )
+            if save_model:
+                self.tokenizer.save_pretrained(
+                    self.get_local_model_path(ignore_existance=True)
+                )
 
-                    if utils.koboldai_vars.fp32_model:
-                        # Use save_pretrained to convert fp32 models to fp16,
-                        # unless we are using disk cache because save_pretrained
-                        # is not supported in that case
-                        self.model = self.model.half()
-                        self.model.save_pretrained(
-                            self.get_local_model_path(ignore_existance=True),
-                            max_shard_size="500MiB",
-                        )
+                if utils.koboldai_vars.fp32_model:
+                    # Use save_pretrained to convert fp32 models to fp16,
+                    # unless we are using disk cache because save_pretrained
+                    # is not supported in that case
+                    self.model = self.model.half()
+                    self.model.save_pretrained(
+                        self.get_local_model_path(ignore_existance=True),
+                        max_shard_size="500MiB",
+                    )
 
-                    else:
-                        # For fp16 models, we can just copy the model files directly
-                        import transformers.configuration_utils
-                        import transformers.modeling_utils
-                        import transformers.file_utils
-                        import huggingface_hub
+                else:
+                    # For fp16 models, we can just copy the model files directly
+                    import transformers.configuration_utils
+                    import transformers.modeling_utils
+                    import transformers.file_utils
+                    import huggingface_hub
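When the probe above has flagged an fp32 checkpoint, the code converts the model in memory and lets `save_pretrained` rewrite it as fp16 shards of at most 500 MiB. The same idea in isolation (model id and output path are illustrative):

    from transformers import AutoModelForCausalLM

    # Illustrative names; the diff uses self.model and get_local_model_path().
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Cast all weights to fp16 in memory, then let save_pretrained write the
    # converted checkpoint back out as ~500 MiB shards.
    model = model.half()
    model.save_pretrained("./gpt2-fp16", max_shard_size="500MiB")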
 
-                        # Save the config.json
-                        shutil.move(
-                            os.path.realpath(
-                                huggingface_hub.hf_hub_download(
-                                    self.model_name,
-                                    transformers.configuration_utils.CONFIG_NAME,
-                                    revision=utils.koboldai_vars.revision,
-                                    cache_dir="cache",
-                                    local_files_only=True,
-                                    legacy_cache_layout=False,
-                                )
-                            ),
-                            os.path.join(
-                                self.get_local_model_path(ignore_existance=True),
-                                transformers.configuration_utils.CONFIG_NAME,
-                            ),
-                        )
+                    # Save the config.json
+                    shutil.move(
+                        os.path.realpath(
+                            huggingface_hub.hf_hub_download(
+                                self.model_name,
+                                transformers.configuration_utils.CONFIG_NAME,
+                                revision=utils.koboldai_vars.revision,
+                                cache_dir="cache",
+                                local_files_only=True,
+                                legacy_cache_layout=False,
+                            )
+                        ),
+                        os.path.join(
+                            self.get_local_model_path(ignore_existance=True),
+                            transformers.configuration_utils.CONFIG_NAME,
+                        ),
+                    )
 
-                        if utils.num_shards is None:
-                            # Save the pytorch_model.bin or model.safetensors of an unsharded model
-                            any_success = False
-                            possible_checkpoint_names = [
-                                transformers.modeling_utils.WEIGHTS_NAME,
-                                "model.safetensors",
-                            ]
-
-                            for possible_checkpoint_name in possible_checkpoint_names:
-                                try:
-                                    shutil.move(
-                                        os.path.realpath(
-                                            huggingface_hub.hf_hub_download(
-                                                self.model_name,
-                                                possible_checkpoint_name,
-                                                revision=utils.koboldai_vars.revision,
-                                                cache_dir="cache",
-                                                local_files_only=True,
-                                                legacy_cache_layout=False,
-                                            )
-                                        ),
-                                        os.path.join(
-                                            self.get_local_model_path(
-                                                ignore_existance=True
-                                            ),
-                                            possible_checkpoint_name,
-                                        ),
-                                    )
-                                    any_success = True
-                                except Exception:
-                                    pass
-
-                            if not any_success:
-                                raise RuntimeError(
-                                    f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
-                                )
-                        else:
-                            # Handle saving sharded models
-
-                            with open(utils.from_pretrained_index_filename) as f:
-                                map_data = json.load(f)
-                            filenames = set(map_data["weight_map"].values())
-                            # Save the pytorch_model.bin.index.json of a sharded model
-                            shutil.move(
-                                os.path.realpath(utils.from_pretrained_index_filename),
-                                os.path.join(
-                                    self.get_local_model_path(ignore_existance=True),
-                                    transformers.modeling_utils.WEIGHTS_INDEX_NAME,
-                                ),
-                            )
-                            # Then save the pytorch_model-#####-of-#####.bin files
-                            for filename in filenames:
-                                shutil.move(
-                                    os.path.realpath(
-                                        huggingface_hub.hf_hub_download(
-                                            self.model_name,
-                                            filename,
-                                            revision=utils.koboldai_vars.revision,
-                                            cache_dir="cache",
-                                            local_files_only=True,
-                                            legacy_cache_layout=False,
-                                        )
-                                    ),
-                                    os.path.join(
-                                        self.get_local_model_path(ignore_existance=True),
-                                        filename,
-                                    ),
-                                )
@@ -232,13 +182,53 @@ class model_backend(HFTorchInferenceModel):
+                if utils.num_shards is None:
+                    # Save the pytorch_model.bin or model.safetensors of an unsharded model
+                    any_success = False
+                    possible_checkpoint_names = [
+                        transformers.modeling_utils.WEIGHTS_NAME,
+                        "model.safetensors",
+                    ]
+
+                    for possible_checkpoint_name in possible_checkpoint_names:
+                        try:
+                            shutil.move(
+                                os.path.realpath(
+                                    huggingface_hub.hf_hub_download(
+                                        self.model_name,
+                                        possible_checkpoint_name,
+                                        revision=utils.koboldai_vars.revision,
+                                        cache_dir="cache",
+                                        local_files_only=True,
+                                        legacy_cache_layout=False,
+                                    )
+                                ),
+                                os.path.join(
+                                    self.get_local_model_path(
+                                        ignore_existance=True
+                                    ),
+                                    possible_checkpoint_name,
+                                ),
+                            )
+                            shutil.rmtree("cache/")
+                            any_success = True
+                        except Exception:
+                            pass
+
+                    if not any_success:
+                        raise RuntimeError(
+                            f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
+                        )
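The unsharded branch re-downloads nothing: `from_pretrained` already populated the `cache` directory, so `hf_hub_download(..., local_files_only=True)` merely resolves the cached file, which is then moved into the model folder. Each plausible checkpoint filename is tried in turn, since a repo ships either `pytorch_model.bin` or `model.safetensors`. A simplified sketch of that fallback loop (model id and target path are illustrative, and the `legacy_cache_layout` flag is omitted):

    import os
    import shutil

    import huggingface_hub

    # Illustrative stand-ins for self.model_name and the local model directory.
    model_name = "gpt2"
    target_dir = "./models/gpt2"
    os.makedirs(target_dir, exist_ok=True)

    any_success = False
    for checkpoint_name in ("pytorch_model.bin", "model.safetensors"):
        try:
            cached = huggingface_hub.hf_hub_download(
                model_name,
                checkpoint_name,
                cache_dir="cache",
                local_files_only=True,  # resolve only; the file was fetched earlier
            )
            # realpath resolves the cache's symlink so the real file gets moved.
            shutil.move(os.path.realpath(cached), os.path.join(target_dir, checkpoint_name))
            any_success = True
        except Exception:
            continue

    if not any_success:
        raise RuntimeError("no known checkpoint file found in the local cache")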
+                else:
+                    # Handle saving sharded models
+
+                    with open(utils.from_pretrained_index_filename) as f:
+                        map_data = json.load(f)
+                    filenames = set(map_data["weight_map"].values())
+                    # Save the pytorch_model.bin.index.json of a sharded model
+                    shutil.move(
+                        os.path.realpath(utils.from_pretrained_index_filename),
+                        os.path.join(
+                            self.get_local_model_path(ignore_existance=True),
+                            transformers.modeling_utils.WEIGHTS_INDEX_NAME,
+                        ),
+                    )
+                    # Then save the pytorch_model-#####-of-#####.bin files
+                    for filename in filenames:
+                        shutil.move(
+                            os.path.realpath(
+                                huggingface_hub.hf_hub_download(
+                                    self.model_name,
+                                    filename,
+                                    revision=utils.koboldai_vars.revision,
+                                    cache_dir="cache",
+                                    local_files_only=True,
+                                    legacy_cache_layout=False,
+                                )
+                            ),
+                            os.path.join(
+                                self.get_local_model_path(ignore_existance=True),
+                                filename,
+                            ),
+                        )
+                    shutil.rmtree("cache/")
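For sharded checkpoints, the index file's `weight_map` records which shard holds each tensor, so collecting its values into a set yields the exact list of shard files to relocate. A small sketch of that bookkeeping (`pytorch_model.bin.index.json` is the standard transformers index filename):

    import json

    # The index of a sharded checkpoint maps each weight tensor to the shard
    # that stores it, e.g. "pytorch_model-00001-of-00002.bin".
    with open("pytorch_model.bin.index.json") as f:
        map_data = json.load(f)

    # Deduplicate: many tensors live in the same shard file.
    filenames = set(map_data["weight_map"].values())
    for filename in sorted(filenames):
        print(filename)  # each of these would be downloaded and moved in turn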
 
         self.patch_embedding()
-        self.model.tie_weights()
+        # self.model.tie_weights()
 
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()