From d152befe54e0f89ce44e7e6d10e1f7e6c6464c09 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Wed, 13 Dec 2023 20:23:19 +0300 Subject: [PATCH 1/8] IPEX Torch 2.1 --- environments/ipex.yml | 6 ++-- modeling/ipex/__init__.py | 33 +++++++---------- modeling/ipex/attention.py | 29 +++++---------- modeling/ipex/diffusers.py | 2 +- modeling/ipex/gradscaler.py | 6 +++- modeling/ipex/hijacks.py | 71 ++++++++++++++++++++++++++----------- 6 files changed, 80 insertions(+), 67 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 1eac5915..d322697b 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -25,10 +25,8 @@ dependencies: - ffmpeg - pip: - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - - torch==2.0.1a0; sys_platform == 'linux' - - torch==2.0.0a0; sys_platform == 'win32' - - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux' - - intel_extension_for_pytorch==2.0.110+gitba7f6c1; sys_platform == 'win32' + - torch==2.1.0a0 + - intel-extension-for-pytorch==2.1.10+xpu - openvino - onnxruntime-openvino - flask-cloudflared==0.0.10 diff --git a/modeling/ipex/__init__.py b/modeling/ipex/__init__.py index dc1985ed..c7854791 100644 --- a/modeling/ipex/__init__.py +++ b/modeling/ipex/__init__.py @@ -4,13 +4,12 @@ import contextlib import torch import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import from .hijacks import ipex_hijacks -from .attention import attention_init # pylint: disable=protected-access, missing-function-docstring, line-too-long def ipex_init(): # pylint: disable=too-many-statements try: - #Replace cuda with xpu: + # Replace cuda with xpu: torch.cuda.current_device = torch.xpu.current_device torch.cuda.current_stream = torch.xpu.current_stream torch.cuda.device = torch.xpu.device @@ -91,9 +90,9 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.CharStorage = torch.xpu.CharStorage torch.cuda.__file__ = torch.xpu.__file__ torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork - #torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing + # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing - #Memory: + # Memory: torch.cuda.memory = torch.xpu.memory if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): torch.xpu.empty_cache = lambda: None @@ -113,7 +112,7 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats - #RNG: + # RNG: torch.cuda.get_rng_state = torch.xpu.get_rng_state torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all torch.cuda.set_rng_state = torch.xpu.set_rng_state @@ -124,7 +123,7 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.seed_all = torch.xpu.seed_all torch.cuda.initial_seed = torch.xpu.initial_seed - #AMP: + # AMP: torch.cuda.amp = torch.xpu.amp if not hasattr(torch.cuda.amp, "common"): torch.cuda.amp.common = contextlib.nullcontext() @@ -139,12 +138,12 @@ def ipex_init(): # pylint: disable=too-many-statements except Exception: # pylint: disable=broad-exception-caught torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - #C + # C torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream ipex._C._DeviceProperties.major = 2023 ipex._C._DeviceProperties.minor = 2 - #Fix functions with ipex: + # Fix functions with ipex: torch.cuda.mem_get_info = lambda 
device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True @@ -157,20 +156,14 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 - if hasattr(torch.xpu, 'getDeviceIdListForCard'): - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard - torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard - else: - torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card - torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card ipex_hijacks() - attention_init() - try: - from .diffusers import ipex_diffusers - ipex_diffusers() - except Exception: # pylint: disable=broad-exception-caught - pass + if not torch.xpu.has_fp64_dtype(): + try: + from .diffusers import ipex_diffusers + ipex_diffusers() + except Exception: # pylint: disable=broad-exception-caught + pass except Exception as e: return False, e return True, None diff --git a/modeling/ipex/attention.py b/modeling/ipex/attention.py index 52016466..ced59637 100644 --- a/modeling/ipex/attention.py +++ b/modeling/ipex/attention.py @@ -4,11 +4,8 @@ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unuse # pylint: disable=protected-access, missing-function-docstring, line-too-long original_torch_bmm = torch.bmm -def torch_bmm(input, mat2, *, out=None): - if input.dtype != mat2.dtype: - mat2 = mat2.to(input.dtype) - - #ARC GPUs can't allocate more than 4GB to a single block, Slice it: +def torch_bmm_32_bit(input, mat2, *, out=None): + # ARC GPUs can't allocate more than 4GB to a single block, Slice it: batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] block_multiply = input.element_size() slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply @@ -17,7 +14,7 @@ def torch_bmm(input, mat2, *, out=None): split_slice_size = batch_size_attention if block_size > 4: do_split = True - #Find something divisible with the input_tokens + # Find something divisible with the input_tokens while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: @@ -30,7 +27,7 @@ def torch_bmm(input, mat2, *, out=None): if split_slice_size * slice_block_size > 4: slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply do_split_2 = True - #Find something divisible with the input_tokens + # Find something divisible with the input_tokens while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: @@ -64,8 +61,8 @@ def torch_bmm(input, mat2, *, out=None): return hidden_states original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention -def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): - #ARC GPUs can't allocate more than 4GB to a single block, Slice it: +def scaled_dot_product_attention_32_bit(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): + # ARC GPUs can't allocate more than 4GB to a single block, Slice it: if len(query.shape) == 3: batch_size_attention, query_tokens, shape_four = query.shape shape_one = 1 @@ -74,11 +71,6 @@ def 
scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. shape_one, batch_size_attention, query_tokens, shape_four = query.shape no_shape_one = False - if query.dtype != key.dtype: - key = key.to(dtype=query.dtype) - if query.dtype != value.dtype: - value = value.to(dtype=query.dtype) - block_multiply = query.element_size() slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply block_size = batch_size_attention * slice_block_size @@ -86,7 +78,7 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. split_slice_size = batch_size_attention if block_size > 4: do_split = True - #Find something divisible with the shape_one + # Find something divisible with the shape_one while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: @@ -99,7 +91,7 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. if split_slice_size * slice_block_size > 4: slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply do_split_2 = True - #Find something divisible with the batch_size_attention + # Find something divisible with the batch_size_attention while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: @@ -155,8 +147,3 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal ) return hidden_states - -def attention_init(): - #ARC GPUs can't allocate more than 4GB to a single block: - torch.bmm = torch_bmm - torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/modeling/ipex/diffusers.py b/modeling/ipex/diffusers.py index 005ee49f..c32af507 100644 --- a/modeling/ipex/diffusers.py +++ b/modeling/ipex/diffusers.py @@ -1,6 +1,6 @@ import torch import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import diffusers #0.21.1 # pylint: disable=import-error +import diffusers #0.24.0 # pylint: disable=import-error from diffusers.models.attention_processor import Attention # pylint: disable=protected-access, missing-function-docstring, line-too-long diff --git a/modeling/ipex/gradscaler.py b/modeling/ipex/gradscaler.py index 53021210..6eb56bc2 100644 --- a/modeling/ipex/gradscaler.py +++ b/modeling/ipex/gradscaler.py @@ -5,6 +5,7 @@ import intel_extension_for_pytorch._C as core # pylint: disable=import-error, un # pylint: disable=protected-access, missing-function-docstring, line-too-long +device_supports_fp64 = torch.xpu.has_fp64_dtype() OptState = ipex.cpu.autocast._grad_scaler.OptState _MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator _refresh_per_optimizer_state = ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state @@ -96,7 +97,10 @@ def unscale_(self, optimizer): # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. 
assert self._scale is not None - inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) + if device_supports_fp64: + inv_scale = self._scale.double().reciprocal().float() + else: + inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) found_inf = torch.full( (1,), 0.0, dtype=torch.float32, device=self._scale.device ) diff --git a/modeling/ipex/hijacks.py b/modeling/ipex/hijacks.py index 88827837..fdb8dea9 100644 --- a/modeling/ipex/hijacks.py +++ b/modeling/ipex/hijacks.py @@ -92,7 +92,7 @@ def ipex_autocast(*args, **kwargs): else: return original_autocast(*args, **kwargs) -#Embedding BF16 +# Embedding BF16 original_torch_cat = torch.cat def torch_cat(tensor, *args, **kwargs): if len(tensor) == 3 and (tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype): @@ -100,7 +100,7 @@ def torch_cat(tensor, *args, **kwargs): else: return original_torch_cat(tensor, *args, **kwargs) -#Latent antialias: +# Latent antialias: original_interpolate = torch.nn.functional.interpolate def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments if antialias or align_corners is not None: @@ -120,6 +120,32 @@ def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name else: return original_linalg_solve(A, B, *args, **kwargs) +if torch.xpu.has_fp64_dtype(): + original_torch_bmm = torch.bmm + original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention +else: + # 64 bit attention workarounds for Alchemist: + try: + from .attention import torch_bmm_32_bit as original_torch_bmm + from .attention import scaled_dot_product_attention_32_bit as original_scaled_dot_product_attention + except Exception: # pylint: disable=broad-exception-caught + original_torch_bmm = torch.bmm + original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention + +# dtype errors: +def torch_bmm(input, mat2, *, out=None): + if input.dtype != mat2.dtype: + mat2 = mat2.to(input.dtype) + return original_torch_bmm(input, mat2, out=out) + +def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): + if query.dtype != key.dtype: + key = key.to(dtype=query.dtype) + if query.dtype != value.dtype: + value = value.to(dtype=query.dtype) + return original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal) + +@property def is_cuda(self): return self.device.type == 'xpu' @@ -158,12 +184,12 @@ def ipex_hijacks(): lambda orig_func, f, map_location=None, pickle_module=None, *, weights_only=False, mmap=None, **kwargs: orig_func(orig_func, f, map_location=return_xpu(map_location), pickle_module=pickle_module, weights_only=weights_only, mmap=mmap, **kwargs), lambda orig_func, f, map_location=None, pickle_module=None, *, weights_only=False, mmap=None, **kwargs: check_device(map_location)) + if hasattr(torch.xpu, "Generator"): + CondFunc('torch.Generator', + lambda orig_func, device=None: torch.xpu.Generator(return_xpu(device)), + lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu") - CondFunc('torch.Generator', - lambda orig_func, device=None: torch.xpu.Generator(return_xpu(device)), - lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu") - - #TiledVAE and ControlNet: + # TiledVAE and ControlNet: 
CondFunc('torch.batch_norm', lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, weight if weight is not None else torch.ones(input.size()[1], device=input.device), @@ -175,46 +201,51 @@ def ipex_hijacks(): bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) - #Functions with dtype errors: + # Functions with dtype errors: CondFunc('torch.nn.modules.GroupNorm.forward', lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), lambda orig_func, self, input: input.dtype != self.weight.data.dtype) - #Training: + # Training: CondFunc('torch.nn.modules.linear.Linear.forward', lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), lambda orig_func, self, input: input.dtype != self.weight.data.dtype) CondFunc('torch.nn.modules.conv.Conv2d.forward', lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), lambda orig_func, self, input: input.dtype != self.weight.data.dtype) - #BF16: + # BF16: CondFunc('torch.nn.functional.layer_norm', lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs), lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight is not None and input.dtype != weight.data.dtype) - #SwinIR BF16: + # SwinIR BF16: CondFunc('torch.nn.functional.pad', lambda orig_func, input, pad, mode='constant', value=None: orig_func(input.to(torch.float32), pad, mode=mode, value=value).to(dtype=torch.bfloat16), lambda orig_func, input, pad, mode='constant', value=None: mode == 'reflect' and input.dtype == torch.bfloat16) - #Diffusers Float64 (ARC GPUs doesn't support double or Float64): + # Diffusers Float64 (Alchemist GPUs doesn't support 64 bit): if not torch.xpu.has_fp64_dtype(): CondFunc('torch.from_numpy', lambda orig_func, ndarray: orig_func(ndarray.astype('float32')), lambda orig_func, ndarray: ndarray.dtype == float) - #Broken functions when torch.cuda.is_available is True: - #Pin Memory: + # Broken functions when torch.cuda.is_available is True: + # Pin Memory: CondFunc('torch.utils.data.dataloader._BaseDataLoaderIter.__init__', lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs), lambda orig_func, *args, **kwargs: True) - #Functions that make compile mad with CondFunc: - torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = _shutdown_workers + # Functions that make compile mad with CondFunc: torch.nn.DataParallel = DummyDataParallel + torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = _shutdown_workers + torch.autocast = ipex_autocast - torch.cat = torch_cat - torch.linalg.solve = linalg_solve - torch.UntypedStorage.is_cuda = is_cuda - torch.nn.functional.interpolate = interpolate torch.backends.cuda.sdp_kernel = return_null_context + torch.UntypedStorage.is_cuda = is_cuda + + torch.nn.functional.interpolate = interpolate + torch.linalg.solve = linalg_solve + + torch.bmm = torch_bmm + torch.cat = torch_cat + torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention From d8e90132a53fe79282f0feb25863805c6e58486e Mon Sep 17 00:00:00 2001 From: Disty0 Date: Thu, 14 Dec 2023 12:48:22 +0300 Subject: [PATCH 2/8] IPEX bundle in MKL and DPCPP --- environments/ipex.yml | 2 ++ play-ipex.sh | 12 +----------- 2 files changed, 3 insertions(+), 11 deletions(-) 
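A minimal sketch of what this commit relies on (the env path and python binary come from play-ipex.sh; the sanity check itself is illustrative and not part of the commit): once mkl and mkl-dpcpp are installed into the koboldai-ipex env, prepending that env's lib/ directory to LD_LIBRARY_PATH should let the IPEX XPU runtime resolve its oneAPI libraries without sourcing setvars.sh:

    # Mirrors the play-ipex.sh change below; assumes the ipex env is already installed.
    export LD_LIBRARY_PATH="$(realpath runtime/envs/koboldai-ipex)/lib/:$LD_LIBRARY_PATH"
    # Illustrative check: the XPU device should enumerate without a system oneAPI install.
    runtime/envs/koboldai-ipex/bin/python -c "import torch, intel_extension_for_pytorch; print(torch.xpu.device_count())"
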
diff --git a/environments/ipex.yml b/environments/ipex.yml index d322697b..49dcaffd 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -27,6 +27,8 @@ dependencies: - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.1.0a0 - intel-extension-for-pytorch==2.1.10+xpu + - mkl==2024.0.0 + - mkl-dpcpp==2024.0.0 - openvino - onnxruntime-openvino - flask-cloudflared==0.0.10 diff --git a/play-ipex.sh b/play-ipex.sh index eb6ecc29..ce5b9365 100755 --- a/play-ipex.sh +++ b/play-ipex.sh @@ -4,17 +4,7 @@ if [ ! -f "runtime/envs/koboldai-ipex/bin/python" ]; then ./install_requirements.sh ipex fi -#Set OneAPI environmet if it's not set by the user -if [ ! -x "$(command -v sycl-ls)" ] -then - echo "Setting OneAPI environment" - if [[ -z "$ONEAPI_ROOT" ]] - then - ONEAPI_ROOT=/opt/intel/oneapi - fi - source $ONEAPI_ROOT/setvars.sh -fi - +export LD_LIBRARY_PATH=$(realpath "runtime/envs/koboldai-ipex")/lib/:$LD_LIBRARY_PATH export LD_PRELOAD=/usr/lib/libstdc++.so export NEOReadDebugKeys=1 export ClDeviceGlobalMemSizeAvailablePercent=100 From 1764ec784f1c38b782669411d8642b2c688affe1 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Thu, 14 Dec 2023 13:06:18 +0300 Subject: [PATCH 3/8] IPEX remove not needed packages --- environments/ipex.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 49dcaffd..5bbbc49c 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -37,10 +37,8 @@ dependencies: - Werkzeug==2.3.7 - lupa==1.10 - transformers[sentencepiece]==4.34.0 - - intel-extension-for-transformers - huggingface_hub==0.16.4 - optimum[onnxruntime]==1.13.2 - - optimum-intel - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra From b09d1c0e6f27bf7881e692143c9ad915e2bc3fba Mon Sep 17 00:00:00 2001 From: Disty0 Date: Fri, 15 Dec 2023 12:58:06 +0300 Subject: [PATCH 4/8] IPEX add ipex.optimize_transformers and DeepSpeed support --- environments/ipex.yml | 4 ++++ modeling/inference_models/hf_torch.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/environments/ipex.yml b/environments/ipex.yml index 5bbbc49c..55629d04 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -29,6 +29,10 @@ dependencies: - intel-extension-for-pytorch==2.1.10+xpu - mkl==2024.0.0 - mkl-dpcpp==2024.0.0 + - oneccl-bind-pt==2.1.100+xpu; sys_platform == 'linux' + - impi-devel==2021.11.0; sys_platform == 'linux' + - oneccl-devel==2021.11.1; sys_platform == 'linux' + - deepspeed; sys_platform == 'linux' - openvino - onnxruntime-openvino - flask-cloudflared==0.0.10 diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index fcdd9fb9..cda44e99 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -423,6 +423,10 @@ class HFTorchInferenceModel(HFInferenceModel): torch_dtype=self._get_target_dtype(), **tf_kwargs, ) + + if hasattr(torch, "xpu") and torch.xpu.is_available and os.environ.get('DISABLE_IPEX_OPTIMIZE', None) is None: + import intel_extension_for_pytorch as ipex + model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) except Exception as e: # ...but fall back to stock HF if lazyloader fails. 
if utils.args.panic: @@ -439,6 +443,10 @@ class HFTorchInferenceModel(HFInferenceModel): **tf_kwargs, ) + if hasattr(torch, "xpu") and torch.xpu.is_available and os.environ.get('DISABLE_IPEX_OPTIMIZE', None) is None: + import intel_extension_for_pytorch as ipex + model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) + if not self.lazy_load and not self.breakmodel: # We need to move the model to the desired device if (not self.usegpu) or torch.cuda.device_count() <= 0: From c1ae1d73416150552268e0455ea92ee78284c1e3 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Fri, 15 Dec 2023 14:08:38 +0300 Subject: [PATCH 5/8] Revert bundle-in MKL and DPCPP --- environments/ipex.yml | 4 ---- modeling/inference_models/hf_torch.py | 12 ++++++++++-- play-ipex.sh | 12 +++++++++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 55629d04..ff8776a6 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -27,11 +27,7 @@ dependencies: - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.1.0a0 - intel-extension-for-pytorch==2.1.10+xpu - - mkl==2024.0.0 - - mkl-dpcpp==2024.0.0 - oneccl-bind-pt==2.1.100+xpu; sys_platform == 'linux' - - impi-devel==2021.11.0; sys_platform == 'linux' - - oneccl-devel==2021.11.1; sys_platform == 'linux' - deepspeed; sys_platform == 'linux' - openvino - onnxruntime-openvino diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index cda44e99..5f85a1a2 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -426,7 +426,11 @@ class HFTorchInferenceModel(HFInferenceModel): if hasattr(torch, "xpu") and torch.xpu.is_available and os.environ.get('DISABLE_IPEX_OPTIMIZE', None) is None: import intel_extension_for_pytorch as ipex - model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) + model = model.to(memory_format=torch.channels_last) + if hasattr(ipex, "optimize_transformers"): + model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) + else: + model = ipex.optimize(model.eval(), dtype=torch.float16, inplace=True) except Exception as e: # ...but fall back to stock HF if lazyloader fails. if utils.args.panic: @@ -445,7 +449,11 @@ class HFTorchInferenceModel(HFInferenceModel): if hasattr(torch, "xpu") and torch.xpu.is_available and os.environ.get('DISABLE_IPEX_OPTIMIZE', None) is None: import intel_extension_for_pytorch as ipex - model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) + model = model.to(memory_format=torch.channels_last) + if hasattr(ipex, "optimize_transformers"): + model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) + else: + model = ipex.optimize(model.eval(), dtype=torch.float16, inplace=True) if not self.lazy_load and not self.breakmodel: # We need to move the model to the desired device diff --git a/play-ipex.sh b/play-ipex.sh index ce5b9365..eb6ecc29 100755 --- a/play-ipex.sh +++ b/play-ipex.sh @@ -4,7 +4,17 @@ if [ ! -f "runtime/envs/koboldai-ipex/bin/python" ]; then ./install_requirements.sh ipex fi -export LD_LIBRARY_PATH=$(realpath "runtime/envs/koboldai-ipex")/lib/:$LD_LIBRARY_PATH +#Set OneAPI environmet if it's not set by the user +if [ ! 
-x "$(command -v sycl-ls)" ] +then + echo "Setting OneAPI environment" + if [[ -z "$ONEAPI_ROOT" ]] + then + ONEAPI_ROOT=/opt/intel/oneapi + fi + source $ONEAPI_ROOT/setvars.sh +fi + export LD_PRELOAD=/usr/lib/libstdc++.so export NEOReadDebugKeys=1 export ClDeviceGlobalMemSizeAvailablePercent=100 From 9dbb556cc1c2dafd87c2df8a4dc6cf64dcf1f503 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Fri, 15 Dec 2023 14:42:27 +0300 Subject: [PATCH 6/8] Revert ipex.optimize_transformers --- environments/ipex.yml | 4 ---- modeling/inference_models/hf_torch.py | 16 ---------------- 2 files changed, 20 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index ff8776a6..6d7978b2 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -27,8 +27,6 @@ dependencies: - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.1.0a0 - intel-extension-for-pytorch==2.1.10+xpu - - oneccl-bind-pt==2.1.100+xpu; sys_platform == 'linux' - - deepspeed; sys_platform == 'linux' - openvino - onnxruntime-openvino - flask-cloudflared==0.0.10 @@ -57,8 +55,6 @@ dependencies: - einops - peft==0.3.0 - scipy - - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml - omegaconf \ No newline at end of file diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 5f85a1a2..fcdd9fb9 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -423,14 +423,6 @@ class HFTorchInferenceModel(HFInferenceModel): torch_dtype=self._get_target_dtype(), **tf_kwargs, ) - - if hasattr(torch, "xpu") and torch.xpu.is_available and os.environ.get('DISABLE_IPEX_OPTIMIZE', None) is None: - import intel_extension_for_pytorch as ipex - model = model.to(memory_format=torch.channels_last) - if hasattr(ipex, "optimize_transformers"): - model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) - else: - model = ipex.optimize(model.eval(), dtype=torch.float16, inplace=True) except Exception as e: # ...but fall back to stock HF if lazyloader fails. 
if utils.args.panic: @@ -447,14 +439,6 @@ class HFTorchInferenceModel(HFInferenceModel): **tf_kwargs, ) - if hasattr(torch, "xpu") and torch.xpu.is_available and os.environ.get('DISABLE_IPEX_OPTIMIZE', None) is None: - import intel_extension_for_pytorch as ipex - model = model.to(memory_format=torch.channels_last) - if hasattr(ipex, "optimize_transformers"): - model = ipex.optimize_transformers(model.eval(), dtype=torch.float16, device="xpu", inplace=True) - else: - model = ipex.optimize(model.eval(), dtype=torch.float16, inplace=True) - if not self.lazy_load and not self.breakmodel: # We need to move the model to the desired device if (not self.usegpu) or torch.cuda.device_count() <= 0: From 879e964d93177e0fecd8020e9e3ec830799728f6 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 18 Dec 2023 13:59:25 +0300 Subject: [PATCH 7/8] IPEX upgrade Python 3.10 and add Windows wheels --- environments/ipex.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 6d7978b2..6aab2df6 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -8,7 +8,7 @@ dependencies: - flask-socketio=5.3.2 - flask-session=0.5.0 - python-socketio=5.7.2 - - python=3.8.* + - python=3.10.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -24,21 +24,23 @@ dependencies: - psutil - ffmpeg - pip: + - https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp310-cp310-win_amd64.whl; sys_platform == 'win32' + - https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp310-cp310-win_amd64.whl; sys_platform == 'win32' - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - - torch==2.1.0a0 - - intel-extension-for-pytorch==2.1.10+xpu + - torch==2.1.0a0; sys_platform == 'linux' + - intel-extension-for-pytorch==2.1.10+xpu; sys_platform == 'linux' - openvino - onnxruntime-openvino - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors - Werkzeug==2.3.7 - - lupa==1.10 - - transformers[sentencepiece]==4.34.0 - - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.13.2 - - safetensors==0.3.3 - - accelerate==0.21.0 + - lupa==1.12 + - transformers[sentencepiece]==4.33.3 + - huggingface_hub==0.19.4 + - optimum[onnxruntime]==1.16.1 + - safetensors==0.4.1 + - accelerate==0.25.0 - git+https://github.com/VE-FORBRYDERNE/mkultra - flask-session - ansi2html @@ -48,12 +50,10 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.5.1%2Bcu118-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; sys_platform == 'linux' + - https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.5.1%2Bcu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' - einops - - peft==0.3.0 + - peft==0.7.1 - scipy - windows-curses; 
sys_platform == 'win32' - pynvml From 59918a7dd2404c46fb0926fff7ccfaeeb461fb18 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 18 Dec 2023 17:14:14 +0300 Subject: [PATCH 8/8] Add BigDL LLM backend --- environments/ipex.yml | 2 + modeling/inference_models/hf_bigdl/class.py | 341 ++++++++++++++++++++ play-ipex.sh | 1 - 3 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 modeling/inference_models/hf_bigdl/class.py diff --git a/environments/ipex.yml b/environments/ipex.yml index 6aab2df6..b4111dd8 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -29,6 +29,8 @@ dependencies: - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.1.0a0; sys_platform == 'linux' - intel-extension-for-pytorch==2.1.10+xpu; sys_platform == 'linux' + - bigdl-llm + - bigdl_core_xe - openvino - onnxruntime-openvino - flask-cloudflared==0.0.10 diff --git a/modeling/inference_models/hf_bigdl/class.py b/modeling/inference_models/hf_bigdl/class.py new file mode 100644 index 00000000..4966081e --- /dev/null +++ b/modeling/inference_models/hf_bigdl/class.py @@ -0,0 +1,341 @@ +from __future__ import annotations +try: + import os + import json + import shutil + import traceback + from typing import Dict + + import torch + from torch.nn import Embedding + from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME + + import utils + from logger import logger + from modeling.inference_models.hf_torch import HFTorchInferenceModel + + from bigdl.llm.transformers import AutoModelForCausalLM + + load_failed = False +except Exception: + load_failed = True + +model_backend_name = "BigDL LLM" +model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) + +class model_backend(HFTorchInferenceModel): + def __init__(self) -> None: + super().__init__() + self.lazy_load = False + self.nobreakmodel = True + self.disable = load_failed + self.has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) + + def _get_model(self, location: str, tf_kwargs: Dict): + tf_kwargs["revision"] = utils.koboldai_vars.revision + tf_kwargs["cache_dir"] = "cache" + tf_kwargs["load_in_4bit"] = True + + tf_kwargs.pop("low_cpu_mem_usage", None) + + # Try to determine model type from either AutoModel or falling back to legacy + try: + model = AutoModelForCausalLM.from_pretrained( + location, + offload_folder="accelerate-disk-cache", + torch_dtype=self._get_target_dtype(), + **tf_kwargs, + ) + + # We need to move the model to the desired device + if (not self.usegpu) or (torch.cuda.device_count() <= 0 and not self.has_xpu): + model = model.to("cpu") + elif self.has_xpu: + model = model.to("xpu") + else: + model = model.to("cuda") + + return model + except Exception as e: + traceback_string = traceback.format_exc().lower() + + if "out of memory" in traceback_string: + raise RuntimeError( + "One of your GPUs ran out of memory when KoboldAI tried to load your model." + ) + + # Model corrupted or serious loading problem. Stop here. + if "invalid load key" in traceback_string: + logger.error("Invalid load key! 
Aborting.") + raise + + if utils.args.panic: + raise + + logger.warning(f"Failed to load model: {e}") + logger.debug(traceback.format_exc()) + + # Function to patch transformers to use our soft prompt + def patch_embedding(self) -> None: + if getattr(Embedding, "_koboldai_patch_causallm_model", None): + Embedding._koboldai_patch_causallm_model = self.model + return + + old_embedding_call = Embedding.__call__ + + kai_model = self + + def new_embedding_call(self, input_ids, *args, **kwargs): + # Don't touch embeddings for models other than the core inference model (that's us!) + if ( + Embedding._koboldai_patch_causallm_model.get_input_embeddings() + is not self + ): + return old_embedding_call(self, input_ids, *args, **kwargs) + + assert input_ids is not None + + if utils.koboldai_vars.sp is not None: + shifted_input_ids = input_ids - kai_model.model.vocab_size + + input_ids.clamp_(max=kai_model.model.config.vocab_size - 1) + inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs) + + if utils.koboldai_vars.sp is not None: + utils.koboldai_vars.sp = utils.koboldai_vars.sp.to( + inputs_embeds.dtype + ).to(inputs_embeds.device) + inputs_embeds = torch.where( + (shifted_input_ids >= 0)[..., None], + utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)], + inputs_embeds, + ) + + return inputs_embeds + + Embedding.__call__ = new_embedding_call + Embedding._koboldai_patch_causallm_model = self.model + + def is_valid(self, model_name, model_path, menu_path): + base_is_valid = super().is_valid(model_name, model_path, menu_path) + path = False + gen_path = "models/{}".format(model_name.replace('/', '_')) + if model_path is not None and os.path.exists(model_path): + path = model_path + elif os.path.exists(gen_path): + path = gen_path + + fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME] + + return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames) + + def _initialize_model(self): + return + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + return super().get_requested_parameters(model_name, model_path, menu_path, parameters) + + def set_input_parameters(self, parameters): + super().set_input_parameters(parameters) + self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else False + + def _load(self, save_model: bool, initial_load: bool) -> None: + utils.koboldai_vars.allowsp = True + + # Make model path the same as the model name to make this consistent + # with the other loading method if it isn't a known model type. 
This + # code is not just a workaround for below, it is also used to make the + # behavior consistent with other loading methods - Henk717 + # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: + # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model + + if self.model_name == "NeoCustom": + self.model_name = os.path.basename(os.path.normpath(self.path)) + utils.koboldai_vars.model = self.model_name + + # If we specify a model and it's in the root directory, we need to move + # it to the models directory (legacy folder structure to new) + if self.get_local_model_path(legacy=True): + shutil.move( + self.get_local_model_path(legacy=True, ignore_existance=True), + self.get_local_model_path(ignore_existance=True), + ) + + self.init_model_config() + + tf_kwargs = { + "low_cpu_mem_usage": True, + "use_cache": True # Workaround for models that accidentally turn cache to false + } + + + if self.model_type == "llama": + tf_kwargs.update({ + "pretraining_tp": 1 # Workaround recommended by HF to fix their mistake on the config.json tuners adopted + }) + + logger.debug( + "hasgpu: {}".format( + utils.koboldai_vars.hascuda, + ) + ) + + # Download model from Huggingface if it does not exist, otherwise load locally + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + # Model not stored locally, we need to download it. + + # _rebuild_tensor patch for casting dtype and supporting LazyTensors + old_rebuild_tensor = torch._utils._rebuild_tensor + + def new_rebuild_tensor( + storage: torch.Storage, + storage_offset, + shape, + stride, + ): + dtype = storage.dtype + if dtype is torch.float32 and len(shape) >= 2: + utils.koboldai_vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) + + torch._utils._rebuild_tensor = new_rebuild_tensor + self.model = self._get_model(self.model_name, tf_kwargs) + self.tokenizer = self._get_tokenizer(self.model_name) + torch._utils._rebuild_tensor = old_rebuild_tensor + + if save_model: + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) + + if utils.koboldai_vars.fp32_model: + # Use save_pretrained to convert fp32 models to fp16, + # unless we are using disk cache because save_pretrained + # is not supported in that case + self.model = self.model.half() + self.model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", + ) + + else: + # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + import huggingface_hub + + # Save the config.json + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + transformers.configuration_utils.CONFIG_NAME, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.configuration_utils.CONFIG_NAME, + ), + ) + + if utils.num_shards is None: + # Save the pytorch_model.bin or model.safetensors of an unsharded model + any_success = False + possible_checkpoint_names = [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ] + + for possible_checkpoint_name in possible_checkpoint_names: + try: + shutil.move( + os.path.realpath( + 
huggingface_hub.hf_hub_download( + self.model_name, + possible_checkpoint_name, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + possible_checkpoint_name, + ), + ) + any_success = True + except Exception: + pass + + if not any_success: + raise RuntimeError( + f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'" + ) + else: + # Handle saving sharded models + + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move( + os.path.realpath(utils.from_pretrained_index_filename), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), + ) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + filename, + ), + ) + shutil.rmtree("cache/") + + self.patch_embedding() + + self.model.kai_model = self + utils.koboldai_vars.modeldim = self.get_hidden_size() + + def _save_settings(self): + with open( + "settings/{}.hf_bigdl.model_backend.settings".format( + self.model_name.replace("/", "_") + ), + "w", + ) as f: + json.dump( + { + "layers": self.layers if "layers" in vars(self) else [], + "disk_layers": self.disk_layers + if "disk_layers" in vars(self) + else 0, + }, + f, + indent="", + ) diff --git a/play-ipex.sh b/play-ipex.sh index eb6ecc29..07fdb155 100755 --- a/play-ipex.sh +++ b/play-ipex.sh @@ -15,7 +15,6 @@ then source $ONEAPI_ROOT/setvars.sh fi -export LD_PRELOAD=/usr/lib/libstdc++.so export NEOReadDebugKeys=1 export ClDeviceGlobalMemSizeAvailablePercent=100
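
A rough usage sketch of how the PATCH 1/8 initialization is meant to be consumed (the tensor shapes and call sites below are illustrative, not taken from the patches): after ipex_init() returns successfully, CUDA-oriented caller code keeps working because the torch.cuda namespace is rebound to torch.xpu, and the hijacked bmm / scaled_dot_product_attention wrappers cast mismatched dtypes and, on Alchemist GPUs without FP64 support, fall back to the sliced 32-bit attention kernels from attention.py:

    # Illustrative only -- not part of the patch series.
    import torch
    from modeling.ipex import ipex_init  # importing this also pulls in intel_extension_for_pytorch

    ok, err = ipex_init()
    if not ok:
        raise RuntimeError(f"IPEX init failed: {err}")

    # Existing CUDA-flavoured code paths now target the Intel GPU:
    free, total = torch.cuda.mem_get_info(0)  # aliased to the XPU lambda in PATCH 1/8
    q = torch.randn(1, 4096, 64, device="xpu", dtype=torch.float16)
    k = torch.randn(1, 4096, 64, device="xpu", dtype=torch.float16)
    v = torch.randn(1, 4096, 64, device="xpu", dtype=torch.float16)
    # Routed through the hijacked wrapper; without FP64 support this slices the
    # work into blocks that fit Alchemist's 4GB single-allocation limit.
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    print(out.shape, free, total)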