Merge branch 'concedo_united_PR' of https://github.com/LostRuins/KoboldAI into concedo_united_PR

Concedo
2023-10-06 16:00:42 +08:00
11 changed files with 495 additions and 40 deletions

View File

@@ -1,7 +1,7 @@
name: koboldai
channels:
- pytorch
- nvidia
- nvidia/label/cuda-11.8.0
- conda-forge
- defaults
dependencies:
@@ -13,6 +13,8 @@ dependencies:
- pytorch=2.0.*
- python=3.8.*
- pytorch-cuda=11.8
- cuda-nvcc=11.8
- cuda-libraries-dev=11.8
- eventlet=0.33.3
- dnspython=2.2.1
- markdown
@@ -31,9 +33,9 @@ dependencies:
- flask-ngrok
- flask-cors
- lupa==1.10
- transformers[sentencepiece]==4.33.1
- transformers[sentencepiece]==4.34.0
- huggingface_hub==0.16.4
- optimum[onnxruntime]==1.12.0
- optimum[onnxruntime]==1.13.2
- safetensors==0.3.3
- accelerate==0.21.0
- git+https://github.com/VE-FORBRYDERNE/mkultra
@@ -49,13 +51,16 @@ dependencies:
- git+https://github.com/0cc4m/hf_bleeding_edge/
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- einops
- peft==0.3.0
- scipy
- https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- windows-curses; sys_platform == 'win32'
- pynvml
- xformers==0.0.21
- https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
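A note on the platform suffixes above: the "; sys_platform == 'linux'" / "; sys_platform == 'win32'" parts are PEP 508 environment markers, so pip installs a given wheel line only when its marker evaluates to true on the machine doing the install. A minimal sketch of evaluating such a marker, assuming the packaging distribution is available (pip install packaging); the marker string is taken from the lines above:

    from packaging.markers import Marker

    # Evaluates the marker against the current interpreter and platform, the same
    # check pip performs at install time; on Linux this prints True and the
    # linux_x86_64 wheel line would be selected.
    marker = Marker("sys_platform == 'linux'")
    print(marker.evaluate())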

View File

@@ -23,7 +23,7 @@ dependencies:
- Pillow
- psutil
- pip:
- -f https://developer.intel.com/ipex-whl-stable-xpu
- --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
- torch==2.0.1a0; sys_platform == 'linux'
- torch==2.0.0a0; sys_platform == 'win32'
- intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux'
@@ -33,9 +33,9 @@ dependencies:
- flask-ngrok
- flask-cors
- lupa==1.10
- transformers[sentencepiece]==4.33.1
- transformers[sentencepiece]==4.34.0
- huggingface_hub==0.16.4
- optimum[openvino,nncf,neural-compressor]==1.12.0
- optimum[onnxruntime,openvino,nncf,neural-compressor]==1.13.2
- safetensors==0.3.3
- accelerate==0.21.0
- git+https://github.com/VE-FORBRYDERNE/mkultra
@@ -49,8 +49,8 @@ dependencies:
- git+https://github.com/0cc4m/hf_bleeding_edge/
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- einops
- peft==0.3.0
- scipy

View File

@@ -29,9 +29,9 @@ dependencies:
- flask-ngrok
- flask-cors
- lupa==1.10
- transformers[sentencepiece]==4.33.1
- transformers[sentencepiece]==4.34.0
- huggingface_hub==0.16.4
- optimum[onnxruntime]==1.12.0
- optimum[onnxruntime]==1.13.2
- safetensors==0.3.3
- accelerate==0.21.0
- git+https://github.com/VE-FORBRYDERNE/mkultra
@@ -45,4 +45,5 @@ dependencies:
- einops
- peft==0.3.0
- windows-curses; sys_platform == 'win32'
- pynvml
- pynvml
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+rocm5.4.2-cp38-cp38-linux_x86_64.whl

View File

@@ -148,6 +148,13 @@ class model_backend(InferenceModel):
self.get_local_model_path(ignore_existance=True),
)
if not self.get_local_model_path():
print(self.get_local_model_path())
from huggingface_hub import snapshot_download
target_dir = "models/" + self.model_name.replace("/", "_")
print(self.model_name)
snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision)
self.init_model_config()
self.model = AutoModelForCausalLM.from_pretrained(

View File

@@ -128,6 +128,12 @@ class model_backend(InferenceModel):
return config
def _load(self, save_model: bool, initial_load: bool) -> None:
if not self.get_local_model_path():
from huggingface_hub import snapshot_download
target_dir = "models/" + self.model_name.replace("/", "_")
print(self.model_name)
snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision)
self.model = self._get_model(self.get_local_model_path(), {})
self.tokenizer = self._get_tokenizer(self.get_local_model_path())
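The fallback added in the two hunks above downloads the full model repository into models/<name> when no local copy exists. A standalone sketch of the same huggingface_hub call, with an example model id and the revision argument omitted (in the diff it comes from utils.koboldai_vars.revision):

    from huggingface_hub import snapshot_download

    model_name = "KoboldAI/fairseq-dense-125M"  # example id, not taken from the commit
    target_dir = "models/" + model_name.replace("/", "_")

    # local_dir_use_symlinks=False copies real files into target_dir instead of
    # symlinking back into the Hugging Face cache directory.
    snapshot_download(model_name, local_dir=target_dir,
                      local_dir_use_symlinks=False, cache_dir="cache/")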

View File

@@ -0,0 +1,422 @@
from __future__ import annotations
try:
import time, json
import torch
import requests
import numpy as np
from typing import Dict, List, Optional, Union
import os
import glob
from pathlib import Path
import re
import warnings
import gc
import utils
from logger import logger
from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
InferenceModel,
ModelCapabilities,
)
from modeling.tokenizer import GenericTokenizer
from exllamav2.model import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
from transformers import LlamaTokenizer
from exllamav2.generator import ExLlamaV2StreamingGenerator
load_failed = False
except:
load_failed = True
model_backend_type = "GPTQ"
model_backend_name = "ExLlama V2"
# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.model_config = None
self.model = None
self.tokenizer = None
self.cache = None
self.generator = None
self.model_name = ""
self.path = None
self.post_token_hooks = [
PostTokenHooks.stream_tokens,
]
self.stopper_hooks = [
Stoppers.core_stopper,
Stoppers.dynamic_wi_scanner,
Stoppers.singleline_stopper,
Stoppers.chat_mode_stopper,
Stoppers.stop_sequence_stopper,
]
self.capabilties = ModelCapabilities(
embedding_manipulation=False,
post_token_hooks=True,
stopper_hooks=True,
post_token_probs=False,
)
self.disable = load_failed
def is_valid(self, model_name, model_path, menu_path):
try:
self.model_config = self._load_config(model_name, model_path)
#TODO check if model is valid
return True
except:
return False
def get_local_model_path(self):
return self.path or os.path.join("models", self.model_name.replace("/", "_"))
def _load_config(self, model_name, model_path):
config = ExLlamaV2Config()
if model_path is not None and os.path.exists(model_path):
config.model_dir = model_path
elif os.path.exists("models/{}".format(model_name.replace('/', '_'))):
config.model_dir = "models/{}".format(model_name.replace('/', '_'))
config.prepare()
return config
def _load(self, save_model: bool, initial_load: bool) -> None:
if not self.get_local_model_path():
from huggingface_hub import snapshot_download
target_dir = "models/" + self.model_name.replace("/", "_")
print(self.model_name)
snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision)
self.model = self._get_model(self.get_local_model_path(), {})
#TODO support GPU split
self.model.load(None)
self.tokenizer = self._get_tokenizer(self.get_local_model_path())
self.cache = ExLlamaV2Cache(self.model)
self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer.tokenizer)
def _post_load(self) -> None:
# Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
self.tokenizer.add_bos_token = False
# HF transformers no longer supports decode_with_prefix_space
# We work around this by wrapping decode, encode, and __call__
# with versions that work around the 'prefix space' misfeature
# of sentencepiece.
vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}
# Wrap 'decode' with a method that always returns text starting with a space
# when the head token starts with a space. This is what 'decode_with_prefix_space'
# used to do, and we implement it using the same technique (building a cache of
# tokens that should have a prefix space, and then prepending a space if the first
# token is in this set.) We also work around a bizarre behavior in which decoding
# a single token 13 behaves differently than decoding a sequence containing only [13].
original_decode = type(self.tokenizer.tokenizer).decode
def decode_wrapper(self, token_ids, *args, **kwargs):
first = None
# Note, the code below that wraps single-value token_ids in a list
# is to work around this wonky behavior:
# >>> t.decode(13)
# '<0x0A>'
# >>> t.decode([13])
# '\n'
# Not doing this causes token streaming to receive <0x0A> characters
# instead of newlines.
if isinstance(token_ids, int):
first = token_ids
token_ids = [first]
elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor
# Tensors don't support the Python standard of 'empty is False'
# and the special case of dimension 0 tensors also needs to be
# handled separately.
if token_ids.dim() == 0:
first = int(token_ids.item())
token_ids = [first]
elif len(token_ids) > 0:
first = int(token_ids[0])
elif token_ids is not None and len(token_ids) > 0:
first = token_ids[0]
result = original_decode(self, token_ids, *args, **kwargs)
if first is not None and first in has_prefix_space:
result = " " + result
return result
# GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it
object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))
# Wrap encode and __call__ to work around the 'prefix space' misfeature also.
# The problem is that "Bob" at the start of text is encoded as if it is
# " Bob". This creates a problem because it means you can't split text, encode
# the pieces, concatenate the tokens, decode them, and get the original text back.
# The workaround is to prepend a known token that (1) starts with a space; and
# (2) is not the prefix of any other token. After searching through the vocab
# " ," (space comma) is the only token containing only printable ascii characters
# that fits this bill. By prepending ',' to the text, the original encode
# method always returns [1919, ...], where the tail of the sequence is the
# actual encoded result we want without the prefix space behavior.
original_encode = type(self.tokenizer.tokenizer).encode
def encode_wrapper(self, text, *args, **kwargs):
if type(text) is str:
text = ',' + text
result = original_encode(self, text, *args, **kwargs)
result = result[1:]
else:
result = original_encode(self, text, *args, **kwargs)
return result
object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))
# Since 'encode' is documented as being deprecated, also override __call__.
# This doesn't appear to be used by KoboldAI at the moment, but we wrap it anyway
# in case someone uses it in the future.
original_call = type(self.tokenizer.tokenizer).__call__
def call_wrapper(self, text, *args, **kwargs):
if type(text) is str:
text = ',' + text
result = original_call(self, text, *args, **kwargs)
result = result[1:]
else:
result = original_call(self, text, *args, **kwargs)
return result
object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
# Cache the newline token (for single line mode)
# Since there is only one Llama token containing newline, just encode \n
self.newline_tokens = self.tokenizer.encode("\n")
self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
self.tokenizer._koboldai_header = self.tokenizer.encode("")
def unload(self):
#self.model_config = None # This breaks more than it fixes - Henk
self.model = None
self.tokenizer = None
self.cache = None
self.generator = None
self.model_name = ""
self.path = None
with torch.no_grad():
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
for tensor in gc.get_objects():
try:
if torch.is_tensor(tensor):
tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
except:
pass
gc.collect()
try:
with torch.no_grad():
torch.cuda.empty_cache()
except:
pass
def _apply_warpers(
self, scores: torch.Tensor, input_ids: torch.Tensor
) -> torch.Tensor:
warpers.update_settings()
if LOG_SAMPLER_NO_EFFECT:
pre = torch.Tensor(scores)
for sid in utils.koboldai_vars.sampler_order:
warper = Warper.from_id(sid)
if not warper.value_is_valid():
continue
if warper == warpers.RepetitionPenalty:
# Rep pen needs more data than other samplers
scores = warper.torch(scores, input_ids=input_ids.cuda())
else:
scores = warper.torch(scores)
assert scores is not None, f"Scores are None; warper '{warper}' is to blame"
if LOG_SAMPLER_NO_EFFECT:
if torch.equal(pre, scores):
logger.info(f"{warper} had no effect on the scores.")
pre = torch.Tensor(scores)
return scores
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
max_new: int,
gen_settings: GenerationSettings,
single_line: bool = False,
batch_count: int = 1,
seed: Optional[int] = None,
**kwargs,
) -> GenerationResult:
if seed:
torch.manual_seed(seed)
bad_words_ids = [self.tokenizer.bos_token_id]
if utils.koboldai_vars.use_default_badwordsids:
bad_words_ids.append(self.tokenizer.eos_token_id)
bad_words_ids.extend(self.bracket_tokens)
if single_line:
bad_words_ids.extend(self.newline_tokens)
if not isinstance(prompt_tokens, torch.Tensor):
gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
else:
gen_in = prompt_tokens
self.generator._gen_begin_reuse(gen_in, None)
for i in range(max_new):
logits = self.model.forward(self.generator.sequence_ids[:, -1:], self.generator.cache)
for bad_word_id in bad_words_ids:
logits[:, :, bad_word_id] = -10000.0
logits = torch.unsqueeze(logits[0, -1, :], 0)
scores = self._apply_warpers(logits, gen_in)
scores = torch.softmax(scores, dim=-1)
# Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841)
# With low probability, multinomial can return an element with zero weight. Since this
# happens infrequently, just sample repeatedly until all tokens have non-zero probability.
for _ in range(100):
token = torch.multinomial(scores, 1)
# Verify that all selected tokens correspond to positive probabilities.
if (scores.gather(1, token) > 0).all():
break
if (token == self.tokenizer.eos_token_id).any():
break
if self.generator.sequence_ids is None:
self.generator.sequence_ids = token
else:
self.generator.sequence_ids = torch.cat([self.generator.sequence_ids, token.cpu()], dim=1)
self._post_token_gen(self.generator.sequence_ids)
utils.koboldai_vars.generated_tkns += 1
# Apply stoppers
do_stop = False
for stopper in self.stopper_hooks:
do_stop = stopper(self, self.generator.sequence_ids)
if do_stop:
break
if do_stop:
break
seq = self.generator.sequence_ids[:, gen_in.size(1):]
return GenerationResult(
model=self,
out_batches=np.array(seq,),
prompt=prompt_tokens,
is_whole_generation=True,
single_line=single_line,
)
def _get_model(self, location: str, tf_kwargs: Dict):
if not self.model_config:
self.model_config = ExLlamaV2Config()
self.model_config.model_dir = location
self.model_config.prepare()
# self.model_config.gpu_peer_fix = True
return ExLlamaV2(self.model_config)
def _get_tokenizer(self, location: str):
tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location))
return tokenizer
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
requested_parameters = []
gpu_count = torch.cuda.device_count()
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "Maximum Context",
"id": "max_ctx",
"min": 2048,
"max": 16384,
"step": 512,
"default": 2048,
"tooltip": "The maximum context size the model supports",
"menu_path": "Configuration",
"extra_classes": "",
"refresh_model_inputs": False
})
requested_parameters.append({
"uitype": "slider",
"unit": "float",
"label": "Embedding Compression",
"id": "compress_emb",
"min": 1,
"max": 8,
"step": 0.25,
"default": 1,
"tooltip": "If the model requires compressed embeddings, set them here",
"menu_path": "Configuration",
"extra_classes": "",
"refresh_model_inputs": False
})
requested_parameters.append({
"uitype": "slider",
"unit": "float",
"label": "NTK alpha",
"id": "ntk_alpha",
"min": 1,
"max": 32,
"step": 0.25,
"default": 1,
"tooltip": "NTK alpha value",
"menu_path": "Configuration",
"extra_classes": "",
"refresh_model_inputs": False
})
return requested_parameters
def set_input_parameters(self, parameters):
gpu_count = torch.cuda.device_count()
self.model_config.max_seq_len = parameters["max_ctx"]
self.model_config.compress_pos_emb = parameters["compress_emb"]
self.model_config.alpha_value = parameters["ntk_alpha"]
# Disable half2 for HIP
self.model_config.rmsnorm_no_half2 = bool(torch.version.hip)
self.model_config.rope_no_half2 = bool(torch.version.hip)
self.model_config.matmul_no_half2 = bool(torch.version.hip)
self.model_config.silu_no_half2 = bool(torch.version.hip)
# Disable scaled_dot_product_attention if torch version < 2
if torch.__version__.startswith("1."):
self.model_config.sdp_thd = 0
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
self.path = parameters['path'] if 'path' in parameters else None
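One implementation detail from _post_load above: because GenericTokenizer overrides __setattr__, the decode/encode/__call__ wrappers are attached with object.__setattr__ and bound to the instance via __get__. A toy sketch of that binding pattern using made-up names (Strict, greet_wrapper), independent of any tokenizer:

    # A class that blocks ordinary attribute assignment, standing in for
    # GenericTokenizer's custom __setattr__.
    class Strict:
        def __setattr__(self, name, value):
            raise AttributeError("attributes are managed elsewhere")

    def greet_wrapper(self):
        return "wrapped"

    obj = Strict()
    # obj.greet = ... would raise, so bypass the override with object.__setattr__.
    # greet_wrapper.__get__(obj) turns the plain function into a method bound to
    # this one instance, so obj.greet() receives obj as self.
    object.__setattr__(obj, "greet", greet_wrapper.__get__(obj))
    print(obj.greet())  # -> "wrapped"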

View File

@@ -232,7 +232,7 @@ class HFInferenceModel(InferenceModel):
self.model_type = str(self.model_config.model_type)
# These are model specific tokenizer overrides if a model has bad defaults
if self.model_type == "llama":
if self.model_type == "llama" or self.model_type == "mistral":
# Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
self.tokenizer.add_bos_token = False
self.tokenizer.legacy = False

View File

@@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.device = torch.xpu.device
torch.cuda.device_count = torch.xpu.device_count
torch.cuda.device_of = torch.xpu.device_of
torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_name = torch.xpu.get_device_name
torch.cuda.get_device_properties = torch.xpu.get_device_properties
torch.cuda.init = torch.xpu.init
@@ -145,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements
ipex._C._DeviceProperties.minor = 2
#Fix functions with ipex:
torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory]
torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory]
torch._utils._get_available_device_type = lambda: "xpu"
torch.has_cuda = True
torch.cuda.has_half = True
@@ -157,6 +156,12 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.get_device_properties.minor = 7
torch.cuda.ipc_collect = lambda *args, **kwargs: None
torch.cuda.utilization = lambda *args, **kwargs: 0
if hasattr(torch.xpu, 'getDeviceIdListForCard'):
torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
else:
torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card
ipex_hijacks()
attention_init()
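For context, the mem_get_info shim above now subtracts memory_reserved rather than memory_allocated, giving a more conservative estimate of free memory, and the getDeviceIdListForCard alias is only taken from torch.xpu when that attribute actually exists. A minimal usage sketch, assuming ipex_init() has already run on an XPU machine; like stock torch.cuda.mem_get_info, the shim yields free and total bytes:

    import torch

    # CUDA-oriented callers keep working unchanged: the shim returns
    # [free_bytes, total_bytes] for the XPU device.
    free, total = torch.cuda.mem_get_info()
    print(f"{free / 1024**3:.1f} GiB free of {total / 1024**3:.1f} GiB")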

View File

@@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None):
#ARC GPUs can't allocate more than 4GB to a single block, Slice it:
batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2]
block_multiply = 2.4 if input.dtype == torch.float32 else 1.2
block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB
block_multiply = input.element_size()
slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
block_size = batch_size_attention * slice_block_size
split_slice_size = batch_size_attention
if block_size >= 4000:
if block_size > 4:
do_split = True
#Find something divisible with the input_tokens
while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000:
while (split_slice_size * slice_block_size) > 4:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
@@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None):
else:
do_split = False
split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB
split_2_slice_size = input_tokens
if split_block_size >= 4000:
if split_slice_size * slice_block_size > 4:
slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
do_split_2 = True
#Find something divisible with the input_tokens
while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000:
while (split_2_slice_size * slice_block_size2) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
@@ -71,13 +73,16 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
else:
shape_one, batch_size_attention, query_tokens, shape_four = query.shape
no_shape_one = False
block_multiply = 3.6 if query.dtype == torch.float32 else 1.8
block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB
block_multiply = query.element_size()
slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
block_size = batch_size_attention * slice_block_size
split_slice_size = batch_size_attention
if block_size >= 4000:
if block_size > 4:
do_split = True
#Find something divisible with the shape_one
while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000:
while (split_slice_size * slice_block_size) > 4:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
@@ -85,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
else:
do_split = False
split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB
split_2_slice_size = query_tokens
if split_block_size >= 4000:
if split_slice_size * slice_block_size > 4:
slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
do_split_2 = True
#Find something divisible with the batch_size_attention
while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000:
while (split_2_slice_size * slice_block_size2) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
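The rewritten sizing above derives the per-slice cost from element_size() instead of the old hard-coded dtype multipliers. A worked example that simply follows the new torch_bmm arithmetic, with invented shapes (batch 16, 512 tokens, mat2 width 512, fp16):

    # Mirrors the variables in the hunk above; values are illustrative only.
    batch_size_attention, input_tokens, mat2_shape = 16, 512, 512
    block_multiply = 2  # torch.float16 element_size()

    slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply  # 0.5
    block_size = batch_size_attention * slice_block_size                          # 8.0

    # 8.0 is above the threshold of 4, so the loop halves the batch slice once:
    # 16 -> 8, and 8 * 0.5 = 4.0 no longer exceeds 4, leaving split_slice_size = 8.
    # The bmm would then run over two batch slices of 8 (the slicing loop itself
    # is outside the hunk shown here).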

View File

@@ -55,13 +55,14 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods
)
#ARC GPUs can't allocate more than 4GB to a single block, Slice it:
block_multiply = 2.4 if query.dtype == torch.float32 else 1.2
block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB
block_multiply = query.element_size()
slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply
block_size = query_tokens * slice_block_size
split_2_slice_size = query_tokens
if block_size >= 4000:
if block_size > 4:
do_split_2 = True
#Find something divisible with the query_tokens
while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000:
while (split_2_slice_size * slice_block_size) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1

View File

@@ -1,6 +1,6 @@
transformers[sentencepiece]==4.33.1
transformers[sentencepiece]==4.34.0
huggingface_hub==0.16.4
optimum[onnxruntime]==1.12.0
optimum[onnxruntime]==1.13.2
safetensors==0.3.3
Flask==2.3.3
Flask-SocketIO==5.3.2
@@ -41,9 +41,12 @@ git+https://github.com/0cc4m/hf_bleeding_edge/
einops
peft==0.3.0
scipy
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8'
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8'
windows-curses; sys_platform == 'win32'
pynvml
flash_attn==2.3.0
xformers==0.0.21
exllamav2==0.0.4