Merge branch 'concedo_united_PR' of https://github.com/LostRuins/KoboldAI into concedo_united_PR

@@ -1,7 +1,7 @@
 name: koboldai
 channels:
   - pytorch
-  - nvidia
+  - nvidia/label/cuda-11.8.0
   - conda-forge
   - defaults
 dependencies:
@@ -13,6 +13,8 @@ dependencies:
   - pytorch=2.0.*
   - python=3.8.*
   - pytorch-cuda=11.8
+  - cuda-nvcc=11.8
+  - cuda-libraries-dev=11.8
   - eventlet=0.33.3
   - dnspython=2.2.1
   - markdown
@@ -31,9 +33,9 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers[sentencepiece]==4.33.1
+    - transformers[sentencepiece]==4.34.0
     - huggingface_hub==0.16.4
-    - optimum[onnxruntime]==1.12.0
+    - optimum[onnxruntime]==1.13.2
     - safetensors==0.3.3
     - accelerate==0.21.0
     - git+https://github.com/VE-FORBRYDERNE/mkultra
@@ -49,13 +51,16 @@ dependencies:
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
     - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
-    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
-    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
+    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
+    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
     - einops
     - peft==0.3.0
     - scipy
     - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
     - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
+    - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
+    - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
     - windows-curses; sys_platform == 'win32'
     - pynvml
     - xformers==0.0.21
+    - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
@@ -23,7 +23,7 @@ dependencies:
   - Pillow
   - psutil
   - pip:
-    - -f https://developer.intel.com/ipex-whl-stable-xpu
+    - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
     - torch==2.0.1a0; sys_platform == 'linux'
     - torch==2.0.0a0; sys_platform == 'win32'
     - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux'
@@ -33,9 +33,9 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers[sentencepiece]==4.33.1
+    - transformers[sentencepiece]==4.34.0
     - huggingface_hub==0.16.4
-    - optimum[openvino,nncf,neural-compressor]==1.12.0
+    - optimum[onnxruntime,openvino,nncf,neural-compressor]==1.13.2
     - safetensors==0.3.3
     - accelerate==0.21.0
     - git+https://github.com/VE-FORBRYDERNE/mkultra
@@ -49,8 +49,8 @@ dependencies:
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
     - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
-    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
-    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
+    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
+    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
     - einops
     - peft==0.3.0
     - scipy
@@ -29,9 +29,9 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers[sentencepiece]==4.33.1
+    - transformers[sentencepiece]==4.34.0
     - huggingface_hub==0.16.4
-    - optimum[onnxruntime]==1.12.0
+    - optimum[onnxruntime]==1.13.2
     - safetensors==0.3.3
     - accelerate==0.21.0
     - git+https://github.com/VE-FORBRYDERNE/mkultra
@@ -45,4 +45,5 @@ dependencies:
     - einops
     - peft==0.3.0
     - windows-curses; sys_platform == 'win32'
     - pynvml
+    - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+rocm5.4.2-cp38-cp38-linux_x86_64.whl
@@ -148,6 +148,13 @@ class model_backend(InferenceModel):
                 self.get_local_model_path(ignore_existance=True),
             )

+        if not self.get_local_model_path():
+            print(self.get_local_model_path())
+            from huggingface_hub import snapshot_download
+            target_dir = "models/" + self.model_name.replace("/", "_")
+            print(self.model_name)
+            snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision)
+
         self.init_model_config()

         self.model = AutoModelForCausalLM.from_pretrained(
@@ -128,6 +128,12 @@ class model_backend(InferenceModel):
         return config

     def _load(self, save_model: bool, initial_load: bool) -> None:
+        if not self.get_local_model_path():
+            from huggingface_hub import snapshot_download
+            target_dir = "models/" + self.model_name.replace("/", "_")
+            print(self.model_name)
+            snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision)
+
         self.model = self._get_model(self.get_local_model_path(), {})
         self.tokenizer = self._get_tokenizer(self.get_local_model_path())
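
Both hunks above add the same fallback: when no local copy of the model exists, the backend pulls a full snapshot from the Hugging Face Hub into models/<model_name> before loading. For reference, a minimal standalone sketch of that pattern; the helper name ensure_local_model and the example model id are illustrative, not part of the commit:

import os
from huggingface_hub import snapshot_download

def ensure_local_model(model_name: str, revision=None) -> str:
    """Download a model repo into models/<name> unless it is already there."""
    target_dir = os.path.join("models", model_name.replace("/", "_"))
    if not os.path.isdir(target_dir):
        # local_dir_use_symlinks=False produces a plain copy instead of cache symlinks
        snapshot_download(model_name, local_dir=target_dir,
                          local_dir_use_symlinks=False, cache_dir="cache/",
                          revision=revision)
    return target_dir

# Example (hypothetical model id):
# path = ensure_local_model("KoboldAI/fairseq-dense-125M")
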

modeling/inference_models/exllamav2/class.py (new file, 422 lines)
@@ -0,0 +1,422 @@
from __future__ import annotations
try:
    import time, json
    import torch
    import requests
    import numpy as np
    from typing import Dict, List, Optional, Union
    import os
    import glob
    from pathlib import Path
    import re
    import warnings
    import gc

    import utils
    from logger import logger

    from modeling import warpers
    from modeling.warpers import Warper
    from modeling.stoppers import Stoppers
    from modeling.post_token_hooks import PostTokenHooks
    from modeling.inference_model import (
        GenerationResult,
        GenerationSettings,
        InferenceModel,
        ModelCapabilities,
    )

    from modeling.tokenizer import GenericTokenizer


    from exllamav2.model import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
    from transformers import LlamaTokenizer
    from exllamav2.generator import ExLlamaV2StreamingGenerator
    load_failed = False
except:
    load_failed = True

model_backend_type = "GPTQ"
model_backend_name = "ExLlama V2"

# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False

class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.model_config = None

        self.model = None
        self.tokenizer = None
        self.cache = None
        self.generator = None

        self.model_name = ""
        self.path = None

        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
        ]

        self.stopper_hooks = [
            Stoppers.core_stopper,
            Stoppers.dynamic_wi_scanner,
            Stoppers.singleline_stopper,
            Stoppers.chat_mode_stopper,
            Stoppers.stop_sequence_stopper,
        ]

        self.capabilties = ModelCapabilities(
            embedding_manipulation=False,
            post_token_hooks=True,
            stopper_hooks=True,
            post_token_probs=False,
        )
        self.disable = load_failed

    def is_valid(self, model_name, model_path, menu_path):
        try:
            self.model_config = self._load_config(model_name, model_path)
            #TODO check if model is valid
            return True
        except:
            return False

    def get_local_model_path(self):
        return self.path or os.path.join("models", self.model_name.replace("/", "_"))

    def _load_config(self, model_name, model_path):
        config = ExLlamaV2Config()
        if model_path is not None and os.path.exists(model_path):
            config.model_dir = model_path
        elif os.path.exists("models/{}".format(model_name.replace('/', '_'))):
            config.model_dir = "models/{}".format(model_name.replace('/', '_'))
        config.prepare()

        return config

    def _load(self, save_model: bool, initial_load: bool) -> None:
        if not self.get_local_model_path():
            from huggingface_hub import snapshot_download
            target_dir = "models/" + self.model_name.replace("/", "_")
            print(self.model_name)
            snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision)
        self.model = self._get_model(self.get_local_model_path(), {})
        #TODO support GPU split
        self.model.load(None)
        self.tokenizer = self._get_tokenizer(self.get_local_model_path())

        self.cache = ExLlamaV2Cache(self.model)

        self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer.tokenizer)

    def _post_load(self) -> None:
        # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
        self.tokenizer.add_bos_token = False

        # HF transformers no longer supports decode_with_prefix_space
        # We work around this by wrapping decode, encode, and __call__
        # with versions that work around the 'prefix space' misfeature
        # of sentencepiece.
        vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
        has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}

        # Wrap 'decode' with a method that always returns text starting with a space
        # when the head token starts with a space. This is what 'decode_with_prefix_space'
        # used to do, and we implement it using the same technique (building a cache of
        # tokens that should have a prefix space, and then prepending a space if the first
        # token is in this set.) We also work around a bizarre behavior in which decoding
        # a single token 13 behaves differently than decoding a sequence containing only [13].
        original_decode = type(self.tokenizer.tokenizer).decode
        def decode_wrapper(self, token_ids, *args, **kwargs):
            first = None
            # Note, the code below that wraps single-value token_ids in a list
            # is to work around this wonky behavior:
            # >>> t.decode(13)
            # '<0x0A>'
            # >>> t.decode([13])
            # '\n'
            # Not doing this causes token streaming to receive <0x0A> characters
            # instead of newlines.
            if isinstance(token_ids, int):
                first = token_ids
                token_ids = [first]
            elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor
                # Tensors don't support the Python standard of 'empty is False'
                # and the special case of dimension 0 tensors also needs to be
                # handled separately.
                if token_ids.dim() == 0:
                    first = int(token_ids.item())
                    token_ids = [first]
                elif len(token_ids) > 0:
                    first = int(token_ids[0])
            elif token_ids is not None and len(token_ids) > 0:
                first = token_ids[0]
            result = original_decode(self, token_ids, *args, **kwargs)
            if first is not None and first in has_prefix_space:
                result = " " + result
            return result
        # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it
        object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))

        # Wrap encode and __call__ to work around the 'prefix space' misfeature also.
        # The problem is that "Bob" at the start of text is encoded as if it is
        # " Bob". This creates a problem because it means you can't split text, encode
        # the pieces, concatenate the tokens, decode them, and get the original text back.
        # The workaround is to prepend a known token that (1) starts with a space; and
        # (2) is not the prefix of any other token. After searching through the vocab
        # " ," (space comma) is the only token containing only printable ascii characters
        # that fits this bill. By prepending ',' to the text, the original encode
        # method always returns [1919, ...], where the tail of the sequence is the
        # actual encoded result we want without the prefix space behavior.
        original_encode = type(self.tokenizer.tokenizer).encode
        def encode_wrapper(self, text, *args, **kwargs):
            if type(text) is str:
                text = ',' + text
                result = original_encode(self, text, *args, **kwargs)
                result = result[1:]
            else:
                result = original_encode(self, text, *args, **kwargs)
            return result
        object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))

        # Since 'encode' is documented as being deprecated, also override __call__.
        # This doesn't appear to currently be used by KoboldAI, but doing so
        # in case someone uses it in the future.
        original_call = type(self.tokenizer.tokenizer).__call__
        def call_wrapper(self, text, *args, **kwargs):
            if type(text) is str:
                text = ',' + text
                result = original_call(self, text, *args, **kwargs)
                result = result[1:]
            else:
                result = original_call(self, text, *args, **kwargs)
            return result
        object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))

        # Cache the newline token (for single line mode)
        # Since there is only one Llama token containing newline, just encode \n
        self.newline_tokens = self.tokenizer.encode("\n")
        self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
        self.tokenizer._koboldai_header = self.tokenizer.encode("")

    def unload(self):
        #self.model_config = None # This breaks more than it fixes - Henk

        self.model = None
        self.tokenizer = None
        self.cache = None
        self.generator = None

        self.model_name = ""
        self.path = None

        with torch.no_grad():
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
                for tensor in gc.get_objects():
                    try:
                        if torch.is_tensor(tensor):
                            tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
                    except:
                        pass
        gc.collect()
        try:
            with torch.no_grad():
                torch.cuda.empty_cache()
        except:
            pass

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
    ) -> torch.Tensor:
        warpers.update_settings()

        if LOG_SAMPLER_NO_EFFECT:
            pre = torch.Tensor(scores)

        for sid in utils.koboldai_vars.sampler_order:
            warper = Warper.from_id(sid)

            if not warper.value_is_valid():
                continue

            if warper == warpers.RepetitionPenalty:
                # Rep pen needs more data than other samplers
                scores = warper.torch(scores, input_ids=input_ids.cuda())
            else:
                scores = warper.torch(scores)

            assert scores is not None, f"Scores are None; warper '{warper}' is to blame"

            if LOG_SAMPLER_NO_EFFECT:
                if torch.equal(pre, scores):
                    logger.info(warper, "had no effect on the scores.")
                pre = torch.Tensor(scores)
        return scores

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:
        if seed:
            torch.manual_seed(seed)

        bad_words_ids = [self.tokenizer.bos_token_id]
        if utils.koboldai_vars.use_default_badwordsids:
            bad_words_ids.append(self.tokenizer.eos_token_id)
            bad_words_ids.extend(self.bracket_tokens)
        if single_line:
            bad_words_ids.extend(self.newline_tokens)

        if not isinstance(prompt_tokens, torch.Tensor):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        else:
            gen_in = prompt_tokens

        self.generator._gen_begin_reuse(gen_in, None)

        for i in range(max_new):
            logits = self.model.forward(self.generator.sequence_ids[:, -1:], self.generator.cache)
            for bad_word_id in bad_words_ids:
                logits[:, :, bad_word_id] = -10000.0

            logits = torch.unsqueeze(logits[0, -1, :], 0)

            scores = self._apply_warpers(logits, gen_in)

            scores = torch.softmax(scores, dim=-1)

            # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841)
            # With low probability, multinomial can return an element with zero weight. Since this
            # happens infrequently, just sample repeatedly until all tokens have non-zero probability.
            for _ in range(100):
                token = torch.multinomial(scores, 1)
                # Verify that all selected tokens correspond to positive probabilities.
                if (scores.gather(1, token) > 0).all():
                    break

            if (token == self.tokenizer.eos_token_id).any():
                break

            if self.generator.sequence_ids is None:
                self.generator.sequence_ids = token
            else:
                self.generator.sequence_ids = torch.cat([self.generator.sequence_ids, token.cpu()], dim=1)

            self._post_token_gen(self.generator.sequence_ids)

            utils.koboldai_vars.generated_tkns += 1

            # Apply stoppers
            do_stop = False
            for stopper in self.stopper_hooks:
                do_stop = stopper(self, self.generator.sequence_ids)
                if do_stop:
                    break
            if do_stop:
                break

        seq = self.generator.sequence_ids[:, gen_in.size(1):]

        return GenerationResult(
            model=self,
            out_batches=np.array(seq,),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )

    def _get_model(self, location: str, tf_kwargs: Dict):
        if not self.model_config:
            self.model_config = ExLlamaV2Config()
            self.model_config.model_dir = location
            self.model_config.prepare()

        # self.model_config.gpu_peer_fix = True
        return ExLlamaV2(self.model_config)

    def _get_tokenizer(self, location: str):
        tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location))
        return tokenizer

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        requested_parameters = []
        gpu_count = torch.cuda.device_count()
        layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None

        requested_parameters.append({
            "uitype": "slider",
            "unit": "int",
            "label": "Maximum Context",
            "id": "max_ctx",
            "min": 2048,
            "max": 16384,
            "step": 512,
            "default": 2048,
            "tooltip": "The maximum context size the model supports",
            "menu_path": "Configuration",
            "extra_classes": "",
            "refresh_model_inputs": False
        })

        requested_parameters.append({
            "uitype": "slider",
            "unit": "float",
            "label": "Embedding Compression",
            "id": "compress_emb",
            "min": 1,
            "max": 8,
            "step": 0.25,
            "default": 1,
            "tooltip": "If the model requires compressed embeddings, set them here",
            "menu_path": "Configuration",
            "extra_classes": "",
            "refresh_model_inputs": False
        })

        requested_parameters.append({
            "uitype": "slider",
            "unit": "float",
            "label": "NTK alpha",
            "id": "ntk_alpha",
            "min": 1,
            "max": 32,
            "step": 0.25,
            "default": 1,
            "tooltip": "NTK alpha value",
            "menu_path": "Configuration",
            "extra_classes": "",
            "refresh_model_inputs": False
        })

        return requested_parameters

    def set_input_parameters(self, parameters):
        gpu_count = torch.cuda.device_count()

        self.model_config.max_seq_len = parameters["max_ctx"]
        self.model_config.compress_pos_emb = parameters["compress_emb"]
        self.model_config.alpha_value = parameters["ntk_alpha"]

        # Disable half2 for HIP
        self.model_config.rmsnorm_no_half2 = bool(torch.version.hip)
        self.model_config.rope_no_half2 = bool(torch.version.hip)
        self.model_config.matmul_no_half2 = bool(torch.version.hip)
        self.model_config.silu_no_half2 = bool(torch.version.hip)

        # Disable scaled_dot_product_attention if torch version < 2
        if torch.__version__.startswith("1."):
            self.model_config.sdp_thd = 0

        self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
        self.path = parameters['path'] if 'path' in parameters else None
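
The sampling loop in _raw_generate above retries torch.multinomial because of the upstream bug linked in its comment (pytorch issue 48841), where an index with zero probability can occasionally be drawn. A self-contained sketch of that retry pattern on a toy distribution (not code from the commit):

import torch

# Toy distribution: the last token gets zero probability after softmax.
# torch.multinomial can, very rarely, still return such an index, so redraw
# until the sampled token has positive probability (capped at 100 attempts,
# as in the backend above).
scores = torch.softmax(torch.tensor([[2.0, 1.0, float("-inf")]]), dim=-1)
for _ in range(100):
    token = torch.multinomial(scores, 1)
    if (scores.gather(1, token) > 0).all():
        break
print(token)  # index of a token with non-zero probability
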
@@ -232,7 +232,7 @@ class HFInferenceModel(InferenceModel):
         self.model_type = str(self.model_config.model_type)

         # These are model specific tokenizer overrides if a model has bad defaults
-        if self.model_type == "llama":
+        if self.model_type == "llama" or self.model_type == "mistral":
             # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
             self.tokenizer.add_bos_token = False
             self.tokenizer.legacy = False
@@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements
     torch.cuda.device = torch.xpu.device
     torch.cuda.device_count = torch.xpu.device_count
     torch.cuda.device_of = torch.xpu.device_of
-    torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
     torch.cuda.get_device_name = torch.xpu.get_device_name
     torch.cuda.get_device_properties = torch.xpu.get_device_properties
     torch.cuda.init = torch.xpu.init
@@ -145,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements
     ipex._C._DeviceProperties.minor = 2

     #Fix functions with ipex:
-    torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory]
+    torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory]
     torch._utils._get_available_device_type = lambda: "xpu"
     torch.has_cuda = True
     torch.cuda.has_half = True
@@ -157,6 +156,12 @@ def ipex_init(): # pylint: disable=too-many-statements
     torch.cuda.get_device_properties.minor = 7
     torch.cuda.ipc_collect = lambda *args, **kwargs: None
     torch.cuda.utilization = lambda *args, **kwargs: 0
+    if hasattr(torch.xpu, 'getDeviceIdListForCard'):
+        torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
+        torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
+    else:
+        torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
+        torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card

     ipex_hijacks()
     attention_init()
@@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None):

     #ARC GPUs can't allocate more than 4GB to a single block, Slice it:
     batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2]
-    block_multiply = 2.4 if input.dtype == torch.float32 else 1.2
-    block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB
+    block_multiply = input.element_size()
+    slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
+    block_size = batch_size_attention * slice_block_size
+
     split_slice_size = batch_size_attention
-    if block_size >= 4000:
+    if block_size > 4:
         do_split = True
         #Find something divisible with the input_tokens
-        while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000:
+        while (split_slice_size * slice_block_size) > 4:
             split_slice_size = split_slice_size // 2
             if split_slice_size <= 1:
                 split_slice_size = 1
@@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None):
     else:
         do_split = False

-    split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB
     split_2_slice_size = input_tokens
-    if split_block_size >= 4000:
+    if split_slice_size * slice_block_size > 4:
+        slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
         do_split_2 = True
         #Find something divisible with the input_tokens
-        while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000:
+        while (split_2_slice_size * slice_block_size2) > 4:
             split_2_slice_size = split_2_slice_size // 2
             if split_2_slice_size <= 1:
                 split_2_slice_size = 1
@@ -71,13 +73,16 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
     else:
         shape_one, batch_size_attention, query_tokens, shape_four = query.shape
         no_shape_one = False
-    block_multiply = 3.6 if query.dtype == torch.float32 else 1.8
-    block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB
+    block_multiply = query.element_size()
+    slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
+    block_size = batch_size_attention * slice_block_size
+
     split_slice_size = batch_size_attention
-    if block_size >= 4000:
+    if block_size > 4:
         do_split = True
         #Find something divisible with the shape_one
-        while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000:
+        while (split_slice_size * slice_block_size) > 4:
             split_slice_size = split_slice_size // 2
             if split_slice_size <= 1:
                 split_slice_size = 1
@@ -85,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
     else:
         do_split = False

-    split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB
     split_2_slice_size = query_tokens
-    if split_block_size >= 4000:
+    if split_slice_size * slice_block_size > 4:
+        slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
         do_split_2 = True
         #Find something divisible with the batch_size_attention
-        while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000:
+        while (split_2_slice_size * slice_block_size2) > 4:
             split_2_slice_size = split_2_slice_size // 2
             if split_2_slice_size <= 1:
                 split_2_slice_size = 1
@@ -55,13 +55,14 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods
         )

         #ARC GPUs can't allocate more than 4GB to a single block, Slice it:
-        block_multiply = 2.4 if query.dtype == torch.float32 else 1.2
-        block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB
+        block_multiply = query.element_size()
+        slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply
+        block_size = query_tokens * slice_block_size
         split_2_slice_size = query_tokens
-        if block_size >= 4000:
+        if block_size > 4:
             do_split_2 = True
             #Find something divisible with the query_tokens
-            while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000:
+            while (split_2_slice_size * slice_block_size) > 4:
                 split_2_slice_size = split_2_slice_size // 2
                 if split_2_slice_size <= 1:
                     split_2_slice_size = 1
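
The three attention hunks above share one change: the hard-coded dtype factors (2.4/1.2 and 3.6/1.8) and the 4000 threshold are replaced by element_size() plus a per-slice size that is computed once and compared against 4, so the slicing decision scales with the actual element width. A small sketch of the new arithmetic under assumed tensor shapes (the shapes are illustrative, the formulas mirror the torch_bmm hunk):

import torch

# Illustrative bmm operands; only the shapes and dtype matter for the estimate.
inp = torch.empty(16, 4096, 64, dtype=torch.float16)
mat2 = torch.empty(16, 64, 4096, dtype=torch.float16)

batch, tokens, mat2_shape = inp.shape[0], inp.shape[1], mat2.shape[2]
block_multiply = inp.element_size()                        # 2 for fp16, 4 for fp32
slice_block_size = tokens * mat2_shape / 1024 / 1024 * block_multiply
block_size = batch * slice_block_size                      # heuristic size of the whole block

split_slice_size = batch
if block_size > 4:                                         # same threshold the hunks use
    while split_slice_size * slice_block_size > 4:
        split_slice_size //= 2
        if split_slice_size <= 1:
            split_slice_size = 1
            break
print(split_slice_size)  # batch rows covered by one sliced bmm call
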
@@ -1,6 +1,6 @@
-transformers[sentencepiece]==4.33.1
+transformers[sentencepiece]==4.34.0
 huggingface_hub==0.16.4
-optimum[onnxruntime]==1.12.0
+optimum[onnxruntime]==1.13.2
 safetensors==0.3.3
 Flask==2.3.3
 Flask-SocketIO==5.3.2
@@ -41,9 +41,12 @@ git+https://github.com/0cc4m/hf_bleeding_edge/
 einops
 peft==0.3.0
 scipy
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10'
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10'
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8'
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8'
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10'
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10'
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8'
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8'
 windows-curses; sys_platform == 'win32'
 pynvml
+flash_attn==2.3.0
+xformers==0.0.21
+exllamav2==0.0.4