Mirror of https://github.com/KoboldAI/KoboldAI-Client.git, synced 2025-06-05 21:59:24 +02:00
Add gpt_bigcode support, fix 8-bit GPTQ incoherence
@@ -11,3 +11,6 @@
 
 <a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-19-2/gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl">gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl</a>
 <a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-19-2/gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl">gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl</a>
+
+<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-23/gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl">gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl</a>
+<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-23/gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl">gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl</a>
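This first hunk adds anchors for the 2023-05-23 0.0.5 wheels alongside the existing 0.0.4 ones. A page of plain <a href="...whl"> links like this works as a pip --find-links index: pip scrapes the anchors and treats every .whl href as an installable candidate. Purely as an illustration (not part of the commit), the sketch below lists the wheel filenames such a page advertises; the index URL is the one referenced in the dependency file further down.

# Minimal sketch, not from the repository: enumerate the wheels a
# pip --find-links page advertises by reading its <a href="..."> anchors,
# roughly the way pip's link collector does.
from html.parser import HTMLParser
from urllib.request import urlopen

class WheelLinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.wheels = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            href = dict(attrs).get("href", "")
            if href.endswith(".whl"):
                self.wheels.append(href)

def list_wheels(index_url="https://0cc4m.github.io/KoboldAI/gptq-whl-links.html"):
    parser = WheelLinkParser()
    with urlopen(index_url) as response:
        parser.feed(response.read().decode("utf-8"))
    return parser.wheels

if __name__ == "__main__":
    for url in list_wheels():
        # e.g. gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl
        print(url.rsplit("/", 1)[-1])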
@@ -47,6 +47,6 @@ dependencies:
     - diffusers
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
-    - gptq_koboldai==0.0.4
+    - gptq_koboldai==0.0.5
     - einops
     - peft==0.3.0
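The dependency hunk bumps the pinned gptq_koboldai from 0.0.4 to 0.0.5, so pip, resolving through the --find-links page above, installs the new wheel when the environment is rebuilt. A small check that the environment actually ended up on the expected pin could look like the following sketch; it is not part of the repository, and only the distribution name gptq_koboldai is taken from the diff.

# Hedged sketch: verify the installed gptq_koboldai version at runtime.
from importlib.metadata import PackageNotFoundError, version

def check_gptq_pin(expected="0.0.5"):
    try:
        installed = version("gptq_koboldai")
    except PackageNotFoundError:
        raise RuntimeError("gptq_koboldai is not installed; re-create the environment")
    if installed != expected:
        raise RuntimeError(f"gptq_koboldai {installed} found, {expected} expected")
    return installed

if __name__ == "__main__":
    print("gptq_koboldai", check_gptq_pin())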
@@ -35,6 +35,7 @@ from gptq.gptj import load_quant as gptj_load_quant
 from gptq.gptneox import load_quant as gptneox_load_quant
 from gptq.llama import load_quant as llama_load_quant
 from gptq.opt import load_quant as opt_load_quant
+from gptq.bigcode import load_quant as bigcode_load_quant
 from gptq.mpt import load_quant as mpt_load_quant
 from gptq.offload import load_quant_offload
 
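The import hunk registers the architecture-specific quantized loader for gpt_bigcode next to the existing ones; each model family needs its own load_quant because the quantized module layout differs per architecture. As an illustration of the pattern only (the repository keeps an explicit elif chain, shown in the next hunk), the same imports amount to a dispatch table keyed on the Hugging Face config's model_type string. The keys not visible in the diff ("gptj", "gpt_neox", "llama", "opt") are the usual Hugging Face model_type values and are assumptions here.

# Illustrative sketch, not the repository's structure: map model_type
# strings to the per-architecture GPTQ loaders imported above.
from gptq.gptj import load_quant as gptj_load_quant
from gptq.gptneox import load_quant as gptneox_load_quant
from gptq.llama import load_quant as llama_load_quant
from gptq.opt import load_quant as opt_load_quant
from gptq.bigcode import load_quant as bigcode_load_quant
from gptq.mpt import load_quant as mpt_load_quant

MODEL_TYPE_LOADERS = {
    "gptj": gptj_load_quant,
    "gpt_neox": gptneox_load_quant,
    "llama": llama_load_quant,
    "opt": opt_load_quant,
    "gpt_bigcode": bigcode_load_quant,  # the loader this commit adds
    "mpt": mpt_load_quant,
}

def pick_loader(model_type: str):
    # Return the architecture-specific loader, mirroring the elif chain
    # in model_backend.
    try:
        return MODEL_TYPE_LOADERS[model_type]
    except KeyError:
        raise ValueError(f"no GPTQ loader registered for model_type={model_type!r}")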
@@ -220,6 +221,8 @@ class model_backend(HFTorchInferenceModel):
             model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
         elif model_type == "mpt":
             model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        elif model_type == "gpt_bigcode":
+            model = load_quant_offload(bigcode_load_quant, location, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list).half()
         elif autogptq_support:
             # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
             auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
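The final hunk wires the new loader into the backend's per-architecture dispatch: for model_type == "gpt_bigcode" it loads the quantized checkpoint through load_quant_offload with bigcode_load_quant and casts the result to half precision. Note that, unlike the neighbouring branches, the new branch reads path_4bit, utils.koboldai_vars.gptq_bits, and groupsize rather than gptq_file, gptq_bits, and gptq_groupsize; whether those names are in scope depends on surrounding code the hunk does not show. As a hedged sketch of the call pattern only (argument names below are placeholders, and load_quant_offload's signature is inferred from the calls visible in the diff):

# Sketch, not the backend's code: load a GPTQ-quantized gpt_bigcode model
# the way the new branch does.
from gptq.bigcode import load_quant as bigcode_load_quant
from gptq.offload import load_quant_offload

def load_bigcode_gptq(checkpoint_location, quantized_file, bits, groupsize, gpu_layers_list):
    # Per the calls in the diff, load_quant_offload takes the architecture-
    # specific loader plus (location, file, bits, groupsize, gpu_layers_list)
    # and returns the loaded model.
    model = load_quant_offload(
        bigcode_load_quant,
        checkpoint_location,
        quantized_file,
        bits,
        groupsize,
        gpu_layers_list,
    )
    # The commit casts the gpt_bigcode model to fp16 after loading.
    return model.half()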