Use torch.where to inject the soft prompt instead of torch.cat
parent 248e0bd24b
commit 1556bd32a5

aiserver.py: 68 lines changed
@@ -515,17 +515,25 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
         def patch_causallm(cls):
             old_forward = cls.forward
             def new_causallm_forward(self, *args, **kwargs):
+                num_embeddings = self.config.vocab_size
+                if(vars.sp is not None):
+                    num_embeddings += vars.sp.shape[0]
+                if(self.transformer.wte.num_embeddings != num_embeddings):
+                    self.resize_token_embeddings(num_embeddings)
                 input_ids = kwargs.get('input_ids').to(self.device)
                 assert input_ids is not None
                 kwargs['input_ids'] = None
                 inputs_embeds = self.transformer.wte(input_ids)
+                input_ids -= self.config.vocab_size
                 if(vars.sp is not None):
-                    inputs_embeds = torch.cat((
-                        vars.sp.tile((inputs_embeds.shape[0], 1, 1)),
-                        inputs_embeds
-                    ), dim=1).to(self.device)
+                    vars.sp = vars.sp.to(inputs_embeds.device)
+                    inputs_embeds = torch.where(
+                        (input_ids >= 0)[:, :, None],
+                        vars.sp[input_ids.clamp(min=0)],
+                        inputs_embeds,
+                    )
                 kwargs['inputs_embeds'] = inputs_embeds
-                return old_forward(*args, **kwargs)
+                return old_forward(self, *args, **kwargs)
             cls.forward = new_causallm_forward
         for cls in (GPT2LMHeadModel, GPTNeoForCausalLM):
             patch_causallm(cls)
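Note: the old torch.cat approach prepended the soft prompt to the embedding output, which made inputs_embeds longer than input_ids. The new approach instead reserves extra token IDs at the top of the (resized) embedding table and uses torch.where to swap in the soft prompt rows wherever one of those reserved IDs appears, so the embedded sequence stays the same length as the IDs passed in. Below is a minimal, self-contained sketch of that mechanism; it is not KoboldAI's code, and the toy sizes and tensors are illustrative only.

    import torch

    # Toy sizes; not KoboldAI's real dimensions.
    vocab_size, num_soft, embed_dim = 8, 3, 4
    wte = torch.nn.Embedding(vocab_size + num_soft, embed_dim)  # table resized to leave room for soft tokens
    sp = torch.randn(num_soft, embed_dim)                       # stands in for vars.sp (the trained soft prompt)

    # The first positions carry "virtual" IDs >= vocab_size that mark where the
    # soft prompt goes; the rest are ordinary token IDs.
    input_ids = torch.tensor([[8, 9, 10, 2, 5, 1]])

    inputs_embeds = wte(input_ids)        # normal lookup for every position
    shifted = input_ids - vocab_size      # >= 0 only at soft-prompt positions

    # Where shifted >= 0, take the matching soft prompt row; elsewhere keep the
    # ordinary word embedding. clamp(min=0) just keeps the gather index valid
    # for the positions that are not replaced.
    inputs_embeds = torch.where(
        (shifted >= 0)[:, :, None],
        sp[shifted.clamp(min=0)],
        inputs_embeds,
    )
    print(inputs_embeds.shape)  # torch.Size([1, 6, 4])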
@@ -543,13 +551,14 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
             # Is CUDA available? If so, use GPU, otherwise fall back to CPU
             if(vars.hascuda):
                 if(vars.usegpu):
-                    generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
+                    model = model.to(0)
+                    generator = model.generate
                 elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
                     device_config(model)
                 else:
-                    generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
+                    generator = model.generate
             else:
-                generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
+                generator = model.generate
         # If custom GPT2 model was chosen
         elif(vars.model == "GPT2Custom"):
             model_config = open(vars.custmodpth + "/config.json", "r")
@@ -559,9 +568,10 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
             tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth)
             # Is CUDA available? If so, use GPU, otherwise fall back to CPU
             if(vars.hascuda and vars.usegpu):
-                generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
+                model = model.to(0)
+                generator = model.generate
             else:
-                generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
+                generator = model.generate
         # If base HuggingFace model was chosen
         else:
             # Is CUDA available? If so, use GPU, otherwise fall back to CPU
@@ -570,16 +580,17 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly"]):
                 if(vars.usegpu):
                     model = AutoModelForCausalLM.from_pretrained(vars.model, device=0)
                     vars.modeldim = int(model.transformer.hidden_size)
-                    generator = pipeline('text-generation', model=model, device=0)
+                    model = model.to(0)
+                    generator = model.generate
                 elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
                     model = AutoModelForCausalLM.from_pretrained(vars.model)
                     device_config(model)
                 else:
                     model = AutoModelForCausalLM.from_pretrained(vars.model)
-                    generator = pipeline('text-generation', model=vars.model)
+                    generator = model.generate
             else:
                 model = AutoModelForCausalLM.from_pretrained(vars.model)
-                generator = pipeline('text-generation', model=vars.model)
+                generator = model.generate
 
         # Suppress Author's Note by flagging square brackets (Old implementation)
         #vocab = tokenizer.get_vocab()
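Note: the three model-loading branches above all make the same swap. Instead of wrapping the model in transformers' 'text-generation' pipeline (which takes and returns strings), they move the model to the GPU with model.to(0) and bind generator = model.generate, so generation is driven with token-ID tensors. A rough sketch of the difference, with "gpt2" standing in for whatever model the user selected:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    # "gpt2" is a placeholder; KoboldAI loads whichever model the user picked.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Before: generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
    # After: the bare generate method, fed a LongTensor of token IDs.
    if torch.cuda.is_available():
        model = model.to(0)
    generator = model.generate

    gen_in = tokenizer.encode("Hello", return_tensors="pt").long().to(model.device)
    genout = generator(gen_in, do_sample=True, max_length=20)
    print(tokenizer.decode(genout[0]))

Driving generate() directly on a tensor is what allows the dummy soft-prompt IDs to be prepended to the input (see the generate() hunk below).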
@@ -1176,7 +1187,10 @@ def calcsubmit(txt):
             budget = vars.max_length - lnsp - lnprompt - lnmem - lnanote - lnwi - vars.genamt
         else:
             budget = vars.max_length - lnsp - lnmem - lnanote - lnwi - vars.genamt
 
+        if(vars.sp is not None):
+            budget -= vars.sp.shape[0]
+
         if(actionlen == 0):
             # First/Prompt action
             subtxt = vars.memory + winfo + anotetxt + vars.prompt
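Note: because the soft prompt now occupies real sequence positions in the input to generate(), calcsubmit() reserves room for it by subtracting vars.sp.shape[0] from the token budget. A worked example with made-up numbers (what each ln* term counts is defined earlier in calcsubmit and is not shown here):

    # Made-up numbers, just to show how the budget shrinks when a soft prompt is loaded.
    max_length = 1024          # vars.max_length
    genamt = 60                # vars.genamt, tokens reserved for the model's reply
    lnsp, lnmem, lnanote, lnwi = 0, 100, 40, 80
    soft_prompt_len = 20       # vars.sp.shape[0]

    budget = max_length - lnsp - lnmem - lnanote - lnwi - genamt   # 744
    budget -= soft_prompt_len                                      # the new adjustment: 724
    print(budget)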
@@ -1336,14 +1350,23 @@ def generate(txt, min, max):
     top_k = vars.top_k if vars.top_k > 0 else None
     tfs = vars.tfs if vars.tfs > 0.0 else None
 
-    # generator() only accepts a torch tensor of tokens (long datatype) as
-    # its first argument if we're using breakmodel, otherwise a string
-    # is fine
-    if(vars.hascuda and vars.breakmodel):
-        gen_in = tokenizer.encode(txt, return_tensors="pt", truncation=True).long().to(breakmodel.primary_device)
+    gen_in = tokenizer.encode(txt, return_tensors="pt", truncation=True).long()
+    if(vars.sp is not None):
+        soft_tokens = torch.arange(
+            model.config.vocab_size,
+            model.config.vocab_size + vars.sp.shape[0],
+        )
+        gen_in = torch.cat((soft_tokens[None], gen_in), dim=1)
+
+    if(vars.hascuda and vars.usegpu):
+        gen_in = gen_in.to(0)
+    elif(vars.hascuda and vars.breakmodel):
+        gen_in = gen_in.to(breakmodel.primary_device)
+    elif(vars.hascuda):
+        gen_in = gen_in.to(0)
     else:
-        gen_in = txt
+        gen_in = gen_in.to('cpu')
 
     with torch.no_grad():
         genout = generator(
             gen_in,
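Note: rather than a string, generate() is now always fed a tensor of token IDs, and the soft prompt is represented by a run of "virtual" IDs starting at model.config.vocab_size, built with torch.arange and concatenated in front of the encoded prompt. These are exactly the IDs that the patched forward() above shifts down by vocab_size and replaces via torch.where. A toy illustration of how gen_in is assembled (sizes are made up):

    import torch

    # Toy numbers: a vocabulary of 8 real tokens and a soft prompt of length 3.
    vocab_size, num_soft = 8, 3
    prompt_ids = torch.tensor([[2, 5, 1]])   # what tokenizer.encode(...) might return

    soft_tokens = torch.arange(vocab_size, vocab_size + num_soft)   # tensor([ 8,  9, 10])
    gen_in = torch.cat((soft_tokens[None], prompt_ids), dim=1)
    print(gen_in)   # tensor([[ 8,  9, 10,  2,  5,  1]])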
@@ -1367,8 +1390,7 @@ def generate(txt, min, max):
         return
 
     # Need to manually strip and decode tokens if we're not using a pipeline
-    if(vars.hascuda and vars.breakmodel):
-        genout = [{"generated_text": tokenizer.decode(tokens[len(gen_in[0])-len(tokens):])} for tokens in genout]
+    genout = [{"generated_text": tokenizer.decode(tokens[len(gen_in[0])-len(tokens):])} for tokens in genout]
 
     if(len(genout) == 1):
         genresult(genout[0]["generated_text"])
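Note: generate() returns each whole sequence, soft-prompt IDs and prompt included, so the decode step keeps only the newly generated tail. The offset len(gen_in[0]) - len(tokens) is negative and equal to minus the number of new tokens. A toy example of the slice:

    # Toy example of the stripping slice used when decoding.
    gen_in0 = [8, 9, 10, 2, 5, 1]        # the IDs that were fed to generate()
    tokens = gen_in0 + [7, 3, 4]         # one returned sequence: input + 3 new tokens

    new_tokens = tokens[len(gen_in0) - len(tokens):]   # same as tokens[-3:]
    print(new_tokens)   # [7, 3, 4]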