Merge branch 'united' into accelerate

Gnome Ann committed 2022-06-18 13:47:38 -04:00
commit e143963161
4 changed files with 91 additions and 17 deletions

aiserver.py

@@ -80,6 +80,17 @@ def new_init(self, *args, **kwargs):
     self.ncols = 99
 tqdm.__init__ = new_init
 
+# Fix some issues with the OPT tokenizer
+from transformers import PreTrainedTokenizerBase
+old_pretrainedtokenizerbase_from_pretrained = PreTrainedTokenizerBase.from_pretrained.__func__
+@classmethod
+def new_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs):
+    tokenizer = old_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs)
+    tokenizer._koboldai_header = tokenizer.encode("")
+    tokenizer.add_bos_token = False
+    tokenizer.add_prefix_space = False
+    return tokenizer
+PreTrainedTokenizerBase.from_pretrained = new_pretrainedtokenizerbase_from_pretrained
 
 #==================================================================#
 # Variables & Storage
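
The hunk above wraps `PreTrainedTokenizerBase.from_pretrained` so every tokenizer built through transformers comes back annotated. Since `from_pretrained` is a classmethod, `.__func__` is needed to unwrap it into a plain function that can still be called with an explicit `cls`. The wrapper stores whatever tokens the tokenizer emits for the empty string in `_koboldai_header` (for OPT that is its `</s>` BOS token), then turns off `add_bos_token` and `add_prefix_space` so later `encode()` calls return only the text's own tokens; `calcsubmitbudget` below re-adds the saved header exactly once at the front of the context. A minimal, self-contained sketch of the wrapping pattern (a toy class, nothing KoboldAI-specific):

    class C:
        @classmethod
        def make(cls):
            return cls()

    old_make = C.make.__func__        # unwrap the classmethod to a plain function

    @classmethod
    def new_make(cls):
        obj = old_make(cls)           # delegate to the original implementation
        obj.tagged = True             # attach extra state, like _koboldai_header
        return obj

    C.make = new_make
    assert C.make().tagged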
@@ -323,11 +334,12 @@ class vars:
     actionmode  = 1
     dynamicscan = False
     host        = False
+    flaskwebgui = False
     nopromptgen = False
     rngpersist  = False
     nogenmod    = False
     welcome     = False  # Custom Welcome Text (False is default)
-    newlinemode = "n"
+    newlinemode = "ns"
     quiet       = False  # If set will suppress any story text from being printed to the console (will only be seen on the client web page)
     debug       = False  # If set to true, will send debug information to the client for display
     lazy_load   = True   # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage
@@ -649,7 +661,7 @@ def loadmodelsettings():
     js = {}
     if vars.model_type == "xglm" or js.get("compat", "j") == "fairseq_lm":
         vars.newlinemode = "s"  # Default to </s> newline mode if using XGLM
-    if vars.model_type == "opt":
+    if vars.model_type == "opt" or vars.model_type == "bloom":
         vars.newlinemode = "ns"  # Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
     vars.modelconfig = js
     if("badwordsids" in js):
@@ -1800,6 +1812,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         if(os.path.isdir(vars.custmodpth)):
             try:
                 tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
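
This hunk, its two siblings below, and the three matching hunks in the TPU backend all splice a `use_fast=False` retry into the existing tokenizer fallback chain. One control-flow quirk is worth noting: because the new `except ... pass` closes the first `try` and a fresh `try` follows, the slow-tokenizer attempt always runs, and when it succeeds it replaces the fast tokenizer even if the first call worked; that appears deliberate, since the slow tokenizer is the one that reliably honors the `add_bos_token = False` fix for OPT. A conventional first-success-wins version of such a chain would look like this sketch (the hard-coded `"gpt2"` last resort mirrors what the surrounding file does further down, an assumption here):

    from transformers import AutoTokenizer, GPT2TokenizerFast

    def load_tokenizer_with_fallbacks(path, revision=None):
        attempts = (
            lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir="cache"),
            lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir="cache", use_fast=False),
            lambda: GPT2TokenizerFast.from_pretrained(path, revision=revision, cache_dir="cache"),
            lambda: GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache"),
        )
        for attempt in attempts:
            try:
                return attempt()          # first loader that works wins
            except Exception:
                continue
        raise RuntimeError("no tokenizer could be loaded for {!r}".format(path))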
@@ -1814,6 +1830,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
             try:
                 tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
@@ -1841,6 +1861,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         else:
             try:
                 tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
@@ -2061,7 +2085,7 @@ def index():
     if 'new_ui' in request.args:
         return render_template('index_new.html', hide_ai_menu=args.noaimenu)
     else:
-        return render_template('index.html', hide_ai_menu=args.noaimenu)
+        return render_template('index.html', hide_ai_menu=args.noaimenu, flaskwebgui=vars.flaskwebgui)
 @app.route('/favicon.ico')
 def favicon():
     return send_from_directory(app.root_path,
@@ -2759,6 +2783,8 @@ def do_connect():
     emit('from_server', {'cmd': 'connected', 'smandelete': vars.smandelete, 'smanrename': vars.smanrename, 'modelname': getmodelname()})
 
     if(vars.host):
         emit('from_server', {'cmd': 'runs_remotely'})
+    if(vars.flaskwebgui):
+        emit('from_server', {'cmd': 'flaskwebgui'})
     if(vars.allowsp):
         emit('from_server', {'cmd': 'allowsp', 'data': vars.allowsp})
@@ -3594,24 +3620,26 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
     global tokenizer
     tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache")
 
+    lnheader = len(tokenizer._koboldai_header)
+
     # Calculate token budget
     prompttkns = tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', vars.prompt)), max_length=int(2e9), truncation=True)
     lnprompt   = len(prompttkns)
 
     memtokens = tokenizer.encode(utils.encodenewlines(mem), max_length=int(2e9), truncation=True)
     lnmem     = len(memtokens)
-    if(lnmem > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("The memory in your story is too long. Please either write a shorter memory text or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     witokens = tokenizer.encode(utils.encodenewlines(winfo), max_length=int(2e9), truncation=True)
     lnwi     = len(witokens)
-    if(lnmem + lnwi > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem + lnwi > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("The current active world info keys take up too many tokens. Please either write shorter world info, decrease World Info Depth or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     if(anotetxt != ""):
         anotetkns = tokenizer.encode(utils.encodenewlines(anotetxt), max_length=int(2e9), truncation=True)
         lnanote   = len(anotetkns)
-        if(lnmem + lnwi + lnanote > vars.max_length - lnsp - vars.genamt - budget_deduction):
+        if(lnmem + lnwi + lnanote > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
             raise OverflowError("The author's note in your story is too long. Please either write a shorter author's note or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     if(vars.useprompt):
@@ -3622,14 +3650,14 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
     lnsubmission = len(tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', submission)), max_length=int(2e9), truncation=True)) if submission is not None else 0
     maybe_lnprompt = lnprompt if vars.useprompt and actionlen > 0 else 0
 
-    if(lnmem + lnwi + lnanote + maybe_lnprompt + lnsubmission > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem + lnwi + lnanote + maybe_lnprompt + lnsubmission > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("Your submission is too long. Please either write a shorter submission or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt. If you are using the Always Add Prompt setting, turning it off may help.")
 
     assert budget >= 0
 
     if(actionlen == 0):
         # First/Prompt action
-        tokens = memtokens + witokens + anotetkns + prompttkns
+        tokens = tokenizer._koboldai_header + memtokens + witokens + anotetkns + prompttkns
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
         ln = len(tokens) + lnsp
         return tokens, ln+1, ln+vars.genamt
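
With the header carved out, every budget check now takes the same form: memory, world info, author's note, (optionally) prompt, and submission tokens must together fit in `max_length - lnheader - lnsp - genamt - budget_deduction`, and the header is then prepended exactly once when the final token list is assembled. A worked example with illustrative values (assumed for the arithmetic, not taken from the source):

    # A 2048-token context, 80 tokens reserved for generation, a 20-token
    # soft prompt, and OPT's one-token </s> header (illustrative numbers).
    max_length, genamt, lnsp, lnheader, budget_deduction = 2048, 80, 20, 1, 0
    context_budget = max_length - lnheader - lnsp - genamt - budget_deduction
    print(context_budget)  # 1947 tokens left for memory, WI, A/N, prompt, actions

Note that the unchanged `assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction` still holds without subtracting `lnheader`, because the header's length was already deducted from the budget the other components were measured against.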
@@ -3677,12 +3705,12 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
         # Did we get to add the A.N.? If not, do it here
         if(anotetxt != ""):
             if((not anoteadded) or forceanote):
-                tokens = memtokens + witokens + anotetkns + prompttkns + tokens
+                tokens = tokenizer._koboldai_header + memtokens + witokens + anotetkns + prompttkns + tokens
             else:
-                tokens = memtokens + witokens + prompttkns + tokens
+                tokens = tokenizer._koboldai_header + memtokens + witokens + prompttkns + tokens
         else:
             # Prepend Memory, WI, and Prompt before action tokens
-            tokens = memtokens + witokens + prompttkns + tokens
+            tokens = tokenizer._koboldai_header + memtokens + witokens + prompttkns + tokens
 
         # Send completed bundle to generator
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
@@ -5892,15 +5920,26 @@ if __name__ == "__main__":
             vars.serverstarted = True
             socketio.run(app, host='0.0.0.0', port=port)
         else:
-            import webbrowser
-            webbrowser.open_new('http://localhost:{0}'.format(port))
-            print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
-                  .format(colors.GREEN, port, colors.END))
-            vars.serverstarted = True
             if args.unblock:
+                import webbrowser
+                webbrowser.open_new('http://localhost:{0}'.format(port))
+                print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
+                      .format(colors.GREEN, port, colors.END))
+                vars.serverstarted = True
                 socketio.run(app, port=port, host='0.0.0.0')
             else:
-                socketio.run(app, port=port)
+                try:
+                    from flaskwebgui import FlaskUI
+                    vars.serverstarted = True
+                    vars.flaskwebgui = True
+                    FlaskUI(app, socketio=socketio, start_server="flask-socketio", maximized=True, close_server_on_exit=True).run()
+                except:
+                    import webbrowser
+                    webbrowser.open_new('http://localhost:{0}'.format(port))
+                    print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
+                          .format(colors.GREEN, port, colors.END))
+                    vars.serverstarted = True
+                    socketio.run(app, port=port)
 
 else:
     general_startup()
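
The startup path now prefers a native window: when `flaskwebgui` is importable, `FlaskUI` wraps the Socket.IO server (`start_server="flask-socketio"`) in a maximized window and shuts the server down when that window closes; on any failure the bare `except:` silently falls back to the old open-a-browser-tab flow. A bare `except:` also swallows runtime errors from inside `FlaskUI(...).run()`, not just a missing package; a narrower optional-import sketch of the same idea:

    import importlib

    def import_optional(name):
        """Return the module if installed, else None; only ImportError is swallowed."""
        try:
            return importlib.import_module(name)
        except ImportError:
            return None

    flaskwebgui = import_optional("flaskwebgui")
    if flaskwebgui is None:
        print("flaskwebgui not installed; using the plain browser flow")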

static/flask_web_gui.js Normal file (+20)

@@ -0,0 +1,20 @@
+async function getRequest(url='') {
+    const response = await fetch(url, {
+        method: 'GET',
+        cache: 'no-cache'
+    })
+}
+
+document.addEventListener('DOMContentLoaded', function() {
+
+    let url = document.location
+    let route = "/flaskwebgui-keep-server-alive";
+    let interval_request = 3 * 1000; //sec
+
+    function keep_alive_server(){
+        getRequest(url + route);
+    }
+
+    setInterval(keep_alive_server, interval_request);
+})
+
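
This script is the browser half of `close_server_on_exit`: flaskwebgui keeps the server alive only while keep-alive pings arrive, so the page requests the `/flaskwebgui-keep-server-alive` route (registered by flaskwebgui itself, as of its 2022 releases) on a timer, and the server exits shortly after the window closes. The interval is `3 * 1000` milliseconds, i.e. three seconds; `setInterval` takes milliseconds, despite the `//sec` comment. The template change below loads this script only when the server was actually started through `FlaskUI`.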

templates/index.html

@@ -19,6 +19,9 @@
     <script src="static/rangy-core.min.js"></script>
     <script src="static/application.js?ver=1.18.1b"></script>
     <script src="static/favicon.js"></script>
+    {% if flaskwebgui %}
+    <script src="static/flask_web_gui.js"></script>
+    {% endif %}
 </head>
 <body>
     <input type="file" id="remote-save-select" accept="application/json" style="display:none">

tpu_mtj_backend.py

@@ -1324,6 +1324,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     if(os.path.isdir(vars.custmodpth)):
         try:
             tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
@@ -1336,6 +1340,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
         try:
             tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
@@ -1348,6 +1356,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     else:
         try:
             tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
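
These three hunks mirror the `use_fast=False` retry added to the main server's `load_model`, so the TPU backend resolves tokenizers through the same chain; the control-flow caveat noted under the first such hunk (the slow attempt always runs and wins when it succeeds) applies here as well.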