Merge branch 'united' into accelerate

Gnome Ann committed 2022-06-18 13:47:38 -04:00
commit e143963161
4 changed files with 91 additions and 17 deletions

aiserver.py

@@ -80,6 +80,17 @@ def new_init(self, *args, **kwargs):
     self.ncols = 99
 tqdm.__init__ = new_init
 
+# Fix some issues with the OPT tokenizer
+from transformers import PreTrainedTokenizerBase
+old_pretrainedtokenizerbase_from_pretrained = PreTrainedTokenizerBase.from_pretrained.__func__
+@classmethod
+def new_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs):
+    tokenizer = old_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs)
+    tokenizer._koboldai_header = tokenizer.encode("")
+    tokenizer.add_bos_token = False
+    tokenizer.add_prefix_space = False
+    return tokenizer
+PreTrainedTokenizerBase.from_pretrained = new_pretrainedtokenizerbase_from_pretrained
 
 #==================================================================#
 # Variables & Storage
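
The hunk above wraps `PreTrainedTokenizerBase.from_pretrained` so every tokenizer built through transformers comes back annotated. Since `from_pretrained` is a classmethod, `.__func__` is needed to unwrap it into a plain function that can still be called with an explicit `cls`. The wrapper stores whatever tokens the tokenizer emits for the empty string in `_koboldai_header` (for OPT that is its `</s>` BOS token), then turns off `add_bos_token` and `add_prefix_space` so later `encode()` calls return only the text's own tokens; `calcsubmitbudget` below re-adds the saved header exactly once at the front of the context. A minimal, self-contained sketch of the wrapping pattern (a toy class, nothing KoboldAI-specific):

    class C:
        @classmethod
        def make(cls):
            return cls()

    old_make = C.make.__func__        # unwrap the classmethod to a plain function

    @classmethod
    def new_make(cls):
        obj = old_make(cls)           # delegate to the original implementation
        obj.tagged = True             # attach extra state, like _koboldai_header
        return obj

    C.make = new_make
    assert C.make().tagged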
@@ -323,11 +334,12 @@ class vars:
     actionmode  = 1
     dynamicscan = False
     host        = False
+    flaskwebgui = False
     nopromptgen = False
     rngpersist  = False
     nogenmod    = False
     welcome     = False  # Custom Welcome Text (False is default)
-    newlinemode = "n"
+    newlinemode = "ns"
     quiet       = False  # If set will suppress any story text from being printed to the console (will only be seen on the client web page)
     debug       = False  # If set to true, will send debug information to the client for display
     lazy_load   = True   # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage
@@ -649,7 +661,7 @@ def loadmodelsettings():
     js = {}
     if vars.model_type == "xglm" or js.get("compat", "j") == "fairseq_lm":
         vars.newlinemode = "s"  # Default to </s> newline mode if using XGLM
-    if vars.model_type == "opt":
+    if vars.model_type == "opt" or vars.model_type == "bloom":
         vars.newlinemode = "ns"  # Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
     vars.modelconfig = js
     if("badwordsids" in js):
@@ -1800,6 +1812,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         if(os.path.isdir(vars.custmodpth)):
             try:
                 tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
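
This hunk, its two siblings below, and the three matching hunks in the TPU backend all splice a `use_fast=False` retry into the existing tokenizer fallback chain. One control-flow quirk is worth noting: because the new `except ... pass` closes the first `try` and a fresh `try` follows, the slow-tokenizer attempt always runs, and when it succeeds it replaces the fast tokenizer even if the first call worked; that appears deliberate, since the slow tokenizer is the one that reliably honors the `add_bos_token = False` fix for OPT. A conventional first-success-wins version of such a chain would look like this sketch (the hard-coded `"gpt2"` last resort mirrors what the surrounding file does further down, an assumption here):

    from transformers import AutoTokenizer, GPT2TokenizerFast

    def load_tokenizer_with_fallbacks(path, revision=None):
        attempts = (
            lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir="cache"),
            lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir="cache", use_fast=False),
            lambda: GPT2TokenizerFast.from_pretrained(path, revision=revision, cache_dir="cache"),
            lambda: GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache"),
        )
        for attempt in attempts:
            try:
                return attempt()          # first loader that works wins
            except Exception:
                continue
        raise RuntimeError("no tokenizer could be loaded for {!r}".format(path))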
@@ -1814,6 +1830,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
             try:
                 tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
@@ -1841,6 +1861,10 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         else:
             try:
                 tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
@@ -2061,7 +2085,7 @@ def index():
     if 'new_ui' in request.args:
         return render_template('index_new.html', hide_ai_menu=args.noaimenu)
     else:
-        return render_template('index.html', hide_ai_menu=args.noaimenu)
+        return render_template('index.html', hide_ai_menu=args.noaimenu, flaskwebgui=vars.flaskwebgui)
 @app.route('/favicon.ico')
 def favicon():
     return send_from_directory(app.root_path,
@@ -2759,6 +2783,8 @@ def do_connect():
     emit('from_server', {'cmd': 'connected', 'smandelete': vars.smandelete, 'smanrename': vars.smanrename, 'modelname': getmodelname()})
 
     if(vars.host):
         emit('from_server', {'cmd': 'runs_remotely'})
+    if(vars.flaskwebgui):
+        emit('from_server', {'cmd': 'flaskwebgui'})
     if(vars.allowsp):
         emit('from_server', {'cmd': 'allowsp', 'data': vars.allowsp})
@@ -3594,24 +3620,26 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
     global tokenizer
     tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache")
 
+    lnheader = len(tokenizer._koboldai_header)
+
     # Calculate token budget
     prompttkns = tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', vars.prompt)), max_length=int(2e9), truncation=True)
     lnprompt   = len(prompttkns)
 
     memtokens = tokenizer.encode(utils.encodenewlines(mem), max_length=int(2e9), truncation=True)
     lnmem     = len(memtokens)
-    if(lnmem > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("The memory in your story is too long. Please either write a shorter memory text or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     witokens = tokenizer.encode(utils.encodenewlines(winfo), max_length=int(2e9), truncation=True)
     lnwi     = len(witokens)
-    if(lnmem + lnwi > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem + lnwi > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("The current active world info keys take up too many tokens. Please either write shorter world info, decrease World Info Depth or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     if(anotetxt != ""):
         anotetkns = tokenizer.encode(utils.encodenewlines(anotetxt), max_length=int(2e9), truncation=True)
         lnanote   = len(anotetkns)
-        if(lnmem + lnwi + lnanote > vars.max_length - lnsp - vars.genamt - budget_deduction):
+        if(lnmem + lnwi + lnanote > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
             raise OverflowError("The author's note in your story is too long. Please either write a shorter author's note or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     if(vars.useprompt):
@@ -3622,14 +3650,14 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
     lnsubmission = len(tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', submission)), max_length=int(2e9), truncation=True)) if submission is not None else 0
     maybe_lnprompt = lnprompt if vars.useprompt and actionlen > 0 else 0
 
-    if(lnmem + lnwi + lnanote + maybe_lnprompt + lnsubmission > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem + lnwi + lnanote + maybe_lnprompt + lnsubmission > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("Your submission is too long. Please either write a shorter submission or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt. If you are using the Always Add Prompt setting, turning it off may help.")
 
     assert budget >= 0
 
     if(actionlen == 0):
         # First/Prompt action
-        tokens = memtokens + witokens + anotetkns + prompttkns
+        tokens = tokenizer._koboldai_header + memtokens + witokens + anotetkns + prompttkns
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
         ln = len(tokens) + lnsp
         return tokens, ln+1, ln+vars.genamt
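
With the header carved out, every budget check now takes the same form: memory, world info, author's note, (optionally) prompt, and submission tokens must together fit in `max_length - lnheader - lnsp - genamt - budget_deduction`, and the header is then prepended exactly once when the final token list is assembled. A worked example with illustrative values (assumed for the arithmetic, not taken from the source):

    # A 2048-token context, 80 tokens reserved for generation, a 20-token
    # soft prompt, and OPT's one-token </s> header (illustrative numbers).
    max_length, genamt, lnsp, lnheader, budget_deduction = 2048, 80, 20, 1, 0
    context_budget = max_length - lnheader - lnsp - genamt - budget_deduction
    print(context_budget)  # 1947 tokens left for memory, WI, A/N, prompt, actions

Note that the unchanged `assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction` still holds without subtracting `lnheader`, because the header's length was already deducted from the budget the other components were measured against.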
@@ -3677,12 +3705,12 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
         # Did we get to add the A.N.? If not, do it here
         if(anotetxt != ""):
             if((not anoteadded) or forceanote):
-                tokens = memtokens + witokens + anotetkns + prompttkns + tokens
+                tokens = tokenizer._koboldai_header + memtokens + witokens + anotetkns + prompttkns + tokens
             else:
-                tokens = memtokens + witokens + prompttkns + tokens
+                tokens = tokenizer._koboldai_header + memtokens + witokens + prompttkns + tokens
         else:
             # Prepend Memory, WI, and Prompt before action tokens
-            tokens = memtokens + witokens + prompttkns + tokens
+            tokens = tokenizer._koboldai_header + memtokens + witokens + prompttkns + tokens
 
         # Send completed bundle to generator
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
@@ -5892,15 +5920,26 @@ if __name__ == "__main__":
             vars.serverstarted = True
             socketio.run(app, host='0.0.0.0', port=port)
         else:
-            import webbrowser
-            webbrowser.open_new('http://localhost:{0}'.format(port))
-            print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
-                  .format(colors.GREEN, port, colors.END))
-            vars.serverstarted = True
             if args.unblock:
+                import webbrowser
+                webbrowser.open_new('http://localhost:{0}'.format(port))
+                print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
+                      .format(colors.GREEN, port, colors.END))
+                vars.serverstarted = True
                 socketio.run(app, port=port, host='0.0.0.0')
             else:
-                socketio.run(app, port=port)
+                try:
+                    from flaskwebgui import FlaskUI
+                    vars.serverstarted = True
+                    vars.flaskwebgui = True
+                    FlaskUI(app, socketio=socketio, start_server="flask-socketio", maximized=True, close_server_on_exit=True).run()
+                except:
+                    import webbrowser
+                    webbrowser.open_new('http://localhost:{0}'.format(port))
+                    print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
+                          .format(colors.GREEN, port, colors.END))
+                    vars.serverstarted = True
+                    socketio.run(app, port=port)
 
 else:
     general_startup()
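
The startup path now prefers a native window: when `flaskwebgui` is importable, `FlaskUI` wraps the Socket.IO server (`start_server="flask-socketio"`) in a maximized window and shuts the server down when that window closes; on any failure the bare `except:` silently falls back to the old open-a-browser-tab flow. A bare `except:` also swallows runtime errors from inside `FlaskUI(...).run()`, not just a missing package; a narrower optional-import sketch of the same idea:

    import importlib

    def import_optional(name):
        """Return the module if installed, else None; only ImportError is swallowed."""
        try:
            return importlib.import_module(name)
        except ImportError:
            return None

    flaskwebgui = import_optional("flaskwebgui")
    if flaskwebgui is None:
        print("flaskwebgui not installed; using the plain browser flow")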

static/flask_web_gui.js Normal file (+20)

@@ -0,0 +1,20 @@
+async function getRequest(url='') {
+    const response = await fetch(url, {
+        method: 'GET',
+        cache: 'no-cache'
+    })
+}
+
+document.addEventListener('DOMContentLoaded', function() {
+
+    let url = document.location
+    let route = "/flaskwebgui-keep-server-alive";
+    let interval_request = 3 * 1000; //sec
+
+    function keep_alive_server(){
+        getRequest(url + route);
+    }
+
+    setInterval(keep_alive_server, interval_request);
+})
+
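
This script is the browser half of `close_server_on_exit`: flaskwebgui keeps the server alive only while keep-alive pings arrive, so the page requests the `/flaskwebgui-keep-server-alive` route (registered by flaskwebgui itself, as of its 2022 releases) on a timer, and the server exits shortly after the window closes. The interval is `3 * 1000` milliseconds, i.e. three seconds; `setInterval` takes milliseconds, despite the `//sec` comment. The template change below loads this script only when the server was actually started through `FlaskUI`.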

templates/index.html

@@ -19,6 +19,9 @@
     <script src="static/rangy-core.min.js"></script>
     <script src="static/application.js?ver=1.18.1b"></script>
     <script src="static/favicon.js"></script>
+    {% if flaskwebgui %}
+    <script src="static/flask_web_gui.js"></script>
+    {% endif %}
 </head>
 <body>
     <input type="file" id="remote-save-select" accept="application/json" style="display:none">

tpu_mtj_backend.py

@@ -1324,6 +1324,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     if(os.path.isdir(vars.custmodpth)):
         try:
             tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
@@ -1336,6 +1340,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
         try:
             tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
@@ -1348,6 +1356,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     else:
         try:
             tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
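
These three hunks mirror the `use_fast=False` retry added to the main server's `load_model`, so the TPU backend resolves tokenizers through the same chain; the control-flow caveat noted under the first such hunk (the slow attempt always runs and wins when it succeeds) applies here as well.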