ufckfcukfcukfckufckufcufckufckuc
@@ -3,39 +3,36 @@ import requests
 import json
 import os
 import datetime
-import time
 from requests.exceptions import RequestException
 
-api_keys_env = os.environ.get('API_KEYS')
-if api_keys_env:
-    API_KEYS = [key.strip() for key in api_keys_env.strip().split('\n') if key.strip()]
-else:
-    raise ValueError("all keez ded go kys")
-
 API_URL = os.environ.get('API_URL')
+API_KEY = os.environ.get('API_KEY')
+
+headers = {
+    "Authorization": f"Bearer {API_KEY}",
+    "Content-Type": "application/json",
+    'Referer': os.environ.get('REFERRER_URL')
+}
+
+# debug switches
+USER_LOGGING_ENABLED = False
+RESPONSE_LOGGING_ENABLED = True
 
 DEFAULT_PARAMS = {
-    "temperature":
-    "top_p":
-    "top_k":
-    "min_p": 0,
-    "top_a": 0.1,
+    "temperature": 0.8,
+    "top_p": 0.95,
+    "top_k": 40,
     "frequency_penalty": 0,
     "presence_penalty": 0,
     "repetition_penalty": 1.1,
     "max_tokens": 512
 }
 
-# debug switches
-USER_LOGGING_ENABLED = False
-RESPONSE_LOGGING_ENABLED = True
-
 def get_timestamp():
     return datetime.datetime.now().strftime("%H:%M:%S")
 
-def predict(message, history, system_prompt, temperature, top_p, top_k, min_p, top_a,
-            frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
-    history_format = [{"role": "system", "content": system_prompt}] if system_prompt else []
+def predict(message, history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
+    history_format = [{"role": "system", "content": system_prompt}]
     for human, assistant in history:
         history_format.append({"role": "user", "content": human})
         if assistant:
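Review note: this hunk replaces the newline-separated API_KEYS pool (and its rotation machinery) with a single API_KEY plus an optional REFERRER_URL, both read at import time. For anyone redeploying the Space, a minimal sketch of equivalent startup validation; the require_env helper is hypothetical, not part of this commit:

import os

def require_env(name: str) -> str:
    # Hypothetical fail-fast check, standing in for the removed
    # "no keys configured" ValueError in the old API_KEYS block.
    value = os.environ.get(name)
    if not value:
        raise ValueError(f"Missing required environment variable: {name}")
    return value

API_URL = require_env('API_URL')    # OpenAI-compatible completions endpoint
API_KEY = require_env('API_KEY')    # single key; the multi-key rotation is gone
REFERRER_URL = os.environ.get('REFERRER_URL', '')  # optional Referer header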
@@ -50,8 +47,6 @@ def predict(message, history, system_prompt, temperature, top_p, top_k, min_p, t
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
-        "min_p": min_p,
-        "top_a": top_a,
         "frequency_penalty": frequency_penalty,
         "presence_penalty": presence_penalty,
         "repetition_penalty": repetition_penalty,
@@ -65,134 +60,48 @@ def predict(message, history, system_prompt, temperature, top_p, top_k, min_p, t
             print(f"{param}={value}")
 
     data = {
-        "model": "meta-llama/
+        "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         "messages": history_format,
         "stream": True,
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
-        "min_p": min_p,
-        "top_a": top_a,
         "frequency_penalty": frequency_penalty,
         "presence_penalty": presence_penalty,
         "repetition_penalty": repetition_penalty,
        "max_tokens": max_tokens
     }
 
-
-
-
-
-
-
-
-
-        if stop_flag[0]:
-            return
-
-        current_api_key = API_KEYS[api_key_index]
-        headers = {
-            "Authorization": f"Bearer {current_api_key}",
-            "Content-Type": "application/json"
-        }
-
-        try:
-            response = requests.post(API_URL, headers=headers, data=json.dumps(data), stream=True)
-
-            if response.status_code == 200:
-                processing_count = 0
-                for line in response.iter_lines():
-                    if stop_flag[0]:
-                        response.close()
-                        return
-
-                    if not line:
-                        continue
-
+    try:
+        with requests.post(API_URL, headers=headers, data=json.dumps(data), stream=True) as response:
+            partial_message = ""
+            for line in response.iter_lines():
+                if stop_flag[0]:
+                    response.close()
+                    break
+                if line:
                     line = line.decode('utf-8')
                     if RESPONSE_LOGGING_ENABLED:
                         print(f"API Response: {line}")
-
-
-                    processing_count += 1
-                    if processing_count >= max_processing_attempts:
-                        print("Too many processing attempts, cycling to next key...")
+                    if line.startswith("data: "):
+                        if line.strip() == "data: [DONE]":
                             break
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    error_msg = json_data.get('error', {}).get('message', '')
-                    if isinstance(error_msg, str):
-                        try:
-                            error_obj = json.loads(error_msg)
-                            if error_obj.get('error', {}).get('type') == 'rate_limit_exceeded':
-                                print("Rate limit hit in streaming response, cycling keys...")
-                                break
-                        except json.JSONDecodeError:
-                            pass
-                    continue
-
-                    if 'choices' in json_data and json_data['choices']:
-                        delta = json_data['choices'][0]['delta']
-                        content = delta.get('content', '')
-                        if content:
-                            partial_message += content
-                            yield partial_message
-
-                except json.JSONDecodeError as e:
-                    print(f"JSON decode error: {e}")
-                    continue
-
-                response.close()
-                if partial_message:
-                    return
-
-            elif response.status_code == 429:
-                print("Rate limit hit from status code, cycling keys...")
-                time.sleep(1)
-            elif response.status_code == 401:
-                print(f"Invalid API key {api_key_index}, cycling to next...")
-                api_key_index = (api_key_index + 1) % len(API_KEYS)
-                retries += 1
-                continue
-            else:
-                error_message = f"Error: Received status code {response.status_code} - {response.text}"
-                print(error_message)
-                if partial_message:
-                    yield partial_message + f"\n[Error: {error_message}]"
-                else:
-                    yield f"An error occurred: {error_message}"
-                return
-
-        except RequestException as e:
-            print(f"Request error: {e}")
-            if partial_message:
-                yield partial_message + f"\n[Error: {str(e)}]"
-            else:
-                yield f"An error occurred: {str(e)}"
-            return
-
-        retries += 1
-        api_key_index = (api_key_index + 1) % len(API_KEYS)
-        time.sleep(1)
-
-    if partial_message:
-        yield partial_message + "\n[Error: Maximum retries reached]"
-    else:
-        yield "Error: Maximum retries reached. Please try again later."
+                        try:
+                            json_data = json.loads(line[6:])
+                            if 'choices' in json_data and json_data['choices']:
+                                content = json_data['choices'][0]['delta'].get('content', '')
+                                if content:
+                                    partial_message += content
+                                    yield partial_message
+                        except json.JSONDecodeError:
+                            continue
+
+        if partial_message:
+            yield partial_message
+
+    except RequestException as e:
+        print(f"Request error: {e}")
+        yield f"An error occurred: {str(e)}"
 
 def import_chat(custom_format_string):
     try:
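Review note: the rewritten predict drops the retry/key-cycling loop and simply consumes an OpenAI-style server-sent-event stream: every payload line starts with "data: ", each JSON chunk carries the next token in choices[0].delta.content, and the literal "data: [DONE]" ends the stream. A standalone sketch of that parsing, assuming the endpoint follows the OpenAI streaming format:

import json
import requests

def stream_completion(url, headers, payload):
    # Minimal SSE consumer for an OpenAI-compatible streaming endpoint.
    with requests.post(url, headers=headers, json=payload, stream=True) as response:
        for raw in response.iter_lines():
            if not raw:
                continue  # skip keep-alive blank lines between events
            line = raw.decode('utf-8')
            if not line.startswith("data: "):
                continue  # ignore non-data SSE fields
            if line.strip() == "data: [DONE]":
                break  # server signals end of stream
            try:
                chunk = json.loads(line[len("data: "):])
            except json.JSONDecodeError:
                continue  # tolerate malformed chunks, as the commit does
            if chunk.get('choices'):
                content = chunk['choices'][0]['delta'].get('content', '')
                if content:
                    yield content

Note the trade-off: the old code distinguished 429/401 responses and rotated keys; the new code treats any failure as a single RequestException path.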
@@ -238,7 +147,7 @@ with gr.Blocks(theme='gradio/monochrome') as demo:
     with gr.Row():
         with gr.Column(scale=2):
             chatbot = gr.Chatbot(value=[])
-            msg = gr.Textbox(label="Message (
+            msg = gr.Textbox(label="Message (dolphin-2.9.1-llama-3-70b for now. The provider might bug out at random. The space may restart frequently)")
         with gr.Row():
             clear = gr.Button("Clear")
             regenerate = gr.Button("Regenerate")
@@ -252,29 +161,25 @@ with gr.Blocks(theme='gradio/monochrome') as demo:
 
         with gr.Column(scale=1):
             system_prompt = gr.Textbox("", label="System Prompt", lines=5)
-            temperature = gr.Slider(0, 2, value=
-            top_p = gr.Slider(0, 1, value=
-            top_k = gr.Slider(
-
-
-
-
-            repetition_penalty = gr.Slider(0.01, 5, value=DEFAULT_PARAMS["repetition_penalty"], step=0.01, label="Repetition Penalty")
-            max_tokens = gr.Slider(1, 4096, value=DEFAULT_PARAMS["max_tokens"], step=1, label="Max Output (max_tokens)")
+            temperature = gr.Slider(0, 2, value=0.8, step=0.01, label="Temperature")
+            top_p = gr.Slider(0, 1, value=0.95, step=0.01, label="Top P")
+            top_k = gr.Slider(1, 500, value=40, step=1, label="Top K")
+            frequency_penalty = gr.Slider(-2, 2, value=0, step=0.1, label="Frequency Penalty")
+            presence_penalty = gr.Slider(-2, 2, value=0, step=0.1, label="Presence Penalty")
+            repetition_penalty = gr.Slider(0.01, 5, value=1.1, step=0.01, label="Repetition Penalty")
+            max_tokens = gr.Slider(1, 4096, value=512, step=1, label="Max Output (max_tokens)")
 
     def user(user_message, history):
         history = history or []
         return "", history + [[user_message, None]]
 
-    def bot(history, system_prompt, temperature, top_p, top_k, min_p, top_a,
-            frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
+    def bot(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
         stop_flag[0] = False
         history = history or []
         if not history:
             return history
         user_message = history[-1][0]
-        bot_message = predict(user_message, history[:-1], system_prompt, temperature, top_p, top_k, min_p, top_a,
-                              frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag)
+        bot_message = predict(user_message, history[:-1], system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag)
         history[-1][1] = ""
         for chunk in bot_message:
             if stop_flag[0]:
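Review note: stop_flag stays a one-element list used as a shared mutable cell: every callback receives the same object, so flipping stop_flag[0] from a Stop handler interrupts the streaming loops in bot and predict. A minimal sketch of the pattern outside Gradio (the Stop wiring itself is not shown in these hunks):

stop_flag = [False]  # one-element list: all closures see the same object

def generate(tokens, stop_flag):
    stop_flag[0] = False      # reset before streaming, as bot() does
    for t in tokens:
        if stop_flag[0]:      # flipped from another callback
            break
        yield t

gen = generate(["a", "b", "c"], stop_flag)
print(next(gen))      # "a"
stop_flag[0] = True   # e.g. what a Stop button handler would do
print(list(gen))      # [] -- the loop exits at the next check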
@@ -283,13 +188,11 @@ with gr.Blocks(theme='gradio/monochrome') as demo:
             history[-1][1] = chunk
             yield history
 
-    def regenerate_response(history, system_prompt, temperature, top_p, top_k, min_p, top_a,
-            frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
+    def regenerate_response(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
         if history and len(history) > 0:
             last_user_message = history[-1][0]
             history[-1][1] = None
-            for new_history in bot(history, system_prompt, temperature, top_p, top_k, min_p, top_a,
-                    frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
+            for new_history in bot(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag):
                 yield new_history
         else:
             yield []
@@ -299,16 +202,14 @@ with gr.Blocks(theme='gradio/monochrome') as demo:
         return imported_history, imported_system_prompt
 
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot, [chatbot, system_prompt, temperature, top_p, top_k, min_p, top_a,
-              frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag], chatbot
+        bot, [chatbot, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag], chatbot
     )
 
     clear.click(lambda: None, None, chatbot, queue=False)
 
     regenerate.click(
         regenerate_response,
-        [chatbot, system_prompt, temperature, top_p, top_k, min_p, top_a,
-         frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag],
+        [chatbot, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens, stop_flag],
         chatbot
     )
 
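Review note: the event wiring is unchanged in shape: submit(...).then(...) chains the callbacks so user() appends the turn before bot() runs, and because bot() is a generator, Gradio streams each yielded history into the Chatbot. A condensed sketch of the same shape with placeholder echo logic (not the Space's model call), assuming a queued Gradio app so generator outputs stream:

import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(value=[])
    msg = gr.Textbox(label="Message")

    def user(user_message, history):
        # Step 1: append the user turn immediately and clear the textbox.
        history = history or []
        return "", history + [[user_message, None]]

    def bot(history):
        # Step 2: stream the reply into the last turn, chunk by chunk.
        history[-1][1] = ""
        for ch in f"echo: {history[-1][0]}":
            history[-1][1] += ch
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

demo.launch()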