# app.py — Falcon-180B chat demo (update 620d0d1, verified)
import json
import os
import shutil
import requests
import gradio as gr
from huggingface_hub import Repository, InferenceClient
# HF API token is read from the environment; None means unauthenticated
# requests (the hosted 180B endpoint will likely reject those — verify).
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# Hosted inference endpoint for the Falcon-180B chat model.
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
# Sequences that mark the end of the assistant's turn; passed to the API
# as stop criteria and also stripped client-side in query().
STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]
# Shared client for all requests; the bearer header authenticates every call.
client = InferenceClient(
API_URL,
headers={"Authorization": f"Bearer {HF_TOKEN}"},
)
def query(bot_name, system_prompt, user_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    """Generate one assistant reply from the Falcon-180B chat endpoint.

    Parameters
    ----------
    bot_name : str
        Label used for the assistant turn in the constructed prompt.
    system_prompt : str
        System instruction prepended to the conversation.
    user_prompt : str
        The user's message.
    temperature, max_new_tokens, top_p, repetition_penalty :
        Sampling parameters. Gradio ``"text"`` inputs deliver strings,
        so these are coerced to numbers before the API call.

    Returns
    -------
    str
        Generated text with any trailing stop sequence stripped.
    """
    # Coerce UI-supplied values: the Interface wires plain text boxes to
    # these numeric parameters, so they may arrive as strings.
    temperature = float(temperature)
    max_new_tokens = int(max_new_tokens)
    top_p = float(top_p)
    repetition_penalty = float(repetition_penalty)
    print(temperature, max_new_tokens, top_p, repetition_penalty)
    seed = 42  # fixed seed for reproducible sampling
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    print(bot_name)
    print(system_prompt)
    print(user_prompt)
    print('-' * 20)
    prompt = f"System: {system_prompt}\nUser: {user_prompt}\n{bot_name}: "
    stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    output = ""
    stopped = False
    for response in stream:
        output += response.token.text
        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                # Strip the stop marker and stop consuming the stream:
                # without the break, tokens emitted after the stop
                # sequence would be appended back onto the reply.
                output = output[:-len(stop_str)].rstrip()
                stopped = True
                break
        if stopped:
            break
    print(output)
    print('-' * 20)
    return output
# Wire query() to typed components. The last four parameters are numeric
# sampling controls, so expose them as gr.Number inputs (an all-"text"
# list would hand strings to numeric parameters); defaults mirror the
# function signature.
iface = gr.Interface(
    query,
    inputs=[
        gr.Textbox(label="Bot name"),
        gr.Textbox(label="System prompt"),
        gr.Textbox(label="User prompt"),
        gr.Number(label="Temperature", value=0.9),
        gr.Number(label="Max new tokens", value=256, precision=0),
        gr.Number(label="Top-p", value=0.95),
        gr.Number(label="Repetition penalty", value=1.0),
    ],
    outputs="text",
)
iface.queue()   # serialize requests to the shared inference client
iface.launch()