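# RSS feed reader / summarizer Space.
# Fetches the feeds listed in feeds.json, snapshots them to the Omnibus/tmp dataset
# repo, and uses Mixtral-8x7B-Instruct via InferenceClient to compress and summarize
# the collected items in chunks.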
import gradio as gr
import requests
import bs4
import lxml
import os
from huggingface_hub import InferenceClient, HfApi
import random
import json
import datetime
import xmltodict
from prompts import (
    GET_KEYWORD,
    COMPRESS_HISTORY_PROMPT,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    PREFIX_ALT,
    PREFIX,
)

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
reponame = "Omnibus/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN']
api = HfApi(token=token_self)
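# Split a model output of the form "action: NAME action_input=VALUE" into
# (NAME, VALUE); VALUE is None when no "action_input=" is present.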
def parse_action(string: str):
    print("PARSING:")
    print(string)
    assert string.startswith("action:")
    idx = string.find("action_input=")
    print(idx)
    if idx == -1:
        print("idx == -1")
        print(string[8:])
        return string[8:], None
    print("last return:")
    print(string[8 : idx - 1])
    print(string[idx + 13 :].strip("'").strip('"'))
    return string[8 : idx - 1], string[idx + 13 :].strip("'").strip('"')
MAX_HISTORY = 100
MAX_DATA = 40000

def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt
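# Illustrative output of format_prompt (it is only referenced in commented-out code below):
#   format_prompt("hello", [("hi", "Hi there!")])
#   -> "<s>[INST] hi [/INST] Hi there!</s> [INST] hello [/INST]"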
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    purpose,
    prefix_tog,
    **prompt_kwargs,
):
    timestamp = datetime.datetime.now()
    print(seed)
    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    print(f'prefix_tog:: {prefix_tog}')
    # select the system prefix: "normal" uses the timestamped PREFIX, "alternate" uses PREFIX_ALT
    if prefix_tog == "normal":
        content = PREFIX.format(
            timestamp=timestamp,
            purpose=purpose,
        ) + prompt_template.format(**prompt_kwargs)
    elif prefix_tog == "alternate":
        content = PREFIX_ALT + prompt_template.format(**prompt_kwargs)
    #formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    #formatted_prompt = format_prompt(f'{content}', **prompt_kwargs['history'])
    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
        #yield resp
    return resp
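# Chunking used by compress_data and summarize: c is a rough delimiter count of the data,
# divr = c / MAX_DATA, divi = number of passes (rounded up), chunk = window size per pass.
# Worked example: c = 100000 with MAX_DATA = 40000 gives divr = 2.5, divi = 3, chunk = 40000,
# i.e. windows [0:40000], [40000:80000], [80000:120000].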
def compress_data(c, purpose, task, history):
    seed = random.randint(1, 1000000000)
    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    #out=""
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        print(f'hist::\n{hist}')
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=2048,
            seed=seed,
            purpose=purpose,
            prefix_tog="normal",
            task=task,
            knowledge=new_history,
            history=hist,
        ).strip("\n")
        new_history = resp
        print(resp)
        out.append(resp)
        e = e + chunk
        s = s + chunk
    '''
    resp = run_gpt(
        COMPRESS_DATA_PROMPT,
        stop_tokens=["observation:", "task:", "action:", "thought:"],
        max_tokens=2048,
        seed=seed,
        purpose=purpose,
        task=task,
        knowledge=new_history,
        history=result,
    )
    '''
    print("final" + resp)
    history = "result: {}\n".format(resp)
    return history
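# get_records: run the GET_KEYWORD prompt over the data in chunks, carrying each
# response forward as knowledge, and yield the final result to the chatbot.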
def get_records(inp, data):
    key_box = []
    seed = random.randint(1, 1000000000)
    print(inp)
    out = str(data)
    rl = len(out)
    print(f'rl:: {rl}')
    c = 1
    for i in str(out):
        if i == " " or i == "," or i == "\n" or i == "/" or i == "." or i == "<":
            c += 1
    print(f'c:: {c}')
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    #task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = out[s:e]
        print(f'hist::\n{hist}')
        resp = run_gpt(
            GET_KEYWORD,
            stop_tokens=[],
            max_tokens=2048,
            seed=seed,
            purpose=inp,
            prefix_tog="alternate",
            task=inp,
            knowledge=new_history,
            history=hist,
        ).strip("\n")
        new_history = resp
        print(f'resp {z}::\n {resp}')
        #out+=resp
        e = e + chunk
        s = s + chunk
    yield "", [(inp, new_history)]
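# get_key: ask the model for a single keyword for the request, then collect feed
# items whose title, description, or link contain that keyword. Currently only
# prints key_box and is not wired into the UI below.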
def get_key(inp, data):
    key_box = []
    seed = random.randint(1, 1000000000)
    key_w = run_gpt(
        GET_KEYWORD,
        stop_tokens=[],
        max_tokens=56,
        seed=seed,
        purpose=inp,
        prefix_tog="normal",
        task=inp,
    ).split("<")[0]
    print(f'key_w::{key_w}')
    if " " in key_w:
        key_w = key_w.split(" ")[-1]
    for i, ba in enumerate(data):
        each_key = data[i].keys()
        print(each_key)
        # iterate the items stored under the feed's title key
        for z, ea in enumerate(data[i][list(each_key)[0]]):
            try:
                if ea['title'] and key_w in ea['title']:
                    key_box.append(ea)
                elif ea['description'] and key_w in ea['description']:
                    key_box.append(ea)
                elif ea['link'] and key_w in ea['link']:
                    key_box.append(ea)
            except Exception as e:
                print(e)
    print(key_box)
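# summarize: chunk the loaded RSS JSON and run COMPRESS_DATA_PROMPT over each window,
# carrying the previous response forward as knowledge, then report the final
# compressed result in the chatbot.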
def summarize(inp, history, data=None):
    json_box = []
    if inp == "":
        inp = "Process this data"
    #inp = format_prompt(inp,history)
    task = "Compile a detailed report"
    history.clear()
    yield "", [(inp, "Working on it...")]
    if data is not None and data != "Error" and data != "":
        print(inp)
        out = str(data)
        rl = len(out)
        print(f'rl:: {rl}')
        c = 1
        for i in str(out):
            if i == " " or i == "," or i == "\n" or i == "/" or i == "." or i == "<":
                c += 1
        print(f'c:: {c}')
        #json_out = compress_data(c,inp,task,out)
        #def compress_data(c,purpose, task, history):
        purpose = inp
        seed = random.randint(1, 1000000000)
        print(c)
        divr = int(c) / MAX_DATA
        divi = int(divr) + 1 if divr != int(divr) else int(divr)
        chunk = int(int(c) / divr)
        print(f'chunk:: {chunk}')
        print(f'divr:: {divr}')
        print(f'divi:: {divi}')
        #out=""
        s = 0
        e = chunk
        print(f'e:: {e}')
        new_history = ""
        task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
        for z in range(divi):
            print(f's:e :: {s}:{e}')
            mes = f'Working on data chunk: {s}:{e}'
            hist = out[s:e]
            print(f'hist::\n{hist}')
            yield "", [(inp, f'{mes}\n{new_history}')]
            resp = run_gpt(
                COMPRESS_DATA_PROMPT,
                stop_tokens=[],
                max_tokens=2048,
                seed=seed,
                purpose=purpose,
                prefix_tog="normal",
                task=task,
                knowledge=new_history,
                history=hist,
            )
            new_history = resp
            print(resp)
            #out+=resp  # avoid appending model output back into the source data being chunked
            e = e + chunk
            s = s + chunk
            #history = "preliminary result: {}\n".format(resp)
            #yield "", (inp,f'{mes}\n{history}')
        print("final" + resp)
        out_hist = "result:\n{}".format(resp)
        #return history
        yield "", [(inp, out_hist)]
        out = str(out_hist)
        rawp = out
    else:
        rawp = "Provide a valid data source"
    history.append((inp, rawp))
    yield "", history
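# find_rss: refresh the stored snapshot. feeds.json is assumed to be a list of objects
# that each carry a "link" to a feed, e.g. (illustrative shape only):
#   [{"source": "Example News", "link": "https://example.com/rss.xml"}]
# Each feed is fetched, parsed (json or xmltodict), and saved as rss/<timestamp>.json
# in the dataset repo.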
def find_rss():
    lod = ""
    out_box = []
    yield [], [(None, "loading sources")]
    with open('feeds.json', 'r') as j:
        cont = json.loads(j.read())
        #print(cont)
    for ea in cont:
        #lod=""
        print(ea['link'])
        rss_url = ea['link']
        link_box = []
        r = requests.get(f'{rss_url}')
        if r.status_code == 200:
            try:
                # pick a parser from the URL; fall back to xmltodict for anything else
                if ".json" in rss_url:
                    lod = json.loads(r.text)
                elif ".xml" in rss_url:
                    lod = xmltodict.parse(r.content)
                elif ".rss" in rss_url:
                    lod = xmltodict.parse(r.content)
                else:
                    try:
                        lod = xmltodict.parse(r.content)
                    except Exception as e:
                        lod = f'{rss_url} ::ERROR:: {e}'
            except Exception as e:
                lod = f'{rss_url} ::ERROR:: {e}'
        else:
            lod = f'{rss_url} ::ERROR::COULD NOT CONNECT:: {r.status_code}'
        try:
            print(lod['rss']['channel']['item'][0].keys())
            print(lod['rss'].keys())
            for i, ea in enumerate(lod['rss']['channel']['item']):
                try:
                    r_link = ea['link']
                    r_title = ea['title']
                    r_description = ea['description']
                    lods = {"title": r_title, "description": r_description, "link": r_link}
                except Exception:
                    try:
                        r_link = ea['link']
                        r_title = ea['source']
                        r_description = 'No Description provided'
                        lods = {"title": r_title, "description": r_description, "link": r_link}
                    except Exception as e:
                        print(e)
                        continue
                        #lods = {"title":"ERROR", "description":{e},"link":"ERROR"}
                """
                r_link = lod['rss']['channel']['item'][i]['link']
                r_title = lod['rss']['channel']['item'][i]['title']
                r_description = lod['rss']['channel']['item'][i]['description']"""
                link_box.append(lods)
            lod = {lod['rss']['channel']['title']: link_box}
            out_box.append(lod)
        except Exception as e:
            #print(f'{ea["source"]}')
            #print(f'{ea["link"]}')
            #lod = f'{rss_url} ::ERROR:: {e}'
            print(f'Exception::{e}')
            print(f'Exception::{ea.keys()}')
            #out_box.append(lod)
    #user_repo=save_data.split('datasets/',1)[1].split('/raw',1)[0]
    timestamp = str(datetime.datetime.now())
    timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
    json_object = json.dumps(out_box)
    #json_object = json.dumps(out_box,indent=4)
    with open("tmp1.json", "w") as outfile:
        outfile.write(json_object)
    api.upload_file(
        path_or_fileobj="tmp1.json",
        path_in_repo=f"rss/{timename}.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
    )
    yield out_box, [(None, f'Source is current as of:\n{timestamp} UTC\n\nThe current Date and Time is:\n{timestamp} UTC')]
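# load_data: list the dataset repo files, fetch the newest rss/<timestamp>.json
# snapshot, and report its timestamp alongside the current time.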
def load_data():
    yield None, [(None, f'Loading data source, please wait')]
    f_ist = (api.list_repo_files(repo_id=reponame, repo_type="dataset"))
    f_ist.sort(reverse=True)
    print(f_ist)
    r = requests.get(f'{save_data}{f_ist[0]}')
    lod = json.loads(r.text)
    timestamp = str(datetime.datetime.now())
    filename = f_ist[0].split("/")[1].split(".json")[0].replace("--", " ")
    print(filename)
    filename_start = filename.split(" ")[0]
    filename_end = filename.split(" ")[1]
    # rebuild "HH:MM:SS.ffffff" from the sanitized "HH-MM-SS-ffffff" filename segment
    filename_end = filename_end.replace("-", ":", 2).replace("-", ".", 1)
    #filename_end_far=filename_end.split(":")[2]
    print(filename)
    yield lod, [(None, f'Source is current as of:\n{filename_start} {filename_end} UTC\n\nThe current Date and Time is:\n{timestamp} UTC')]
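# Gradio UI: load or update the RSS snapshot into the JSON panel, then summarize it
# or run the experimental keyword search from the Instructions box.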
with gr.Blocks() as app:
    cb = gr.Chatbot(height=600, show_share_button=True, show_copy_button=True)
    with gr.Row():
        inst = gr.Textbox(label="Instructions")
        sub_btn = gr.Button("Submit")
    with gr.Row():
        load_btn = gr.Button("Load RSS")
        u_btn = gr.Button("Update [RSS Data]")
        keyw = gr.Button("Use Keyword [Experimental]")
    with gr.Row():
        out_json = gr.JSON()
        fil = gr.Textbox()
    keyw.click(get_records, [inst, out_json], [inst, cb])
    load_btn.click(load_data, None, [out_json, cb])
    u_btn.click(find_rss, None, [out_json, cb])
    sub_btn.click(summarize, [inst, cb, out_json], [inst, cb])
app.queue(default_concurrency_limit=20).launch()