import gradio as gr
import requests
import bs4
import lxml
import os
from huggingface_hub import InferenceClient, HfApi
import random
import json
import datetime
import xmltodict

from prompts import (
    GET_KEYWORD,
    COMPRESS_HISTORY_PROMPT,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    PREFIX_ALT,
    PREFIX,
)

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
reponame = "Omnibus/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN']
api = HfApi(token=token_self)


def parse_action(string: str):
    # Split an agent response of the form "action: NAME action_input=VALUE"
    # into its action name and (optional) action input.
    print("PARSING:")
    print(string)
    assert string.startswith("action:")
    idx = string.find("action_input=")
    print(idx)
    if idx == -1:
        print("idx == -1")
        print(string[8:])
        return string[8:], None
    print("last return:")
    print(string[8 : idx - 1])
    print(string[idx + 13 :].strip("'").strip('"'))
    return string[8 : idx - 1], string[idx + 13 :].strip("'").strip('"')


MAX_HISTORY = 100
MAX_DATA = 40000


def format_prompt(message, history):
    prompt = ""
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response} "
    prompt += f"[INST] {message} [/INST]"
    return prompt


def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    purpose,
    prefix_tog,
    **prompt_kwargs,
):
    # Build the prompt from the selected prefix plus the template, then stream
    # a completion from the inference endpoint and return the full text.
    timestamp = datetime.datetime.now()
    print(seed)
    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    print(f'prefix_tog:: {prefix_tog}')
    if prefix_tog == "normal":
        content = PREFIX.format(
            timestamp=timestamp,
            purpose=purpose,
        ) + prompt_template.format(**prompt_kwargs)
    if prefix_tog == "alternate":
        content = PREFIX_ALT + prompt_template.format(**prompt_kwargs)
    #formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    #formatted_prompt = format_prompt(f'{content}', **prompt_kwargs['history'])
    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
        #yield resp
    return resp


def compress_data(c, purpose, task, history):
    # Split `history` into MAX_DATA-sized chunks and run the compression prompt
    # over each chunk, carrying the previous result forward as `knowledge`.
    seed = random.randint(1, 1000000000)
    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    #out=""
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        print(f'hist::\n{hist}')
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=2048,
            seed=seed,
            purpose=purpose,
            prefix_tog="normal",
            task=task,
            knowledge=new_history,
            history=hist,
        ).strip("\n")
        new_history = resp
        print(resp)
        out.append(resp)
        e = e + chunk
        s = s + chunk
    '''
    resp = run_gpt(
        COMPRESS_DATA_PROMPT,
        stop_tokens=["observation:", "task:", "action:", "thought:"],
        max_tokens=2048,
        seed=seed,
        purpose=purpose,
        task=task,
        knowledge=new_history,
        history=result,
    )
    '''
    print("final" + resp)
    history = "result: {}\n".format(resp)
    return history
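# Worked example of the chunking arithmetic above (illustrative note, not used by the app):
# with MAX_DATA = 40000 and a character count c = 100000, divr = 100000 / 40000 = 2.5,
# divi = int(2.5) + 1 = 3, and chunk = int(100000 / 2.5) = 40000, so the loop slices
# history[0:40000], history[40000:80000], and history[80000:120000] (slicing past the end
# of a Python string is safe). When c <= MAX_DATA, divi is 1 and the data is processed
# in a single pass.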
or i=="<": c +=1 print (f'c:: {c}') divr=int(c)/MAX_DATA divi=int(divr)+1 if divr != int(divr) else int(divr) chunk = int(int(c)/divr) print(f'chunk:: {chunk}') print(f'divr:: {divr}') print (f'divi:: {divi}') s=0 e=chunk print(f'e:: {e}') new_history="" #task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n' for z in range(divi): print(f's:e :: {s}:{e}') hist = out[s:e] print(f'hist::\n{hist}') resp = run_gpt( GET_KEYWORD, stop_tokens=[], max_tokens=2048, seed=seed, purpose=inp, prefix_tog="alternate", task=inp, knowledge=new_history, history=hist, ).strip("\n") new_history = resp print (f'resp {z}::\n {resp}') #out+=resp e=e+chunk s=s+chunk yield "", [(inp,new_history)] def get_key(inp,data): key_box=[] seed=random.randint(1,1000000000) key_w = run_gpt( GET_KEYWORD, stop_tokens=[], max_tokens=56, seed=seed, purpose=inp, prefix_tog="normal", task=inp, ).split("<")[0] print(f'key_w::{key_w}') if " " in key_w: key_w=key_w.split(" ")[-1] for i,ba in enumerate(data): each_key=data[i].keys() print(each_key) for z,zz in enumerate(list(each_key)[0]): #for f,ff in enumerate(data[i][zz]): ea = data[i][list(each_key)[0]][z] try: if ea['title'] and key_w in ea['title']: key_box.append(ea) elif ea['description'] and key_w in ea['description']: key_box.append(ea) elif ea['link'] and key_w in ea['link']: key_box.append(ea) except Exception as e: print(e) print(key_box) def summarize(inp,history,data=None): json_box=[] if inp == "": inp = "Process this data" #inp = format_prompt(inp,history) task = "Compile a detailed report" history.clear() yield "",[(inp,"Working on it...")] if data != "Error" and data != "": print(inp) out = str(data) rl = len(out) print(f'rl:: {rl}') c=1 for i in str(out): if i == " " or i=="," or i=="\n" or i=="/" or i=="." 
or i=="<": c +=1 print (f'c:: {c}') #json_out = compress_data(c,inp,task,out) #def compress_data(c,purpose, task, history): purpose=inp seed=random.randint(1,1000000000) print (c) divr=int(c)/MAX_DATA divi=int(divr)+1 if divr != int(divr) else int(divr) chunk = int(int(c)/divr) print(f'chunk:: {chunk}') print(f'divr:: {divr}') print (f'divi:: {divi}') #out="" s=0 e=chunk print(f'e:: {e}') new_history="" task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n' for z in range(divi): print(f's:e :: {s}:{e}') mes= f'Working on data chunk: {s}:{e}' hist = out[s:e] print(f'hist::\n{hist}') yield "", [(inp,f'{mes}\n{new_history}')] resp = run_gpt( COMPRESS_DATA_PROMPT, stop_tokens=[], max_tokens=2048, seed=seed, purpose=purpose, prefix_tog="normal", task=task, knowledge=new_history, history=hist, ) new_history = resp print (resp) out+=resp e=e+chunk s=s+chunk #history = "preliminary result: {}\n".format(resp) #yield "", (inp,f'{mes}\n{history}') print ("final" + resp) out_hist = "result:\n{}".format(resp) #return history yield "", [(inp,out_hist)] out = str(out_hist) rawp = out else: rawp = "Provide a valid data source" history.append((inp,rawp)) yield "", history def find_rss(): lod="" out_box=[] yield [],[(None,"loading sources")] with open ('feeds.json','r') as j: cont = json.loads(j.read()) #print(cont) for ea in cont: #lod="" print (ea['link']) rss_url=ea['link'] link_box=[] r = requests.get(f'{rss_url}') if r.status_code == 200: try: if ".json" in rss_url: lod = json.loads(r.text) if ".xml" in rss_url: lod = xmltodict.parse(r.content) if ".rss" in rss_url: lod = xmltodict.parse(r.content) else: try: lod = xmltodict.parse(r.content) except Exception as e: lod=f'{rss_url} ::ERROR:: {e}' except Exception as e: lod=f'{rss_url} ::ERROR:: {e}' else: lod = f'{rss_url} ::ERROR::COULD NOT CONNECT:: {r.status_code}' pass try: print(lod['rss']['channel']['item'][0].keys()) print(lod['rss'].keys()) for i,ea in enumerate(lod['rss']['channel']['item']): try: r_link = ea['link'] r_title = ea['title'] r_description = ea['description'] lods = {"title":r_title, "description":r_description,"link":r_link} except Exception: try: r_link = ea['link'] r_title = ea['source'] r_description = 'No Description provided' lods = {"title":r_title, "description":r_description,"link":r_link} except Exception as e: print(e) pass #lods = {"title":"ERROR", "description":{e},"link":"ERROR"} """ r_link = lod['rss']['channel']['item'][i]['link'] r_title = lod['rss']['channel']['item'][i]['title'] r_description = lod['rss']['channel']['item'][i]['description']""" link_box.append(lods) lod={lod['rss']['channel']['title']:link_box} out_box.append(lod) except Exception as e: #print(f'{ea["source"]}') #print(f'{ea["link"]}') #lod = f'{rss_url} ::ERROR:: {e}' print(f'Exception::{e}') print(f'Exception::{ea.keys()}') #out_box.append(lod) #user_repo=save_data.split('datasets/',1)[1].split('/raw',1)[0] timestamp=str(datetime.datetime.now()) timename=timestamp.replace(" ","--").replace(":","-").replace(".","-") json_object = json.dumps(out_box) #json_object = json.dumps(out_box,indent=4) with open("tmp1.json", "w") as outfile: outfile.write(json_object) api.upload_file( path_or_fileobj="tmp1.json", path_in_repo=f"/rss/{timename}.json", repo_id=reponame, #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0], token=token_self, repo_type="dataset", ) yield out_box,[(None,f'Source is current as of:\n{timestamp} UTC\n\nThe current Date and Time is:\n{timestamp} UTC')] def load_data(): yield 
def load_data():
    # Load the most recent saved RSS snapshot from the dataset repo and report
    # how old it is relative to the current time.
    yield None, [(None, f'Loading data source, please wait')]
    f_ist = (api.list_repo_files(repo_id=reponame, repo_type="dataset"))
    f_ist.sort(reverse=True)
    print(f_ist)
    r = requests.get(f'{save_data}{f_ist[0]}')
    lod = json.loads(r.text)
    timestamp = str(datetime.datetime.now())
    filename = f_ist[0].split("/")[1].split(".json")[0].replace("--", " ")
    print(filename)
    filename_start = filename.split(" ")[0]
    filename_end = filename.split(" ")[1]
    # Restore the time separators that were swapped for dashes when the file was saved
    # (HH-MM-SS-microseconds -> HH:MM:SS.microseconds).
    filename_end = filename_end.replace("-", ":", 2).replace("-", ".")
    #filename_end_far=filename_end.split(":")[2]
    print(filename)
    yield lod, [(None, f'Source is current as of:\n{filename_start} {filename_end} UTC\n\nThe current Date and Time is:\n{timestamp} UTC')]


with gr.Blocks() as app:
    cb = gr.Chatbot(height=600, show_share_button=True, show_copy_button=True)
    with gr.Row():
        inst = gr.Textbox(label="Instructions")
        sub_btn = gr.Button("Submit")
    with gr.Row():
        load_btn = gr.Button("Load RSS")
        u_btn = gr.Button("Update [RSS Data]")
        keyw = gr.Button("Use Keyword [Experimental]")
    with gr.Row():
        out_json = gr.JSON()
        fil = gr.Textbox()
    keyw.click(get_records, [inst, out_json], [inst, cb])
    load_btn.click(load_data, None, [out_json, cb])
    u_btn.click(find_rss, None, [out_json, cb])
    sub_btn.click(summarize, [inst, cb, out_json], [inst, cb])

app.queue(default_concurrency_limit=20).launch()
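# find_rss reads its sources from a local feeds.json, which is not shown here. A minimal
# hypothetical example consistent with the `ea['link']` access above would be:
#   [
#     {"link": "https://example.com/feed.rss"},
#     {"link": "https://example.com/rss.xml"}
#   ]
# Only the "link" key is read; any extra keys per entry are ignored by find_rss.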