Spaces:
Sleeping
Sleeping
# !pip install mistune | |
import mistune | |
from mistune.plugins.table import table | |
from jinja2 import Template | |
import re | |
import os | |
def md_to_html(md_text):
    """Render Markdown text (tables enabled) as a single-line HTML string.

    Args:
        md_text: Markdown source to convert.

    Returns:
        The rendered HTML with every newline character removed.
    """
    to_html = mistune.Markdown(mistune.HTMLRenderer(), plugins=[table])
    rendered = to_html(md_text)
    # Drop all line breaks so the markup comes back as one continuous line.
    return "".join(rendered.split('\n'))
####------------------------------ OPTIONAL--> User id and persistent data storage-------------------------------------####
from datetime import datetime | |
import psycopg2 | |
from dotenv import load_dotenv, find_dotenv | |
# Read settings from the local "keys.env" file into the process environment.
load_dotenv("keys.env")

# Service API keys: each resolves to None when its variable is not set.
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY')
BRAVE_API_KEY = os.environ.get('BRAVE_API_KEY')
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
HELICON_API_KEY = os.environ.get("HELICON_API_KEY")

# Database credentials are looked up with item access, so a missing variable
# raises KeyError as soon as this module is imported.
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
def insert_data(user_id, user_query, subtopic_query, response, html_report):
    """Store one research interaction in the ``research_pro_chat_v2`` table.

    Opens a new PostgreSQL connection using the module-level Supabase
    credentials, inserts a single row stamped with ``datetime.now()``,
    commits, and closes the connection.

    Args:
        user_id: Identifier stored in the ``user_id`` column.
        user_query: Value stored in the ``user_query`` column.
        subtopic_query: Value stored in the ``subtopic_query`` column.
        response: Value stored in the ``response`` column.
        html_report: Value stored in the ``html_report`` column.

    Raises:
        Any exception raised by psycopg2 while connecting, executing or
        committing is propagated to the caller. The cursor and connection
        are released in every case.
    """
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432",
    )
    # Values are passed as bound parameters, never interpolated into the SQL.
    insert_query = """
    INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
    VALUES (%s, %s, %s, %s, %s, %s);
    """
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                insert_query,
                (user_id, user_query, subtopic_query, response, html_report, datetime.now()),
            )
            conn.commit()
        finally:
            cur.close()
    finally:
        # Always release the connection; closing without a commit discards
        # the pending transaction, so a failed insert leaves nothing behind.
        conn.close()
####-----------------------------------------------------END----------------------------------------------------------#### | |
import ast | |
from fpdf import FPDF | |
import re | |
import pandas as pd | |
import nltk | |
import requests | |
import json | |
from retry import retry | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from bs4 import BeautifulSoup | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
from brave import Brave | |
from fuzzy_json import loads | |
from half_json.core import JSONFixer | |
from openai import OpenAI | |
from together import Together | |
# Default small and medium Llama 3 chat model identifiers.
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"

# System prompt used when summarizing scraped content for a query.
SysPromptData = (
    "You are an information retriever and summarizer, "
    "return only the factual information regarding the user query"
)

# General-purpose system prompt (default for together_response).
SysPromptDefault = (
    "You are an expert AI, complete the given task. "
    "Do not add any additional comments."
)
import tiktoken  # Tokenizer library used to cap prompt length

# Shared tokenizer for limit_tokens. The gpt-3.5-turbo encoding stands in for
# a Llama 3 tokenizer, so token counts are approximate for the Llama models;
# replace it if a better-matching encoding becomes available.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def limit_tokens(input_string, token_limit=7500):
    """Truncate text to at most ``token_limit`` tokens.

    The text is tokenized with the module-level ``encoding``, cut to the
    first ``token_limit`` tokens, and decoded back to a string.

    Args:
        input_string: Text to truncate.
        token_limit: Maximum number of tokens to keep.

    Returns:
        The decoded text of the retained tokens.
    """
    token_ids = encoding.encode(input_string)
    kept_ids = token_ids[:token_limit]
    return encoding.decode(kept_ids)
def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPrompt = SysPromptDefault, temperature=0.2, frequency_penalty =0.1, max_tokens= 2000):
    """Send a single-turn chat request and return the model's reply text.

    The request goes through an OpenAI-compatible client pointed at
    ``https://together.hconeai.com/v1``, authenticated with
    ``TOGETHER_API_KEY`` and carrying a ``Helicone-Auth`` header.

    Args:
        message: User message content.
        model: Model identifier to query.
        SysPrompt: System prompt placed before the user message.
        temperature: Sampling temperature forwarded to the API.
        frequency_penalty: Frequency penalty forwarded to the API.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The content of the first choice in the completion response.
    """
    client = OpenAI(
        api_key=TOGETHER_API_KEY,
        base_url="https://together.hconeai.com/v1",
        default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"},
    )
    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        # Previously accepted but silently ignored; callers rely on it to
        # cap response length (e.g. max_tokens=500 for summaries).
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
def json_from_text(text):
    """Pull a JSON object out of free-form text and parse it leniently.

    The span from the first ``{`` to the last ``}`` is taken as the
    candidate; when no such span exists the whole text is used. The
    candidate is parsed with the fuzzy JSON loader, and if that fails it is
    first repaired with ``JSONFixer`` and then parsed again.

    Args:
        text: Text that is expected to contain a JSON object.

    Returns:
        The parsed JSON value.

    Raises:
        Whatever the repair or second parse raises if the fallback also
        fails.
    """
    found = re.search(r'\{[\s\S]*\}', text)
    candidate = found.group(0) if found else text
    try:
        # First attempt: fuzzy JSON loader on the raw candidate.
        return loads(candidate)
    except Exception:
        # Fallback: repair truncated/partial JSON, then parse the result.
        repaired = JSONFixer().fix(candidate)
        return loads(repaired.line)
def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with NLTK's ``word_tokenize``; tokens whose lowercase
    form is in NLTK's English stop-word list are dropped, and the remaining
    tokens are joined with single spaces.

    Args:
        text: Input text.

    Returns:
        The space-joined tokens that are not stop words.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
def rephrase_content(data_format, content, query):
    """Ask the model to distill scraped content into facts about a query.

    The prompt wording and the context token cap depend on ``data_format``:
    "Structured data" asks for paragraphs and/or tables and uses the default
    cap of ``limit_tokens``; "Quantitative data" asks for numeric data as
    Markdown tables with a 1000-token cap; any other value asks for plain
    factual information with a 1000-token cap.

    Args:
        data_format: Requested output style (see above).
        content: Scraped text used as context.
        query: The user query the summary should address.

    Returns:
        The text returned by ``together_response``.
    """
    if data_format == "Structured data":
        prompt = (
            f"return only the factual information regarding the query: {{{query}}}. Output should be concise chunks of "
            f"paragraphs or tables or both, using the scraped context:{{{limit_tokens(content)}}}"
        )
    elif data_format == "Quantitative data":
        prompt = f"return only the numerical or quantitative data regarding the query: {{{query}}} structured into .md tables, using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}"
    else:
        prompt = f"return only the factual information regarding the query: {{{query}}} using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}"

    # All variants share the same system prompt and response-length budget.
    return together_response(prompt, SysPrompt=SysPromptData, max_tokens=500)
class Scraper:
    """Small HTTP fetcher built on a ``requests`` session with a fixed User-Agent."""

    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        """Create the session and set its ``User-Agent`` header.

        Args:
            user_agent: Value sent in the ``User-Agent`` header.
        """
        session = requests.Session()
        session.headers.update({"User-Agent": user_agent})
        self.session = session

    def fetch_content(self, url):
        """Fetch ``url`` and return the response body.

        Args:
            url: Address to request (2-second timeout).

        Returns:
            The response text when the status code is 200; ``None`` for any
            other status or when the request raises a ``requests`` error
            (which is printed, not propagated).
        """
        try:
            reply = self.session.get(url, timeout=2)
            page = reply.text if reply.status_code == 200 else None
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
            page = None
        return page
def extract_main_content(html):
    """Extract the text of headings, paragraphs and tables from HTML.

    Args:
        html: HTML markup; a falsy value (``None`` or empty) yields ``""``.

    Returns:
        The text of every ``h1``-``h6``, ``p`` and ``table`` element in
        document order, each followed by a newline.
    """
    if not html:
        return ""

    wanted_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']
    soup = BeautifulSoup(html, 'lxml')
    pieces = [
        node.get_text(separator=" ", strip=True) + "\n"
        for node in soup.find_all(wanted_tags)
    ]
    return "".join(pieces)
def process_content(data_format, url, query):
    """Fetch one URL and summarize its content with respect to ``query``.

    The page is downloaded, reduced to heading/paragraph/table text,
    stripped of stop words, capped at 1000 tokens, and then passed to
    ``rephrase_content``.

    Args:
        data_format: Output style forwarded to ``rephrase_content``.
        url: Page to fetch.
        query: The user query the summary should address.

    Returns:
        A ``(summary, url)`` tuple; ``summary`` is ``""`` when the page
        could not be fetched or yielded no extractable text.
    """
    page_html = Scraper().fetch_content(url)
    if not page_html:
        return "", url

    page_text = extract_main_content(page_html)
    if not page_text:
        return "", url

    # Shrink the context before it reaches the model.
    condensed = limit_tokens(remove_stopwords(page_text), token_limit=1000)
    summary = rephrase_content(
        data_format=data_format,
        content=condensed,
        query=query,
    )
    return summary, url
def fetch_and_extract_content(data_format, urls, query):
    """Process several URLs concurrently and collect their summaries.

    Each URL is handed to ``process_content`` on its own worker thread.

    Args:
        data_format: Output style forwarded to ``process_content``.
        urls: Sized collection of URLs to process.
        query: The user query the summaries should address.

    Returns:
        A list of the ``process_content`` results, in completion order
        (not input order). An empty ``urls`` yields an empty list.

    Raises:
        Any exception raised by ``process_content`` for a URL is re-raised
        when its result is collected.
    """
    # ThreadPoolExecutor rejects max_workers=0, so an empty search result
    # must be short-circuited instead of crashing with ValueError.
    if not urls:
        return []

    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        future_to_url = {
            executor.submit(process_content, data_format, url, query): url
            for url in urls
        }
        all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
    return all_text_with_urls
def search_brave(query, num_results=5):
    """Search Brave for ``query`` and return the result URLs as strings.

    Args:
        query: Search query text.
        num_results: Number of results requested from the API.

    Returns:
        A list with the string form of each URL in the search results.
    """
    client = Brave(BRAVE_API_KEY)
    results = client.search(q=query, count=num_results)
    return [str(link) for link in results.urls]