code-execution-files / helper_functions_api.py
pvanand's picture
Update helper_functions_api.py
1f1d19b verified
raw
history blame
7.2 kB
# !pip install mistune
import mistune
from mistune.plugins.table import table
from jinja2 import Template
import re
import os
def md_to_html(md_text):
    """Render Markdown text (tables included) to single-line HTML.

    A mistune ``Markdown`` parser is built with an ``HTMLRenderer`` and the
    ``table`` plugin, applied to ``md_text``, and every newline character is
    then stripped from the resulting HTML.

    Args:
        md_text: Markdown source text.

    Returns:
        The rendered HTML with all ``\\n`` characters removed.
    """
    to_html = mistune.Markdown(mistune.HTMLRenderer(), plugins=[table])
    rendered = to_html(md_text)
    # Splitting on newlines and re-joining drops every line break.
    return ''.join(rendered.split('\n'))
####------------------------------ OPTIONAL--> User id and persistent data storage-------------------------------------####
from datetime import datetime
import psycopg2
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the keys.env file
load_dotenv("keys.env")
# API keys are read with os.getenv, so an unset variable yields None here
# instead of raising at import time.
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HELICON_API_KEY = os.getenv("HELICON_API_KEY")
# Database credentials are read with os.environ[...], so importing this module
# raises KeyError when either variable is unset.
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
def insert_data(user_id, user_query, subtopic_query, response, html_report):
    """Insert one chat record into the ``research_pro_chat_v2`` table.

    Opens a new PostgreSQL connection with the module-level Supabase
    credentials, inserts a single row stamped with ``datetime.now()``,
    commits, and closes the connection.

    Args:
        user_id: Value stored in the ``user_id`` column.
        user_query: Value stored in the ``user_query`` column.
        subtopic_query: Value stored in the ``subtopic_query`` column.
        response: Value stored in the ``response`` column.
        html_report: Value stored in the ``html_report`` column.

    Raises:
        Any exception raised by psycopg2 while connecting, executing or
        committing propagates to the caller; the connection is still closed.
    """
    # Connect to your database
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432",
    )
    insert_query = """
    INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
    VALUES (%s, %s, %s, %s, %s, %s);
    """
    try:
        # The cursor context manager closes the cursor even if execute() fails.
        with conn.cursor() as cur:
            cur.execute(
                insert_query,
                (user_id, user_query, subtopic_query, response, html_report, datetime.now()),
            )
        conn.commit()
    finally:
        # Always release the connection so a failed insert does not leak it.
        conn.close()
####-----------------------------------------------------END----------------------------------------------------------####
import ast
from fpdf import FPDF
import re
import pandas as pd
import nltk
import requests
import json
from retry import retry
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from brave import Brave
from fuzzy_json import loads
from half_json.core import JSONFixer
from openai import OpenAI
from together import Together
# Model identifier strings; llm_default_small has the same value as the
# default `model` argument of together_response.
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
# System prompt passed by rephrase_content.
SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
# Default system prompt of together_response.
SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
import tiktoken # Used to limit tokens
# Tokenizer used by limit_tokens to count and truncate tokens.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # Instead of Llama3 using available option/ replace if found anything better
def limit_tokens(input_string, token_limit=7500):
    """
    Limit tokens sent to the model

    The text is tokenized with the module-level ``encoding`` and cut to at
    most ``token_limit`` tokens.

    Args:
        input_string: Text to truncate.
        token_limit: Maximum number of tokens to keep.

    Returns:
        ``input_string`` unchanged when it already fits within the limit,
        otherwise the decoded text of its first ``token_limit`` tokens.
    """
    # disallowed_special=() makes special-token strings such as
    # "<|endoftext|>" encode as ordinary text; with the default setting,
    # encode() raises ValueError when the input contains them.
    tokens = encoding.encode(input_string, disallowed_special=())
    if len(tokens) <= token_limit:
        # Nothing to cut: skip the decode round-trip.
        return input_string
    return encoding.decode(tokens[:token_limit])
def together_response(
    message,
    model="meta-llama/Llama-3-8b-chat-hf",
    SysPrompt=SysPromptDefault,
    temperature=0.2,
    frequency_penalty=0.1,
    max_tokens=2000,
):
    """Send a single-turn chat completion request and return the reply text.

    The request goes through an OpenAI-compatible client pointed at
    ``https://together.hconeai.com/v1``, authenticated with
    ``TOGETHER_API_KEY`` and carrying a ``Helicone-Auth`` header built from
    ``HELICON_API_KEY``.

    Args:
        message: User message content.
        model: Model identifier sent with the request.
        SysPrompt: System message content.
        temperature: Sampling temperature forwarded to the API.
        frequency_penalty: Frequency penalty forwarded to the API.
        max_tokens: Upper bound on generated tokens forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    client = OpenAI(
        api_key=TOGETHER_API_KEY,
        base_url="https://together.hconeai.com/v1",
        default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"},
    )
    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        # Previously accepted but never forwarded, so callers' limits were ignored.
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
def json_from_text(text):
    """
    Pull a JSON value out of free-form text.

    The span from the first ``{`` to the last ``}`` is taken as the candidate
    (the whole text when no such span exists) and parsed with the fuzzy JSON
    loader. If that fails, the candidate is passed through ``JSONFixer`` and
    the repaired line is parsed instead; an error from that second attempt
    propagates to the caller.
    """
    found = re.search(r'\{[\s\S]*\}', text)
    candidate = found.group(0) if found else text
    try:
        # First attempt: fuzzy JSON loader on the raw candidate.
        return loads(candidate)
    except Exception:
        # Fallback: repair partial/half JSON, then parse the repaired line.
        repaired = JSONFixer().fix(candidate)
        return loads(repaired.line)
def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with ``word_tokenize``; tokens whose lower-cased
    form appears in ``stopwords.words('english')`` are discarded and the
    remaining tokens are joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return ' '.join(kept)
def rephrase_content(data_format, content, query):
    """Ask the model to restate scraped content with respect to a query.

    The prompt depends on ``data_format``:

    * ``"Structured data"``: concise paragraphs and/or tables; the context is
      cut with ``limit_tokens`` at its default limit.
    * ``"Quantitative data"``: numerical data as ``.md`` tables; the context
      is cut to 1000 tokens.
    * anything else: plain factual information; the context is cut to 1000
      tokens.

    Every variant is sent through ``together_response`` with
    ``SysPromptData`` as the system prompt and ``max_tokens=500``.

    Returns:
        Whatever ``together_response`` returns for the chosen prompt.
    """
    if data_format == "Structured data":
        prompt = (
            f"return only the factual information regarding the query: {{{query}}}. Output should be concise chunks of "
            f"paragraphs or tables or both, using the scraped context:{{{limit_tokens(content)}}}"
        )
    elif data_format == "Quantitative data":
        prompt = f"return only the numerical or quantitative data regarding the query: {{{query}}} structured into .md tables, using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}"
    else:
        prompt = f"return only the factual information regarding the query: {{{query}}} using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}"
    # Only the prompt differs between formats; the call itself is shared.
    return together_response(prompt, SysPrompt=SysPromptData, max_tokens=500)
class Scraper:
    """Fetch page HTML through a ``requests`` session with a fixed User-Agent."""

    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        """Create the session and set its ``User-Agent`` header."""
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    @retry(exceptions=requests.exceptions.RequestException, tries=3, delay=1)
    def _get(self, url):
        """Issue the GET request, letting request errors propagate.

        The exception must escape this method for ``@retry`` to see it;
        catching it inside the decorated function disables the retries.
        """
        return self.session.get(url, timeout=2)

    def fetch_content(self, url):
        """Return the body text of ``url``, or ``None`` on failure.

        Request errors are retried (3 tries, 1 second apart). If they persist,
        the error is printed and ``None`` is returned. A response whose status
        code is not 200 also yields ``None``.
        """
        try:
            response = self._get(url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
            return None
        if response.status_code == 200:
            return response.text
        return None
def extract_main_content(html):
    """Extract heading, paragraph and table text from an HTML document.

    Args:
        html: HTML source; a falsy value produces an empty string.

    Returns:
        The text of every ``h1``-``h6``, ``p`` and ``table`` element found by
        ``find_all``, each followed by a newline. Text pieces inside an
        element are stripped and joined with single spaces.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, 'lxml')
    wanted_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']
    pieces = [
        node.get_text(separator=" ", strip=True) + "\n"
        for node in soup.find_all(wanted_tags)
    ]
    return "".join(pieces)
def process_content(data_format, url, query):
    """Fetch one URL and return a model-rephrased summary of its content.

    The page is downloaded with a fresh ``Scraper`` and its main text is
    extracted. That text has stop words removed and is cut to 1000 tokens
    before being passed to ``rephrase_content``.

    Returns:
        ``(rephrased_content, url)``; the first element is ``""`` when the
        page could not be fetched or no text was extracted.
    """
    page_html = Scraper().fetch_content(url)
    if not page_html:
        return "", url

    page_text = extract_main_content(page_html)
    if not page_text:
        return "", url

    trimmed = limit_tokens(remove_stopwords(page_text), token_limit=1000)
    summary = rephrase_content(data_format=data_format, content=trimmed, query=query)
    return summary, url
def fetch_and_extract_content(data_format, urls, query):
    """Run ``process_content`` for every URL concurrently.

    One worker thread is used per URL.

    Args:
        data_format: Forwarded to ``process_content``.
        urls: Sized collection of URLs to process.
        query: Forwarded to ``process_content``.

    Returns:
        A list of the ``process_content`` results in completion order (not
        input order). An empty ``urls`` yields an empty list.

    Raises:
        Any exception raised inside ``process_content`` is re-raised by
        ``future.result()``.
    """
    # ThreadPoolExecutor rejects max_workers=0, so short-circuit empty input.
    if not urls:
        return []
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = [
            executor.submit(process_content, data_format, url, query)
            for url in urls
        ]
        return [future.result() for future in as_completed(futures)]
def search_brave(query, num_results=5):
    """Search with the Brave client and return the result URLs as strings.

    Args:
        query: Search query, passed as ``q``.
        num_results: Number of results requested, passed as ``count``.

    Returns:
        A list with the string form of each entry in the result's ``urls``.
    """
    client = Brave(BRAVE_API_KEY)
    results = client.search(q=query, count=num_results)
    return [str(link) for link in results.urls]