# general_chat/helper_functions_api.py
# !pip install mistune
import mistune
from mistune.plugins.table import table
import re
import os
def md_to_html(md_text):
    """Render Markdown (with table support) to a single-line HTML string."""
    renderer = mistune.HTMLRenderer()
    markdown_renderer = mistune.Markdown(renderer, plugins=[table])
    html_content = markdown_renderer(md_text)
    return html_content.replace('\n', '')
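# Example (illustrative): the table plugin lets Markdown tables survive the
# conversion, and newlines are stripped so the HTML embeds on a single line.
# >>> md_to_html("# Title\n\nSome **bold** text.")
# '<h1>Title</h1><p>Some <strong>bold</strong> text.</p>'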
####------------------------------ OPTIONAL --> User ID and persistent data storage -------------------------------------####
from datetime import datetime
import psycopg2
from dotenv import load_dotenv

# Load environment variables from keys.env
load_dotenv("keys.env")
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HELICON_API_KEY = os.getenv("HELICON_API_KEY")
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
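# keys.env is expected to provide the following entries (names taken from the
# lookups above; values are placeholders):
#   TOGETHER_API_KEY=...
#   BRAVE_API_KEY=...
#   GROQ_API_KEY=...
#   HELICON_API_KEY=...
#   SUPABASE_USER=...
#   SUPABASE_PASSWORD=...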
def insert_data(user_id, user_query, subtopic_query, response, html_report):
    """Persist a chat interaction to the Supabase-hosted Postgres table."""
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432"
    )
    try:
        cur = conn.cursor()
        insert_query = """
        INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
        cur.execute(insert_query, (user_id, user_query, subtopic_query, response, html_report, datetime.now()))
        conn.commit()
        cur.close()
    finally:
        # Close the connection even if the insert fails
        conn.close()
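# The target table is assumed to look roughly like the following; column types
# are inferred from how the values are used, not taken from the live schema:
#
#   CREATE TABLE research_pro_chat_v2 (
#       user_id        TEXT,
#       user_query     TEXT,
#       subtopic_query TEXT,
#       response       TEXT,
#       html_report    TEXT,
#       created_at     TIMESTAMP
#   );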
####-----------------------------------------------------END----------------------------------------------------------####
import requests
from retry import retry
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from brave import Brave
from fuzzy_json import loads
from half_json.core import JSONFixer
from openai import OpenAI

# NLTK corpora must be downloaded once per environment:
# nltk.download('stopwords'); nltk.download('punkt')
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
SysPromptData = "You are an information retriever and summarizer; return only the factual information regarding the user query."
SysPromptDefault = "You are an expert AI; complete the given task. Do not add any additional comments."
import tiktoken  # Used to cap the number of tokens sent to the model

# No tiktoken encoding exists for Llama 3, so the GPT-3.5 encoding is used as
# an approximation; replace it if a closer tokenizer becomes available.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def limit_tokens(input_string, token_limit=7500):
    """Truncate the input to at most `token_limit` tokens."""
    return encoding.decode(encoding.encode(input_string)[:token_limit])
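# Example (illustrative): long inputs are cut at the token boundary, so the
# decoded string may end mid-sentence.
# >>> limit_tokens("one two three four five", token_limit=3)
# 'one two three'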
def together_response(message, model="meta-llama/Llama-3-8b-chat-hf", SysPrompt=SysPromptDefault, temperature=0.2, frequency_penalty=0.1, max_tokens=2000):
    """Query a Together-hosted chat model through the Helicone proxy."""
    client = OpenAI(
        api_key=TOGETHER_API_KEY,
        base_url="https://together.hconeai.com/v1",
        default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"})

    messages = [{"role": "system", "content": SysPrompt}, {"role": "user", "content": message}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        max_tokens=max_tokens  # Forward the cap so callers' limits take effect
    )
    return response.choices[0].message.content
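# Usage sketch (assumes TOGETHER_API_KEY and HELICON_API_KEY are set in
# keys.env; the prompt is illustrative):
# answer = together_response("Summarize the water cycle in two sentences.",
#                            model=llm_default_small, max_tokens=100)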
def json_from_text(text):
    """
    Extract a JSON object from free-form text using regex, falling back to
    fuzzy parsing and repair when the JSON is malformed.
    """
    match = re.search(r'\{[\s\S]*\}', text)
    json_out = match.group(0) if match else text

    try:
        # Fuzzy JSON loader tolerates minor syntax errors
        return loads(json_out)
    except Exception:
        # JSONFixer repairs even partial JSON; remove this fallback if you
        # would rather surface the exception
        fix_json = JSONFixer()
        return loads(fix_json.fix(json_out).line)
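# Example (illustrative): the first-to-last-brace span is pulled out of the
# surrounding prose and parsed.
# >>> json_from_text('Here you go: {"name": "Ada", "id": 1} Anything else?')
# {'name': 'Ada', 'id': 1}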
def remove_stopwords(text):
    """Drop English stopwords to shrink the token count before summarization."""
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)
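# Example (illustrative):
# >>> remove_stopwords("the cat sat on the mat")
# 'cat sat mat'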
def rephrase_content(data_format, content, query):
if data_format == "Structured data":
return together_response(
f"return only the factual information regarding the query: {{{query}}}. Output should be concise chunks of \
paragraphs or tables or both, using the scraped context:{{{limit_tokens(content)}}}",
SysPrompt=SysPromptData,
max_tokens=500,
)
elif data_format == "Quantitative data":
return together_response(
f"return only the numerical or quantitative data regarding the query: {{{query}}} structured into .md tables, using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}",
SysPrompt=SysPromptData,
max_tokens=500,
)
else:
return together_response(
f"return only the factual information regarding the query: {{{query}}} using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}",
SysPrompt=SysPromptData,
max_tokens=500,
)
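# Usage sketch (requires Together API access; the arguments are illustrative):
# summary = rephrase_content(data_format="Quantitative data",
#                            content=scraped_text,
#                            query="global EV sales 2023")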
class Scraper:
    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    @retry(tries=3, delay=1)
    def _get(self, url):
        # Raise on HTTP errors so @retry can re-attempt failed requests
        response = self.session.get(url, timeout=2)
        response.raise_for_status()
        return response.text

    def fetch_content(self, url):
        # Catch outside the retried helper so retries are not short-circuited
        try:
            return self._get(url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
            return None
def extract_main_content(html):
    """Extract heading, paragraph, and table text from an HTML document."""
    if html:
        plain_text = ""
        soup = BeautifulSoup(html, 'lxml')
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
            plain_text += element.get_text(separator=" ", strip=True) + "\n"
        return plain_text
    return ""
def process_content(data_format, url, query):
scraper = Scraper()
html_content = scraper.fetch_content(url)
if html_content:
content = extract_main_content(html_content)
if content:
rephrased_content = rephrase_content(
data_format=data_format,
content=limit_tokens(remove_stopwords(content), token_limit=1000),
query=query,
)
return rephrased_content, url
return "", url
def fetch_and_extract_content(data_format, urls, query):
    if not urls:
        return []  # ThreadPoolExecutor requires max_workers >= 1
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        future_to_url = {
            executor.submit(process_content, data_format, url, query): url
            for url in urls
        }
        all_text_with_urls = [future.result() for future in as_completed(future_to_url)]

    return all_text_with_urls
def search_brave(query, num_results=5):
    """Return the top result URLs from a Brave web search."""
    brave = Brave(BRAVE_API_KEY)
    search_results = brave.search(q=query, count=num_results)
    return [str(url) for url in search_results.urls]
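if __name__ == "__main__":
    # Minimal end-to-end sketch (assumes BRAVE_API_KEY, TOGETHER_API_KEY, and
    # HELICON_API_KEY are configured; the query is illustrative): search,
    # scrape, and summarize a single query.
    demo_query = "latest developments in solid-state batteries"
    urls = search_brave(demo_query, num_results=3)
    results = fetch_and_extract_content("Structured data", urls, demo_query)
    for text, url in results:
        print(f"{url}\n{text[:200]}\n")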