# !pip install mistune
import mistune
from mistune.plugins.table import table
from jinja2 import Template
import re
import os


def md_to_html(md_text):
    """Render a Markdown string to single-line HTML (table plugin enabled).

    Newlines are stripped from the rendered output so the HTML can be
    embedded where a single line is expected (e.g. a DB column).
    """
    renderer = mistune.HTMLRenderer()
    markdown_renderer = mistune.Markdown(renderer, plugins=[table])
    html_content = markdown_renderer(md_text)
    return html_content.replace('\n', '')


####------------------------------ OPTIONAL--> User id and persistant data storage-------------------------------------####
from datetime import datetime
import psycopg2
from dotenv import load_dotenv, find_dotenv

# Load environment variables from .env file
load_dotenv("keys.env")
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HELICON_API_KEY = os.getenv("HELICON_API_KEY")
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']


def insert_data(user_id, user_query, subtopic_query, response, html_report):
    """Persist one chat interaction to the research_pro_chat_v2 table.

    All values are passed through parameterized SQL (no injection risk);
    created_at is stamped client-side with datetime.now().

    BUG FIX: the original opened the connection/cursor without any
    try/finally, so an exception in execute()/commit() leaked the
    connection.  `with` blocks now guarantee commit-on-success,
    rollback-on-error, and cleanup in all cases.
    """
    insert_query = """
    INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
    VALUES (%s, %s, %s, %s, %s, %s);
    """
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432",
    )
    try:
        # psycopg2's connection `with` commits on success / rolls back on
        # error, but does NOT close the connection — hence the outer finally.
        with conn:
            with conn.cursor() as cur:
                cur.execute(
                    insert_query,
                    (user_id, user_query, subtopic_query, response, html_report, datetime.now()),
                )
    finally:
        conn.close()


####-----------------------------------------------------END----------------------------------------------------------####
import ast
from fpdf import FPDF
import re
import pandas as pd
import nltk
import requests
import json
from retry import retry
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from brave import Brave
from fuzzy_json import loads
from half_json.core import JSONFixer
from openai import OpenAI
from together import Together

llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"

SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."

import tiktoken  # Used to limit tokens

# gpt-3.5-turbo encoding used as a stand-in for Llama-3 (tiktoken has no
# Llama tokenizer); counts are approximate but close enough for truncation.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def limit_tokens(input_string, token_limit=7500):
    """Return input_string truncated to at most token_limit tokens."""
    return encoding.decode(encoding.encode(input_string)[:token_limit])


def together_response(message, model="meta-llama/Llama-3-8b-chat-hf", SysPrompt=SysPromptDefault,
                      temperature=0.2, frequency_penalty=0.1, max_tokens=2000):
    """Send one system+user exchange to Together (via Helicone proxy).

    Returns the completion text.

    BUG FIX: `max_tokens` was accepted by the original signature but never
    forwarded to the API call, so the 2000-token cap was silently ignored.
    It is now passed through to chat.completions.create.
    """
    client = OpenAI(
        api_key=TOGETHER_API_KEY,
        base_url="https://together.hconeai.com/v1",
        default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"},
    )
    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content


def json_from_text(text):
    """Extract JSON from text, falling back to regex + fuzzy JSON loading.

    BUG FIX: the original bare `except:` swallowed every exception
    (including KeyboardInterrupt); only JSON parse failures (ValueError)
    are caught now.
    """
    try:
        return json.loads(text)
    except ValueError:
        # Grab the outermost {...} span if present, then let the fuzzy
        # loader repair whatever malformed JSON remains.
        match = re.search(r'\{[\s\S]*\}', text)
        json_out = match.group(0) if match else text
        return loads(json_out)


def remove_stopwords(text):
    """Strip English stopwords from text (case-insensitive)."""
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)


def rephrase_content(data_format, content, query):
    """Summarize scraped content for the query in the requested format."""
    if data_format == "Structured data":
        return together_response(
            f"return only the factual information regarding the query: {{{query}}}. "
            f"Output should be concise chunks of paragraphs or tables or both, "
            f"using the scraped context:{{{limit_tokens(content)}}}",
            SysPrompt=SysPromptData,
            max_tokens=500,
        )
    elif data_format == "Quantitative data":
        return together_response(
            f"return only the numerical or quantitative data regarding the query: {{{query}}} "
            f"structured into .md tables, using the scraped context:{{{limit_tokens(content, token_limit=1000)}}}",
            SysPrompt=SysPromptData,
            max_tokens=500,
        )
    else:
        return together_response(
            f"return only the factual information regarding the query: {{{query}}} "
            f"using the scraped context:{{{limit_tokens(content, token_limit=1000)}}}",
            SysPrompt=SysPromptData,
            max_tokens=500,
        )


class Scraper:
    """Thin requests.Session wrapper with a fixed User-Agent."""

    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    @retry(tries=3, delay=1)
    def fetch_content(self, url):
        """Return page HTML, or None on non-200 status or request error.

        NOTE(review): @retry only retries on exceptions, but the except
        clause below swallows them and returns None — so the retry is
        effectively inert.  Left as-is since callers depend on the None
        return rather than an exception.
        """
        try:
            response = self.session.get(url, timeout=2)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
        return None


def extract_main_content(html):
    """Pull heading/paragraph/table text out of an HTML document.

    Returns "" for falsy input.  Uses a join instead of the original
    quadratic `+=` string accumulation.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, 'lxml')
    elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table'])
    return "".join(el.get_text(separator=" ", strip=True) + "\n" for el in elements)


def process_content(data_format, url, query):
    """Fetch one URL, extract its text, and rephrase it for the query.

    Returns (rephrased_text, url); rephrased_text is "" when fetching or
    extraction yields nothing.
    """
    scraper = Scraper()
    html_content = scraper.fetch_content(url)
    if html_content:
        content = extract_main_content(html_content)
        if content:
            rephrased_content = rephrase_content(
                data_format=data_format,
                content=limit_tokens(remove_stopwords(content), token_limit=1000),
                query=query,
            )
            return rephrased_content, url
    return "", url


def fetch_and_extract_content(data_format, urls, query):
    """Process all urls concurrently; returns a list of (text, url) pairs.

    BUG FIX: the original passed max_workers=len(urls) unconditionally,
    which raises ValueError for an empty list — now returns [] early.
    """
    if not urls:
        return []
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = [
            executor.submit(process_content, data_format, url, query)
            for url in urls
        ]
        return [future.result() for future in as_completed(futures)]


def search_brave(query, num_results=5):
    """Return up to num_results result URLs (as strings) from Brave search."""
    brave = Brave(BRAVE_API_KEY)
    search_results = brave.search(q=query, count=num_results)
    return [str(url) for url in search_results.urls]