pvanand committed on
Commit
1f1d19b
·
verified ·
1 Parent(s): f48a49c

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +36 -111
helper_functions_api.py CHANGED
@@ -70,20 +70,19 @@ from together import Together
70
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
71
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
72
 
73
- SysPromptJson = "You are now in the role of an expert AI who can extract structured information from user request. Both key and value pairs must be in double quotes. You must respond ONLY with a valid JSON file. Do not add any additional comments."
74
- SysPromptList = "You are now in the role of an expert AI who can extract structured information from user request. All elements must be in double quotes. You must respond ONLY with a valid python List. Do not add any additional comments."
75
  SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
76
 
77
  import tiktoken # Used to limit tokens
78
  encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # Instead of Llama3 using available option/ replace if found anything better
79
 
80
- def limit_tokens(input_string, token_limit=8000):
81
  """
82
  Limit tokens sent to the model
83
  """
84
  return encoding.decode(encoding.encode(input_string)[:token_limit])
85
 
86
- def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPrompt = SysPromptDefault, temperature=0.2):
87
  client = OpenAI(
88
  api_key=TOGETHER_API_KEY,
89
  base_url="https://together.hconeai.com/v1",
@@ -95,6 +94,7 @@ def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPromp
95
  model=model,
96
  messages=messages,
97
  temperature=temperature,
 
98
  )
99
  return response.choices[0].message.content
100
 
@@ -122,11 +122,27 @@ def remove_stopwords(text):
122
  filtered_text = [word for word in words if word.lower() not in stop_words]
123
  return ' '.join(filtered_text)
124
 
125
- def rephrase_content(content, query):
126
- return together_response(f"You are an information retriever and summarizer,ignore everything you know, return only the\
127
- factual information regarding the query: {{{query}}} into a maximum of {500} words. Output should be concise chunks of \
128
- paragraphs or tables or both, ignore links, using the scraped context:{{{content}}}")
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  class Scraper:
131
  def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
132
  self.session = requests.Session()
@@ -151,23 +167,31 @@ def extract_main_content(html):
151
  return plain_text
152
  return ""
153
 
154
- def process_content(url, query):
155
  scraper = Scraper()
156
  html_content = scraper.fetch_content(url)
157
  if html_content:
158
  content = extract_main_content(html_content)
159
  if content:
160
- rephrased_content = rephrase_content(limit_tokens(remove_stopwords(content)), query)
 
 
 
 
161
  return rephrased_content, url
162
  return "", url
163
 
164
- def fetch_and_extract_content(urls, query):
165
  with ThreadPoolExecutor(max_workers=len(urls)) as executor:
166
- future_to_url = {executor.submit(process_content, url, query): url for url in urls}
 
 
 
167
  all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
168
 
169
  return all_text_with_urls
170
 
 
171
  def search_brave(query, num_results=5):
172
 
173
  brave = Brave(BRAVE_API_KEY)
@@ -176,103 +200,4 @@ def search_brave(query, num_results=5):
176
 
177
  return [url.__str__() for url in search_results.urls]
178
 
179
- def generate_report_with_reference(full_data):
180
- """
181
- Generate HTML report with references and saves pdf report to "generated_pdf_report.pdf"
182
- """
183
- pdf = FPDF()
184
- with open("report_with_references_template.html") as f: # src/research-pro/app_v1.5_online/
185
- html_template = f.read()
186
-
187
- # Loop through each row in your dataset
188
- html_report = ''
189
- idx = 1
190
- for subtopic_data in full_data:
191
-
192
- md_report = md_to_html(subtopic_data['md_report'])
193
- # Convert the string representation of a list of tuples back to a list of tuples
194
- references = ast.literal_eval(subtopic_data['text_with_urls'])
195
-
196
- collapsible_blocks = []
197
- for ref_idx, reference in enumerate(references):
198
- ref_text = md_to_html(reference[0])
199
- ref_url = reference[1]
200
- urls_html = ''.join(f'<a href="{ref_url}"> {ref_url}</a>')
201
-
202
- collapsible_block = '''
203
- <details>
204
- <summary>Reference {}: {}</summary>
205
- <div>
206
- <p>{}</p>
207
- <ul>{}</ul>
208
- </div>
209
- </details>
210
- '''.format(ref_idx+1, urls_html, ref_text, urls_html)
211
-
212
- collapsible_blocks.append(collapsible_block)
213
-
214
- references_html = '\n'.join(collapsible_blocks)
215
-
216
- template = Template(html_template)
217
- html_page = template.render(md_report=md_report, references=references_html)
218
-
219
- pdf.add_page()
220
- pdf_report = f"<h1><strong>Report {idx}</strong></h1>"+md_report+f"<h1><strong>References for Report {idx}</strong></h1>"+references_html
221
-
222
- pdf.write_html(pdf_report.encode('ascii', 'ignore').decode('ascii')) # Filter non-asci characters
223
- html_report += html_page
224
- idx+=1
225
-
226
- pdf.output("generated_pdf_report.pdf")
227
- return html_report
228
-
229
- def write_dataframes_to_excel(dataframes_list, filename):
230
- """
231
- input: [df_list1, df_list2, ..]
232
- saves filename.xlsx
233
- """
234
- try:
235
- with pd.ExcelWriter(filename, engine="openpyxl") as writer:
236
- for idx, dataframes in enumerate(dataframes_list):
237
- startrow = 0
238
- for idx2, df in enumerate(dataframes):
239
- df.to_excel(writer, sheet_name=f"Sheet{idx+1}", startrow=startrow, index=False)
240
- startrow += len(df) + 2
241
- except:
242
- # Empty dataframe due to no tables found, file is not written
243
- pass
244
-
245
- def extract_tables_from_html(html_file):
246
- """
247
- input: html_file
248
- output: [df1,df2,df3,..]
249
- """
250
- # Initialize an empty list to store the dataframes
251
- dataframes = []
252
-
253
- # Open the HTML file and parse it with BeautifulSoup
254
- soup = BeautifulSoup(html_file, 'html.parser')
255
-
256
- # Find all the tables in the HTML file
257
- tables = soup.find_all('table')
258
-
259
- # Iterate through each table
260
- for table in tables:
261
- # Extract the table headers
262
- headers = [th.text for th in table.find_all('th')]
263
-
264
- # Extract the table data
265
- rows = table.find_all('tr')
266
- data = []
267
- for row in rows:
268
- row_data = [td.text for td in row.find_all('td')]
269
- data.append(row_data)
270
-
271
- # Create a dataframe from the headers and data
272
- df = pd.DataFrame(data, columns=headers)
273
-
274
- # Append the dataframe to the list of dataframes
275
- dataframes.append(df)
276
 
277
- # Return the list of dataframes
278
- return dataframes
 
70
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
71
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
72
 
73
+ SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
 
74
  SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
75
 
76
  import tiktoken # Used to limit tokens
77
  encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # Instead of Llama3 using available option/ replace if found anything better
78
 
79
def limit_tokens(input_string, token_limit=7500):
    """
    Truncate a string so that it holds at most ``token_limit`` tokens.

    The text is tokenized with the module-level ``encoding`` (a tiktoken
    encoding created above this function), the token list is cut to its
    first ``token_limit`` entries, and the kept tokens are decoded back
    into a string.
    """
    token_ids = encoding.encode(input_string)
    kept_ids = token_ids[:token_limit]
    return encoding.decode(kept_ids)
84
 
85
+ def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPrompt = SysPromptDefault, temperature=0.2, frequency_penalty =0.1, max_tokens= 2000):
86
  client = OpenAI(
87
  api_key=TOGETHER_API_KEY,
88
  base_url="https://together.hconeai.com/v1",
 
94
  model=model,
95
  messages=messages,
96
  temperature=temperature,
97
+ frequency_penalty = frequency_penalty
98
  )
99
  return response.choices[0].message.content
100
 
 
122
  filtered_text = [word for word in words if word.lower() not in stop_words]
123
  return ' '.join(filtered_text)
124
 
125
def rephrase_content(data_format, content, query):
    """
    Ask the LLM to reduce scraped page text to what is relevant to ``query``.

    ``data_format`` selects the prompt and how much context is sent:

    * ``"Structured data"``   - concise paragraphs and/or tables; the context
      is truncated with the default ``limit_tokens`` budget.
    * ``"Quantitative data"`` - numerical data structured into .md tables;
      the context is truncated to 1000 tokens.
    * anything else           - plain factual information; the context is
      truncated to 1000 tokens.

    Returns whatever ``together_response`` returns for the built prompt.
    """
    # The prompts are assembled from adjacent string literals rather than a
    # backslash continuation inside a single literal: a continuation inside
    # the quotes folds the next source line's leading indentation into the
    # prompt text that is sent to the model.
    if data_format == "Structured data":
        context = limit_tokens(content)
        prompt = (
            f"return only the factual information regarding the query: {{{query}}}. "
            "Output should be concise chunks of paragraphs or tables or both, "
            f"using the scraped context:{{{context}}}"
        )
    elif data_format == "Quantitative data":
        context = limit_tokens(content, token_limit=1000)
        prompt = (
            "return only the numerical or quantitative data regarding the query: "
            f"{{{query}}} structured into .md tables, "
            f"using the scraped context:{{{context}}}"
        )
    else:
        context = limit_tokens(content, token_limit=1000)
        prompt = (
            f"return only the factual information regarding the query: {{{query}}} "
            f"using the scraped context:{{{context}}}"
        )

    # One call site, so the system prompt and output budget cannot drift
    # apart between the three formats.
    return together_response(
        prompt,
        SysPrompt=SysPromptData,
        max_tokens=500,
    )
146
  class Scraper:
147
  def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
148
  self.session = requests.Session()
 
167
  return plain_text
168
  return ""
169
 
170
def process_content(data_format, url, query):
    """
    Fetch one URL and return an LLM-condensed version of its main text.

    Returns a ``(text, url)`` tuple. ``text`` is the result of
    ``rephrase_content`` for the page, or ``""`` when either the fetch or
    the main-content extraction yields nothing truthy.
    """
    page_html = Scraper().fetch_content(url)
    if not page_html:
        return "", url

    main_text = extract_main_content(page_html)
    if not main_text:
        return "", url

    # Strip stopwords first, then cap the text at 1000 tokens before it is
    # handed to the rephrasing prompt.
    trimmed_text = limit_tokens(remove_stopwords(main_text), token_limit=1000)
    rephrased_content = rephrase_content(
        data_format=data_format,
        content=trimmed_text,
        query=query,
    )
    return rephrased_content, url
183
 
184
def fetch_and_extract_content(data_format, urls, query):
    """
    Run ``process_content`` for every URL concurrently.

    One worker thread is started per URL. Results are collected in
    completion order, not in the order of ``urls``; each item is whatever
    ``process_content`` returns for that URL. An exception raised inside a
    worker is re-raised here by ``future.result()``.

    Returns an empty list when ``urls`` is empty.
    """
    # ThreadPoolExecutor rejects max_workers=0 with ValueError, so an empty
    # URL list has to be handled before the pool is created.
    if not urls:
        return []

    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = [
            executor.submit(process_content, data_format, url, query)
            for url in urls
        ]
        all_text_with_urls = [future.result() for future in as_completed(futures)]

    return all_text_with_urls
193
 
194
+
195
  def search_brave(query, num_results=5):
196
 
197
  brave = Brave(BRAVE_API_KEY)
 
200
 
201
  return [url.__str__() for url in search_results.urls]
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203