Upload 2 files
Browse files — commit message: "date and logging"
- app.py +33 -48
- feed_to_llm_v2.py +85 -121
app.py
CHANGED
@@ -1,48 +1,33 @@
|
|
1 |
-
|
2 |
-
import
|
3 |
-
|
4 |
-
import
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
)
|
35 |
-
|
36 |
-
return final_response
|
37 |
-
|
38 |
-
# Initialize and launch Gradio interface
|
39 |
-
gr.ChatInterface(
|
40 |
-
predict,
|
41 |
-
examples=[
|
42 |
-
"How many Americans Smoke?",
|
43 |
-
"What are some measures taken by the Indian Government to reduce the smoking population?",
|
44 |
-
"Does smoking negatively affect my health?"
|
45 |
-
],
|
46 |
-
title="Tobacco Information Assistant",
|
47 |
-
description="Ask questions about tobacco-related topics and get answers with reliable sources."
|
48 |
-
).launch()
|
|
|
1 |
+
|
2 |
+
import openai
|
3 |
+
import gradio as gr
|
4 |
+
from full_chain import get_response
|
5 |
+
import os
|
6 |
+
|
7 |
+
# Read the OpenAI API key from the environment; never hard-code credentials.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level client shared by the app.
# NOTE(review): `client` is not used in this file's visible code — presumably
# consumed elsewhere (e.g. full_chain); confirm before removing.
client = openai.OpenAI(api_key=api_key)
|
9 |
+
|
10 |
+
|
11 |
+
def create_hyperlink(url, title, domain):
    """Render one source as an HTML anchor, with its domain in parentheses."""
    return f"<a href='{url}'>{title}</a> ({domain})"
|
13 |
+
|
14 |
+
|
15 |
+
def predict(message, history):
    """Gradio chat handler: answer *message* and append linked sources.

    Parameters:
        message: the user's question (str).
        history: prior chat turns supplied by gr.ChatInterface (unused here).

    Returns:
        The responder text followed by one HTML hyperlink per cited source,
        newline-separated.
    """
    responder, links, titles, domains = get_response(message, rerank_type="crossencoder")
    # Pair each link with its title/domain instead of mutating `links` in place
    # by index; zip also stops cleanly if the lists ever disagree in length.
    sources = [
        create_hyperlink(link, title, domain)
        for link, title, domain in zip(links, titles, domains)
    ]
    return responder + "\n" + "\n".join(sources)
|
25 |
+
|
26 |
+
|
27 |
+
# Initialize and launch the Gradio chat UI.
# NOTE(review): this runs at import time; wrap in `if __name__ == "__main__":`
# if this module is ever imported by other code.
gr.ChatInterface(
    predict,
    examples=[
        "How many Americans Smoke?",
        "What are some measures taken by the Indian Government to reduce the smoking population?",
        "Does smoking negatively affect my health?"
    ],
    # Restored: the title/description shown to users were dropped in this
    # revision although nothing else about the UI changed.
    title="Tobacco Information Assistant",
    description="Ask questions about tobacco-related topics and get answers with reliable sources."
).launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
feed_to_llm_v2.py
CHANGED
@@ -1,121 +1,85 @@
|
|
1 |
-
from langchain_openai import
|
2 |
-
|
3 |
-
from langchain.schema import (
|
4 |
-
HumanMessage,
|
5 |
-
SystemMessage
|
6 |
-
)
|
7 |
-
import tiktoken
|
8 |
-
import re
|
9 |
-
|
10 |
-
from get_articles import save_solr_articles_full
|
11 |
-
from rerank import crossencoder_rerank_answer
|
12 |
-
|
13 |
-
|
14 |
-
def num_tokens_from_string(string: str, encoder) -> int:
|
15 |
-
num_tokens = len(encoder.encode(string))
|
16 |
-
return num_tokens
|
17 |
-
|
18 |
-
|
19 |
-
def feed_articles_to_gpt_with_links(information, question):
|
20 |
-
prompt = """
|
21 |
-
You are a Question Answering
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
token_count
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
citations.append(citation)
|
87 |
-
|
88 |
-
# Replace all article references with citation numbers
|
89 |
-
modified_response = response_content
|
90 |
-
for original, citation_num in citation_map.items():
|
91 |
-
# Replace both inline and parenthetical references
|
92 |
-
modified_response = modified_response.replace(f"({original})", citation_num)
|
93 |
-
modified_response = modified_response.replace(original, citation_num)
|
94 |
-
|
95 |
-
# Format final response with citations
|
96 |
-
response_with_citations = (
|
97 |
-
f"{modified_response}\n\n"
|
98 |
-
f"References:\n"
|
99 |
-
f"{chr(10).join(citations)}"
|
100 |
-
)
|
101 |
-
|
102 |
-
# Prepare links only for cited articles
|
103 |
-
cited_links = []
|
104 |
-
cited_titles = []
|
105 |
-
cited_domains = []
|
106 |
-
for article_num in used_article_nums:
|
107 |
-
uuid = uuids[article_num]
|
108 |
-
link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
|
109 |
-
cited_links.append(link)
|
110 |
-
cited_titles.append(titles_list[article_num])
|
111 |
-
cited_domains.append(domains_list[article_num])
|
112 |
-
|
113 |
-
return response_with_citations, cited_links, cited_titles, cited_domains
|
114 |
-
|
115 |
-
if __name__ == "__main__":
|
116 |
-
question = "How is United States fighting against tobacco addiction?"
|
117 |
-
rerank_type = "crossencoder"
|
118 |
-
llm_type = "chat"
|
119 |
-
csv_path = save_solr_articles_full(question, keyword_type="rake")
|
120 |
-
reranked_out = crossencoder_rerank_answer(csv_path, question)
|
121 |
-
feed_articles_to_gpt_with_links(reranked_out, question)
|
|
|
1 |
+
from langchain_openai import OpenAI
|
2 |
+
|
3 |
+
from langchain.schema import (
|
4 |
+
HumanMessage,
|
5 |
+
SystemMessage
|
6 |
+
)
|
7 |
+
import tiktoken
|
8 |
+
import re
|
9 |
+
|
10 |
+
from get_articles import save_solr_articles_full
|
11 |
+
from rerank import crossencoder_rerank_answer
|
12 |
+
|
13 |
+
|
14 |
+
def num_tokens_from_string(string: str, encoder) -> int:
    """Return how many tokens *encoder* produces when encoding *string*."""
    return len(encoder.encode(string))
|
17 |
+
|
18 |
+
|
19 |
+
def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with the LLM, grounded in reranked article snippets.

    Parameters:
        information: iterable of (score, contents, uuid, title, domain) tuples,
            ordered best-first (output of crossencoder_rerank_answer).
        question: the user's question (str).

    Returns:
        (answer, links, titles, domains) where answer has the trailing
        "(Article N)" citation stripped and the three lists cover only the
        articles the model actually cited.
    """
    prompt = """
    You are a Question Answering machine specialized in providing information on tobacco-related queries. You have access to a curated list of articles that span various aspects of tobacco use, health effects, legislation, and quitting resources. When responding to questions, follow these guidelines:

    1. Use information from the articles to formulate your answers. Indicate the article number you're referencing at the end of your response.
    2. If the question's answer is not covered by your articles, clearly state that you do not know the answer. Do not attempt to infer or make up information.
    3. Avoid using time-relative terms like 'last year,' 'recently,' etc., as the articles' publication dates and the current date may not align. Instead, use absolute terms (e.g., 'In 2022,' 'As of the article's 2020 publication,').
    4. Aim for concise, informative responses that directly address the question asked.

    Remember, your goal is to provide accurate, helpful information on tobacco-related topics, aiding in education and informed decision-making.
    """
    end_prompt = "\n----------------\n"
    prompt += end_prompt
    seperator = "<<<<>>>>"

    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count = num_tokens_from_string(prompt, encoder)

    # Unpack the tuples with distinct names (the original comprehensions
    # shadowed their own loop variables, which obscured intent).
    articles = [contents for _, contents, _, _, _ in information]
    uuids = [uuid for _, _, uuid, _, _ in information]
    titles_list = [title for _, _, _, title, _ in information]
    domains_list = [domain for _, _, _, _, domain in information]

    # Append articles until the token budget is exhausted.
    content = ""
    for i, article in enumerate(articles):
        # BUG FIX: the article text was previously concatenated twice per
        # entry (built into `addition`, then appended again), doubling the
        # prompt size and wasting half the context budget.
        addition = "Article " + str(i + 1) + ": " + article + seperator
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            break
        content += addition

    prompt += content
    llm = OpenAI(model_name="gpt-4o-mini", temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm.invoke(message)

    # The model is instructed to end with a parenthetical citation such as
    # "(Article 2)". Guard against a response with no parentheses at all,
    # which previously raised IndexError on `[-1]`.
    parenthesized = re.findall(r'\((.*?)\)', response)
    source = parenthesized[-1] if parenthesized else ""

    # Extract the cited article numbers, keeping only indexes that refer to
    # an article we actually sent (the model may hallucinate numbers).
    nums = re.findall(r'\d+', source)
    used_article_num = [int(n) - 1 for n in nums if 0 < int(n) <= len(uuids)]

    links = [
        f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuids[i]}/"
        for i in used_article_num
    ]
    titles = [titles_list[i] for i in used_article_num]
    domains = [domains_list[i] for i in used_article_num]

    response_without_source = re.sub(r"\(Article.*\)", "", response)
    return response_without_source, links, titles, domains
|
78 |
+
|
79 |
+
if __name__ == "__main__":
    # Manual smoke test: fetch articles for a sample question, rerank them,
    # and feed the top results to the LLM. Requires network access and the
    # project's Solr/reranker setup.
    question = "How is United States fighting against tobacco addiction?"
    # NOTE(review): rerank_type and llm_type are assigned but never used in
    # this script — presumably leftovers from an earlier CLI; confirm and remove.
    rerank_type = "crossencoder"
    llm_type = "chat"
    csv_path = save_solr_articles_full(question, keyword_type="rake")
    reranked_out = crossencoder_rerank_answer(csv_path, question)
    feed_articles_to_gpt_with_links(reranked_out, question)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|