Spaces:

mdredze1
/

tobacco-watcher-chat-with-citations

Running

App Files Files Community

tobacco-watcher-chat-with-citations / feed_to_llm.py

vtiyyal1

Upload 10 files

12cca3e verified 3 days ago

raw

history blame contribute delete

3.95 kB

	from langchain.chat_models import ChatOpenAI

	from langchain.schema import (
	HumanMessage,
	SystemMessage
	)
	import tiktoken
	import re


	def num_tokens_from_string(string: str, encoder) -> int:
	    """Count the tokens that ``encoder`` produces for ``string``.

	    ``encoder`` is any object whose ``encode(text)`` method returns a sized
	    sequence; the length of that sequence is the result.
	    """
	    return len(encoder.encode(string))


	def _cited_article_indices(response_text, article_count):
	    """Return zero-based indices of the articles cited in ``response_text``.

	    A citation is the word "article" (case-insensitive, parentheses ignored)
	    followed by a token that contains digits, e.g. "Article 2" or
	    "(article 3).". Numbers outside ``1..article_count`` are discarded,
	    duplicates are dropped, and first-seen order is preserved.
	    """
	    words = re.sub(r"[()]", "", response_text.lower()).split()
	    indices = []
	    # Pairing each word with its successor means a trailing "article" simply
	    # has no partner, instead of indexing past the end of the word list.
	    for word, following in zip(words, words[1:]):
	        if word != "article":
	            continue
	        digits = "".join(ch for ch in following if ch.isdecimal())
	        if not digits:
	            # "article" followed by a non-numeric word is not a citation.
	            continue
	        index = int(digits) - 1
	        if 0 <= index < article_count and index not in indices:
	            indices.append(index)
	    return indices


	def feed_articles_to_gpt_with_links(information, question):
	    """Answer ``question`` from the given articles and return the cited sources.

	    Args:
	        information: Iterable of 5-item records unpacked as
	            ``(score, contents, uuid, title, domain)``. ``contents`` is the
	            article text placed in the prompt; ``uuid`` is used to build the
	            article link.
	        question: The user question, sent as the human message.

	    Returns:
	        A 4-tuple ``(answer, links, titles, domains)``. ``answer`` is the model
	        response with "(Article ...)" citation markers removed; the three
	        lists describe the cited articles, in citation order. When the model
	        reports that no answer was found, or the response cites no usable
	        article number, ``("I could not find the answer.", [], [], [])`` is
	        returned.
	    """
	    prompt = (
	        "The following pieces of information includes relevant articles. \n"
	        "Use the following sentences to answer question. \n"
	        "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
	        "Please state the number of the article used to answer the question after your response\n"
	        "\n----------------\n"
	    )
	    separator = "<<<<>>>>"
	    max_prompt_tokens = 3500

	    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
	    token_count = num_tokens_from_string(prompt, encoder)

	    # Materialize once so one-shot iterables work and every later lookup sees
	    # the same records.
	    records = list(information)

	    # Each article is added exactly once; stop at the first article that
	    # would push the prompt over the token budget.
	    included = []
	    for number, record in enumerate(records, start=1):
	        addition = f"Article {number}: {record[1]}{separator}"
	        token_count += num_tokens_from_string(addition, encoder)
	        if token_count > max_prompt_tokens:
	            print(len(included))
	            break
	        included.append(addition)

	    prompt += "".join(included)
	    llm = ChatOpenAI(temperature=0.0)
	    response = llm([
	        SystemMessage(content=prompt),
	        HumanMessage(content=question),
	    ])
	    print(response.content)
	    print("response length: ", len(response.content))

	    answer_found_prompt = "Please check if the following response found the answer. If yes, return 1 and if no, return 0. \n"
	    # Ask once and reuse the verdict: a second call costs another request and
	    # is not guaranteed to return the same text as the one that was printed.
	    verdict = llm([
	        SystemMessage(content=answer_found_prompt),
	        HumanMessage(content=response.content),
	    ]).content
	    print(verdict)
	    if verdict.strip() == "0":
	        return "I could not find the answer.", [], [], []

	    # Only articles that actually made it into the prompt can be cited.
	    used_indices = _cited_article_indices(response.content, len(included))
	    print("Used article num: ", [index + 1 for index in used_indices])
	    if not used_indices:
	        print("I could not find the answer. Reached")
	        return "I could not find the answer.", [], [], []

	    links = [
	        f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{records[index][2]}/"
	        for index in used_indices
	    ]
	    titles = [records[index][3] for index in used_indices]
	    domains = [records[index][4] for index in used_indices]

	    # Strip each "(Article ...)" marker on its own; a greedy ".*" would also
	    # delete any answer text sitting between two markers on the same line.
	    response_without_source = re.sub(r"\(Article[^)\n]*\)", "", response.content)

	    return response_without_source, links, titles, domains