File size: 3,945 Bytes
12cca3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from langchain.chat_models import ChatOpenAI

from langchain.schema import (
    HumanMessage,
    SystemMessage
)
import tiktoken
import re


def num_tokens_from_string(string: str, encoder) -> int:
    """Return the number of tokens *encoder* produces for *string*.

    *encoder* is any object exposing an ``encode(str) -> sequence`` method
    (e.g. a tiktoken encoding).
    """
    return len(encoder.encode(string))


def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with GPT using the retrieved articles in *information*.

    Parameters
    ----------
    information : list of 5-tuples ``(score, contents, uuid, title, domain)``
        Ranked retrieval results; assumed ordered best-first — TODO confirm
        against the caller.
    question : str
        The user's question.

    Returns
    -------
    tuple
        ``(response_text, links, titles, domains)``; the three lists are
        empty when no answer was found.
    """
    prompt = "The following pieces of information includes relevant articles. \nUse the following sentences to answer question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer. "
    prompt += "Please state the number of the article used to answer the question after your response\n"
    prompt += "\n----------------\n"
    separator = "<<<<>>>>"

    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count = num_tokens_from_string(prompt, encoder)

    # Unpack the result tuples once instead of re-scanning `information`
    # with a separate comprehension per field.
    articles = [contents for _, contents, _, _, _ in information]
    uuids = [uuid for _, _, uuid, _, _ in information]
    titles = [title for _, _, _, title, _ in information]
    domains = [domain for _, _, _, _, domain in information]

    content = ""
    for i, article in enumerate(articles):
        # BUG FIX: the original appended the article text twice
        # ("Article N: <text><sep><text><sep>"), doubling token usage
        # and skewing the token budget below.
        addition = "Article " + str(i + 1) + ": " + article + separator
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:  # keep headroom inside the model's context window
            print(i)
            break
        content += addition

    prompt += content
    llm = ChatOpenAI(temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm(message)
    print(response.content)
    print("response length: ", len(response.content))

    answer_found_prompt = "Please check if the following response found the answer. If yes, return 1 and if no, return 0. \n"
    message = [
        SystemMessage(content=answer_found_prompt),
        HumanMessage(content=response.content)
    ]
    # BUG FIX: the original invoked the LLM twice here, so the printed
    # verdict could differ from the one actually branched on (and cost
    # doubled). Call once and reuse.
    verdict = llm(message).content
    print(verdict)
    if verdict == "0":
        return "I could not find the answer.", [], [], []

    # Pull cited article numbers out of the reply, e.g. "(Article 3)".
    lowercase_response = re.sub(r'[()]', '', response.content.lower())
    lowercase_split = lowercase_response.split()
    used_article_num = []
    for i, word in enumerate(lowercase_split):
        # BUG FIX: guard i + 1 so "article" as the final word no longer
        # raises IndexError.
        if word == "article" and i + 1 < len(lowercase_split):
            # Keep only the digits ("3.", "3," -> "3").
            next_word = ''.join(c for c in lowercase_split[i + 1] if c.isdigit())
            print("Article number: ", next_word)
            # BUG FIX: skip digit-free words — the original appended ""
            # and later crashed on int(""). Also de-duplicate.
            if next_word and next_word not in used_article_num:
                used_article_num.append(next_word)

    print("Used article num: ", used_article_num)
    if not used_article_num:
        print("I could not find the answer. Reached")
        return "I could not find the answer.", [], [], []

    # Convert 1-based citation numbers to 0-based indices, discarding any
    # out-of-range numbers the model may have hallucinated (previously an
    # IndexError).
    used_indices = [int(num) - 1 for num in used_article_num]
    used_indices = [idx for idx in used_indices if 0 <= idx < len(uuids)]

    links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
    links = [links[idx] for idx in used_indices]
    titles = [titles[idx] for idx in used_indices]
    domains = [domains[idx] for idx in used_indices]

    # Strip citation markers such as "(Article 3)" from the user-facing text.
    response_without_source = re.sub(r"\(Article.*\)", "", response.content)

    return response_without_source, links, titles, domains