Asaad Almutareb committed on
Commit
edc0787
1 Parent(s): 7b1a83a

moved utils.py to a utils folder

innovation_pathfinder_ai/utils/utils.py ADDED
@@ -0,0 +1,171 @@
+ import hashlib
+ import datetime
+
+ from innovation_pathfinder_ai.utils import logger
+
+ logger = logger.get_console_logger("utils")
+
+ def create_wikipedia_urls_from_text(text):
+     """
+     Extracts page titles from a given text and constructs Wikipedia URLs for each title.
+
+     Args:
+     - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.
+
+     Returns:
+     - list: A list of Wikipedia URLs constructed from the extracted titles.
+     """
+     # Split the text into sections based on the "Page:" prefix
+     sections = text.split("Page: ")
+     # Remove the first item if it's empty (in case the text starts with "Page:")
+     if sections[0].strip() == "":
+         sections = sections[1:]
+
+     urls = []  # Initialize an empty list to store the URLs
+     for section in sections:
+         # Extract the title, which is the string up to the first newline
+         title = section.split("\n", 1)[0]
+         # Replace spaces with underscores for the URL
+         url_title = title.replace(" ", "_")
+         # Construct the URL and add it to the list
+         url = f"https://en.wikipedia.org/wiki/{url_title}"
+         urls.append(url)
+     # Log the collected URLs via the module logger instead of printing to stdout
+     logger.info(urls)
+
+     return urls
+
+ def extract_urls(data_list):
+     """
+     Extracts URLs from a list of formatted strings.
+
+     Parameters:
+     - data_list (list): A list of strings, each containing 'Title:', 'link:', and 'summary:' segments.
+
+     Returns:
+     - list: The last three URLs extracted from the strings.
+     """
+     urls = []
+     logger.info(data_list)
+     for item in data_list:
+         try:
+             # Find the start and end indices of the URL (search case-insensitively)
+             lower_case = item.lower()
+             link_prefix = 'link: '
+             summary_prefix = ', summary:'
+             start_idx = lower_case.index(link_prefix) + len(link_prefix)
+             end_idx = lower_case.index(summary_prefix, start_idx)
+             # Extract the URL using the indices found
+             url = item[start_idx:end_idx]
+             urls.append(url)
+         except ValueError:
+             # Handles the case where 'link: ' or ', summary:' is not found in the string
+             logger.warning("Could not find a URL in the item: %s", item)
+     last_sources = urls[-3:]
+     return last_sources
+
+ def format_wiki_summaries(input_text):
+     """
+     Parses a given text containing page titles and summaries, formats them into a list of strings,
+     and appends Wikipedia URLs based on the titles.
+
+     Parameters:
+     - input_text (str): A string containing titles and summaries separated by specific markers.
+
+     Returns:
+     - list: A list of formatted strings with titles, summaries, and Wikipedia URLs.
+     """
+     # Split the input text into individual records based on double newlines
+     records = input_text.split("\n\n")
+
+     formatted_records_with_urls = []
+     for record in records:
+         if "Page:" in record and "Summary:" in record:
+             title_line, summary_line = record.split("\n", 1)  # Split only on the first newline
+             title = title_line.replace("Page: ", "").strip()
+             summary = summary_line.replace("Summary: ", "").strip()
+             # Replace spaces with underscores for the URL and construct the Wikipedia URL
+             url_title = title.replace(" ", "_")
+             wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
+             # Append a formatted string with title, summary, and URL
+             formatted_record = "Title: {title}, Link: {wikipedia_url}, Summary: {summary}".format(
+                 title=title, summary=summary, wikipedia_url=wikipedia_url)
+             formatted_records_with_urls.append(formatted_record)
+         else:
+             logger.warning("Record format error, skipping record: %s", record)
+
+     return formatted_records_with_urls
+
+ def format_arxiv_documents(documents):
+     """
+     Formats a list of document objects into a list of strings.
+     Each document object is assumed to have a 'metadata' dictionary with 'Title' and 'Entry ID',
+     and a 'page_content' attribute for content.
+
+     Parameters:
+     - documents (list): A list of document objects.
+
+     Returns:
+     - list: A list of formatted strings with titles, links, and content snippets.
+     """
+     formatted_documents = [
+         "Title: {title}, Link: {link}, Summary: {snippet}".format(
+             title=doc.metadata['Title'],
+             link=doc.metadata['Entry ID'],
+             snippet=doc.page_content  # Adjust the snippet length as needed
+         )
+         for doc in documents
+     ]
+     return formatted_documents
+
+ def format_search_results(search_results):
+     """
+     Formats a list of dictionaries containing search results into a list of strings.
+     Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.
+
+     Parameters:
+     - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.
+
+     Returns:
+     - list: A list of formatted strings based on the search results.
+     """
+     formatted_results = [
+         "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
+         for result in search_results
+     ]
+     return formatted_results
+
+ def parse_list_to_dicts(items: list) -> list:
+     """
+     Parses a list of 'Title: ..., Link: ..., Summary: ...' strings into dictionaries
+     with 'url', 'title', 'hash_id', and 'summary' keys.
+     """
+     parsed_items = []
+     for item in items:
+         # Extract title, link, and summary from each string
+         title_start = item.find('Title: ') + len('Title: ')
+         link_start = item.find('Link: ') + len('Link: ')
+         summary_start = item.find('Summary: ') + len('Summary: ')
+
+         title_end = item.find(', Link: ')
+         link_end = item.find(', Summary: ')
+         summary_end = len(item)
+
+         title = item[title_start:title_end]
+         link = item[link_start:link_end]
+         summary = item[summary_start:summary_end]
+
+         # Use the hash_text function for the hash_id
+         hash_id = hash_text(link)
+
+         # Construct the dictionary for each item
+         parsed_item = {
+             "url": link,
+             "title": title,
+             "hash_id": hash_id,
+             "summary": summary
+         }
+         parsed_items.append(parsed_item)
+     return parsed_items
+
+ def hash_text(text: str) -> str:
+     # MD5 is used here only as a stable content fingerprint, not for security
+     return hashlib.md5(text.encode()).hexdigest()
+
+
+ def convert_timestamp_to_datetime(timestamp: str) -> str:
+     # Convert a Unix timestamp string to a human-readable local datetime string
+     return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
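
For reference, a minimal sketch of how these helpers chain together. The sample data below is a hypothetical illustration, not part of the commit; the import path follows the file's new location in the utils folder.

    from innovation_pathfinder_ai.utils.utils import (
        format_search_results,
        parse_list_to_dicts,
    )

    # Hypothetical search results in the shape format_search_results expects
    results = [
        {"title": "Pathfinding", "link": "https://example.org/a", "snippet": "Graph search."},
        {"title": "A* search", "link": "https://example.org/b", "snippet": "Heuristic search."},
    ]

    formatted = format_search_results(results)
    # -> ["Title: Pathfinding, Link: https://example.org/a, Summary: Graph search.", ...]

    parsed = parse_list_to_dicts(formatted)
    # -> [{"url": "https://example.org/a", "title": "Pathfinding",
    #      "hash_id": "<md5 of the url>", "summary": "Graph search."}, ...]

Note that the round trip relies on the marker-based string format: titles or snippets that themselves contain ", Link: " or ", Summary: " would confuse parse_list_to_dicts's index-based extraction.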