import datetime
import hashlib
import os
import uuid

from app.utils import logger

logger = logger.get_console_logger("utils")

def create_wikipedia_urls_from_text(text):
    """
    Extract page titles from a given text and construct a Wikipedia URL for each title.

    Args:
    - text (str): A string containing multiple sections, each starting with "Page: " followed by the title.

    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Split the text into sections based on the "Page: " prefix.
    sections = text.split("Page: ")
    # Drop the first item if it is empty (the text usually starts with "Page: ").
    if sections[0].strip() == "":
        sections = sections[1:]
    urls = []
    for section in sections:
        # The title is everything up to the first newline.
        title = section.split("\n", 1)[0]
        # Replace spaces with underscores to form a valid Wikipedia path segment.
        url_title = title.replace(" ", "_")
        urls.append(f"https://en.wikipedia.org/wiki/{url_title}")
    return urls
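
# Illustrative sketch only: the input below is a hypothetical two-page excerpt in the
# "Page: <title>\nSummary: <text>" shape this function expects.
# >>> create_wikipedia_urls_from_text(
# ...     "Page: Alan Turing\nSummary: English mathematician.\n\n"
# ...     "Page: Ada Lovelace\nSummary: English mathematician and writer."
# ... )
# ['https://en.wikipedia.org/wiki/Alan_Turing', 'https://en.wikipedia.org/wiki/Ada_Lovelace']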

def extract_urls(data_list):
    """
    Extract URLs from a list of formatted source strings.

    Parameters:
    - data_list (list): A list of strings, each expected to contain the markers
      "Link: " and ", Summary:" around a URL.

    Returns:
    - list: The URLs extracted from the last three items (at most three URLs).
    """
    urls = []
    for item in data_list:
        try:
            # Find the start and end indices of the URL; search case-insensitively,
            # but slice the original string so the URL keeps its casing.
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            urls.append(item[start_idx:end_idx])
        except ValueError:
            # Raised when 'link: ' or ', summary:' is not found in the string.
            logger.warning("Could not find a URL in the item: %s", item)
    # Only the three most recent sources are returned.
    return urls[-3:]
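
# Illustrative sketch only: extract_urls operates on the formatted strings produced
# by the format_* helpers below; the sample item here is hypothetical.
# >>> extract_urls(["Title: Example, Link: https://example.com/page, Summary: A sample."])
# ['https://example.com/page']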

def format_wiki_summaries(input_text):
    """
    Parse a text containing page titles and summaries, format each record into a string,
    and append a Wikipedia URL derived from the title.

    Parameters:
    - input_text (str): A string of records separated by blank lines, each containing
      "Page: " and "Summary: " markers.

    Returns:
    - list: A list of formatted strings with titles, links, and summaries.
    """
    # Split the input text into individual records on blank lines.
    records = input_text.split("\n\n")
    formatted_records_with_urls = []
    for record in records:
        # Require a newline as well, so the title/summary unpacking below cannot fail.
        if "Page:" in record and "Summary:" in record and "\n" in record:
            title_line, summary_line = record.split("\n", 1)  # Split only on the first newline.
            title = title_line.replace("Page: ", "").strip()
            summary = summary_line.replace("Summary: ", "").strip()
            # Replace spaces with underscores and construct the Wikipedia URL.
            url_title = title.replace(" ", "_")
            wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
            # Append a formatted string with title, link, and summary.
            formatted_records_with_urls.append(
                f"Title: {title}, Link: {wikipedia_url}, Summary: {summary}"
            )
        else:
            logger.warning("Record format error, skipping record: %s", record)
    return formatted_records_with_urls
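
# Illustrative sketch only, with a hypothetical single-record input:
# >>> format_wiki_summaries("Page: Alan Turing\nSummary: English mathematician.")
# ['Title: Alan Turing, Link: https://en.wikipedia.org/wiki/Alan_Turing, Summary: English mathematician.']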

def format_arxiv_documents(documents):
    """
    Format a list of document objects into a list of strings.

    Each document object is assumed to have a 'metadata' dictionary with 'Title' and
    'Entry ID' keys, and a 'page_content' attribute holding the content.

    Parameters:
    - documents (list): A list of document objects.

    Returns:
    - list: A list of formatted strings with titles, links, and content.
    """
    formatted_documents = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(
            title=doc.metadata['Title'],
            link=doc.metadata['Entry ID'],
            snippet=doc.page_content,  # The full page content is used; truncate here if needed.
        )
        for doc in documents
    ]
    return formatted_documents
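
# Illustrative sketch only: the document objects are assumed to look like LangChain-style
# arXiv documents, i.e. anything exposing .metadata['Title'], .metadata['Entry ID'], and
# .page_content. The SimpleNamespace below merely mimics that shape for demonstration.
# >>> from types import SimpleNamespace
# >>> doc = SimpleNamespace(
# ...     metadata={'Title': 'Attention Is All You Need', 'Entry ID': 'http://arxiv.org/abs/1706.03762'},
# ...     page_content='We propose the Transformer...')
# >>> format_arxiv_documents([doc])
# ['Title: Attention Is All You Need, Link: http://arxiv.org/abs/1706.03762, Summary: We propose the Transformer...']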

def format_search_results(search_results):
    """
    Format a list of search-result dictionaries into a list of strings.

    Parameters:
    - search_results (list): A list of dictionaries, each containing the keys
      'title', 'link', and 'snippet'.

    Returns:
    - list: A list of formatted strings based on the search results.
    """
    formatted_results = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
        for result in search_results
    ]
    return formatted_results
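
# Illustrative sketch only: the dictionary shape matches what a typical SERP-style
# web-search wrapper returns per organic result; the values here are hypothetical.
# >>> format_search_results([{'title': 'Example', 'link': 'https://example.com', 'snippet': 'A sample page.'}])
# ['Title: Example, Link: https://example.com, Summary: A sample page.']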

def parse_list_to_dicts(items: list) -> list:
    """
    Parse formatted source strings ("Title: ..., Link: ..., Summary: ...") back into
    dictionaries keyed by 'url', 'title', 'hash_id', and 'summary'.
    """
    parsed_items = []
    for item in items:
        # Locate the title, link, and summary segments within each string.
        title_start = item.find('Title: ') + len('Title: ')
        link_start = item.find('Link: ') + len('Link: ')
        summary_start = item.find('Summary: ') + len('Summary: ')
        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        title = item[title_start:title_end]
        link = item[link_start:link_end]
        summary = item[summary_start:]  # The summary runs to the end of the string.
        # Hash the link so each source gets a stable, deduplicatable ID.
        parsed_items.append({
            "url": link,
            "title": title,
            "hash_id": hash_text(link),
            "summary": summary,
        })
    return parsed_items
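
# Illustrative sketch only: parse_list_to_dicts inverts the formatting helpers above,
# so a formatted string round-trips into a dictionary (hash_id elided for readability).
# >>> parse_list_to_dicts(['Title: Example, Link: https://example.com, Summary: A sample page.'])
# [{'url': 'https://example.com', 'title': 'Example', 'hash_id': '...', 'summary': 'A sample page.'}]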

def hash_text(text: str) -> str:
    """Return the hex MD5 digest of the given text."""
    return hashlib.md5(text.encode()).hexdigest()
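
# Note: MD5 is used here only as a fast, stable content fingerprint for deduplication,
# not for security. For example, hash_text("https://example.com") yields the same
# 32-character hex digest on every run and every machine.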

def convert_timestamp_to_datetime(timestamp: str) -> str:
    """Convert a Unix timestamp string to a 'YYYY-MM-DD HH:MM:SS' string in local time."""
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
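
# Caveat worth knowing: fromtimestamp() applies the machine's local timezone, so the
# same timestamp can format differently across hosts. For example,
# convert_timestamp_to_datetime("0") returns "1970-01-01 00:00:00" only on a UTC host.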

def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder if it doesn't already exist.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        logger.info(f"Folder '{folder_path}' created.")
    else:
        logger.info(f"Folder '{folder_path}' already exists.")

def generate_uuid() -> str:
    """
    Generate a UUID (Universally Unique Identifier) and return it as a string.

    Returns:
    str: A UUID string.
    """
    return str(uuid.uuid4())
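
# Minimal smoke test (a sketch): runnable only where the app.utils logger package is
# importable; the sample text is hypothetical.
if __name__ == "__main__":
    sample = "Page: Alan Turing\nSummary: English mathematician."
    formatted = format_wiki_summaries(sample)
    print(formatted)
    print(parse_list_to_dicts(formatted))
    print(generate_uuid())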