import hashlib
import datetime
import os
import uuid

from app.utils import logger

logger = logger.get_console_logger("utils")


def create_wikipedia_urls_from_text(text):
    """
    Extracts page titles from a given text and constructs Wikipedia URLs for each title.

    Args:
    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.

    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Split the text into sections based on the "Page: " prefix
    sections = text.split("Page: ")
    # Remove the first item if it's empty (in case the text starts with "Page:")
    if sections[0].strip() == "":
        sections = sections[1:]
    urls = []  # Initialize an empty list to store the URLs
    for section in sections:
        # Extract the title, which is the string up to the first newline
        title = section.split("\n", 1)[0]
        # Replace spaces with underscores for the URL
        url_title = title.replace(" ", "_")
        # Construct the URL and add it to the list
        url = f"https://en.wikipedia.org/wiki/{url_title}"
        urls.append(url)
    return urls
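
# Illustrative usage (comment only, not executed); the input mimics the
# "Page: <title>" sections a Wikipedia tool typically returns:
#   >>> create_wikipedia_urls_from_text("Page: Alan Turing\nSummary: ...")
#   ['https://en.wikipedia.org/wiki/Alan_Turing']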


def extract_urls(data_list):
    """
    Extracts URLs from a list of formatted strings.

    Parameters:
    - data_list (list): A list of strings, each containing 'Title:', 'Link:', and 'Summary:' fields.

    Returns:
    - list: The last three URLs extracted from the strings.
    """
    urls = []
    logger.debug(f"Extracting URLs from: {data_list}")
    for item in data_list:
        try:
            # Find the start and end indices of the URL (case-insensitive match)
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            # Extract the URL from the original string using the indices found
            url = item[start_idx:end_idx]
            urls.append(url)
        except ValueError:
            # Handles the case where 'link: ' or ', summary:' is not found in the string
            logger.warning(f"Could not find a URL in the item: {item}")
    # Only the three most recent sources are returned
    last_sources = urls[-3:]
    return last_sources
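
# Illustrative usage (comment only); note that only the last three URLs found
# are returned:
#   >>> extract_urls(["Title: Alan Turing, Link: https://en.wikipedia.org/wiki/Alan_Turing, Summary: ..."])
#   ['https://en.wikipedia.org/wiki/Alan_Turing']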


def format_wiki_summaries(input_text):
    """
    Parses a given text containing page titles and summaries, formats them into a list of strings,
    and appends Wikipedia URLs based on the titles.

    Parameters:
    - input_text (str): A string containing titles and summaries separated by specific markers.

    Returns:
    - list: A list of formatted strings with titles, summaries, and Wikipedia URLs.
    """
    # Split the input text into individual records based on double newlines
    records = input_text.split("\n\n")
    formatted_records_with_urls = []
    for record in records:
        if "Page:" in record and "Summary:" in record:
            title_line, summary_line = record.split("\n", 1)  # Split only on the first newline
            title = title_line.replace("Page: ", "").strip()
            summary = summary_line.replace("Summary: ", "").strip()
            # Replace spaces with underscores and construct the Wikipedia URL
            url_title = title.replace(" ", "_")
            wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
            # Append a formatted string with title, summary, and URL
            formatted_record = "Title: {title}, Link: {wikipedia_url}, Summary: {summary}".format(
                title=title, summary=summary, wikipedia_url=wikipedia_url)
            formatted_records_with_urls.append(formatted_record)
        else:
            logger.warning(f"Record format error, skipping record: {record}")
    return formatted_records_with_urls
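
# Illustrative usage (comment only), assuming records are separated by blank lines:
#   >>> format_wiki_summaries("Page: Alan Turing\nSummary: British mathematician.")
#   ['Title: Alan Turing, Link: https://en.wikipedia.org/wiki/Alan_Turing, Summary: British mathematician.']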


def format_arxiv_documents(documents):
    """
    Formats a list of document objects into a list of strings.
    Each document object is assumed to have a 'metadata' dictionary with 'Title' and 'Entry ID',
    and a 'page_content' attribute for content.

    Parameters:
    - documents (list): A list of document objects.

    Returns:
    - list: A list of formatted strings with titles, links, and content snippets.
    """
    formatted_documents = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(
            title=doc.metadata['Title'],
            link=doc.metadata['Entry ID'],
            snippet=doc.page_content  # Adjust the snippet length as needed
        )
        for doc in documents
    ]
    return formatted_documents
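
# Illustrative usage (comment only) with a duck-typed stand-in for the document
# objects (e.g. LangChain Documents from an arXiv retriever); SimpleNamespace is
# just a lightweight substitute here:
#   >>> from types import SimpleNamespace
#   >>> doc = SimpleNamespace(
#   ...     metadata={'Title': 'Attention Is All You Need',
#   ...               'Entry ID': 'http://arxiv.org/abs/1706.03762'},
#   ...     page_content='The dominant sequence transduction models...')
#   >>> format_arxiv_documents([doc])
#   ['Title: Attention Is All You Need, Link: http://arxiv.org/abs/1706.03762, Summary: The dominant sequence transduction models...']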


def format_search_results(search_results):
    """
    Formats a list of dictionaries containing search results into a list of strings.
    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.

    Parameters:
    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.

    Returns:
    - list: A list of formatted strings based on the search results.
    """
    formatted_results = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
        for result in search_results
    ]
    return formatted_results
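
# Illustrative usage (comment only); each dict needs at least the keys
# 'title', 'link', and 'snippet':
#   >>> format_search_results([{'title': 'Example Domain', 'link': 'https://example.org', 'snippet': 'An example page.'}])
#   ['Title: Example Domain, Link: https://example.org, Summary: An example page.']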


def parse_list_to_dicts(items: list) -> list:
    """
    Parses formatted strings of the form 'Title: ..., Link: ..., Summary: ...'
    into dictionaries with 'url', 'title', 'hash_id', and 'summary' keys.

    Parameters:
    - items (list): A list of formatted strings.

    Returns:
    - list: A list of dictionaries, one per input string.
    """
    parsed_items = []
    for item in items:
        # Extract title, link, and summary from each string
        title_start = item.find('Title: ') + len('Title: ')
        link_start = item.find('Link: ') + len('Link: ')
        summary_start = item.find('Summary: ') + len('Summary: ')
        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        summary_end = len(item)
        title = item[title_start:title_end]
        link = item[link_start:link_end]
        summary = item[summary_start:summary_end]
        # Use the hash_text function to derive a stable ID from the link
        hash_id = hash_text(link)
        # Construct the dictionary for each item
        parsed_item = {
            "url": link,
            "title": title,
            "hash_id": hash_id,
            "summary": summary
        }
        parsed_items.append(parsed_item)
    return parsed_items
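
# Illustrative round trip with the formatters above (comment only); the
# 'hash_id' value is the MD5 digest of the link:
#   >>> parse_list_to_dicts(['Title: Example Domain, Link: https://example.org, Summary: An example page.'])
#   [{'url': 'https://example.org', 'title': 'Example Domain',
#     'hash_id': '<md5 of the link>', 'summary': 'An example page.'}]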


def hash_text(text: str) -> str:
    """Return the MD5 hex digest of the given text."""
    return hashlib.md5(text.encode()).hexdigest()


def convert_timestamp_to_datetime(timestamp: str) -> str:
    """Convert a Unix timestamp string to a 'YYYY-MM-DD HH:MM:SS' string in local time."""
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
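
# Illustrative usage (comment only); convert_timestamp_to_datetime uses the
# local timezone, so the second value below assumes a UTC machine:
#   >>> hash_text("hello")
#   '5d41402abc4b2a76b9719d911017c592'
#   >>> convert_timestamp_to_datetime("0")
#   '1970-01-01 00:00:00'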


def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder if it doesn't already exist.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        logger.info(f"Folder '{folder_path}' created.")
    else:
        logger.info(f"Folder '{folder_path}' already exists.")


def generate_uuid() -> str:
    """
    Generate a UUID (Universally Unique Identifier) and return it as a string.

    Returns:
    str: A UUID string.
    """
    return str(uuid.uuid4())
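

if __name__ == "__main__":
    # Minimal smoke test (illustrative; the sample data below is made up, and
    # running this standalone assumes the 'app' package is importable).
    sample_results = [
        {"title": "Example Domain", "link": "https://example.org", "snippet": "An example page."}
    ]
    formatted = format_search_results(sample_results)
    parsed = parse_list_to_dicts(formatted)
    logger.info(formatted)
    logger.info(parsed)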