import hashlib
import datetime
import os
import uuid

from app.utils import logger

logger = logger.get_console_logger("utils")


def create_wikipedia_urls_from_text(text):
    """
    Extracts page titles from a given text and constructs Wikipedia URLs for each title.

    Args:
    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.

    Returns:
    - list: A list of Wikipedia URLs constructed from the extracted titles.
    """
    # Split the text into sections based on the "Page: " prefix
    sections = text.split("Page: ")
    # Remove the first item if it's empty (in case the text starts with "Page:")
    if sections[0].strip() == "":
        sections = sections[1:]

    urls = []  # Initialize an empty list to store the URLs
    for section in sections:
        # Extract the title, which is the string up to the first newline
        title = section.split("\n", 1)[0]
        # Replace spaces with underscores for the URL
        url_title = title.replace(" ", "_")
        # Construct the URL and add it to the list
        url = f"https://en.wikipedia.org/wiki/{url_title}"
        urls.append(url)
    return urls


def extract_urls(data_list):
    """
    Extracts URLs from a list of formatted strings.

    Parameters:
    - data_list (list): A list of strings, each containing 'Title:', 'Link:', and 'Summary:' markers.

    Returns:
    - list: The URLs extracted from the last three items.
    """
    urls = []
    for item in data_list:
        try:
            # Search case-insensitively, but slice the original string so the
            # extracted URL keeps its original casing
            lower_case = item.lower()
            link_prefix = 'link: '
            summary_prefix = ', summary:'
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
            # Extract the URL using the indices found
            url = item[start_idx:end_idx]
            urls.append(url)
        except ValueError:
            # 'link: ' or ', summary:' was not found in the string
            logger.warning("Could not find a URL in the item: %s", item)
    last_sources = urls[-3:]
    return last_sources


def format_wiki_summaries(input_text):
    """
    Parses a given text containing page titles and summaries, formats them into
    a list of strings, and appends Wikipedia URLs based on the titles.

    Parameters:
    - input_text (str): A string containing titles and summaries separated by specific markers.

    Returns:
    - list: A list of formatted strings with titles, summaries, and Wikipedia URLs.
    """
    # Split the input text into individual records based on double newlines
    records = input_text.split("\n\n")
    formatted_records_with_urls = []
    for record in records:
        if "Page:" in record and "Summary:" in record:
            # Split only on the first newline: title line first, summary after
            title_line, summary_line = record.split("\n", 1)
            title = title_line.replace("Page: ", "").strip()
            summary = summary_line.replace("Summary: ", "").strip()
            # Replace spaces with underscores and construct the Wikipedia URL
            url_title = title.replace(" ", "_")
            wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
            # Append a formatted string with title, summary, and URL
            formatted_record = "Title: {title}, Link: {wikipedia_url}, Summary: {summary}".format(
                title=title, summary=summary, wikipedia_url=wikipedia_url)
            formatted_records_with_urls.append(formatted_record)
        else:
            logger.warning("Record format error, skipping record: %s", record)
    return formatted_records_with_urls
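
# Example of the round trip the helpers above expect (the input below is made
# up for illustration; it only mimics the "Page:/Summary:" shape):
#
#     text = ("Page: Alan Turing\nSummary: English mathematician.\n\n"
#             "Page: Ada Lovelace\nSummary: English mathematician and writer.")
#     create_wikipedia_urls_from_text(text)
#     # ['https://en.wikipedia.org/wiki/Alan_Turing',
#     #  'https://en.wikipedia.org/wiki/Ada_Lovelace']
#     format_wiki_summaries(text)[0]
#     # 'Title: Alan Turing, Link: https://en.wikipedia.org/wiki/Alan_Turing, Summary: English mathematician.'
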
def format_arxiv_documents(documents):
    """
    Formats a list of document objects into a list of strings.

    Each document object is assumed to have a 'metadata' dictionary with
    'Title' and 'Entry ID' keys, and a 'page_content' attribute for content.

    Parameters:
    - documents (list): A list of document objects.

    Returns:
    - list: A list of formatted strings with titles, links, and content snippets.
    """
    formatted_documents = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(
            title=doc.metadata['Title'],
            link=doc.metadata['Entry ID'],
            snippet=doc.page_content  # Adjust the snippet length here if needed
        )
        for doc in documents
    ]
    return formatted_documents


def format_search_results(search_results):
    """
    Formats a list of dictionaries containing search results into a list of strings.

    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.

    Parameters:
    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.

    Returns:
    - list: A list of formatted strings based on the search results.
    """
    formatted_results = [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**i)
        for i in search_results
    ]
    return formatted_results


def parse_list_to_dicts(items: list) -> list:
    """
    Parses strings of the form 'Title: ..., Link: ..., Summary: ...' into
    dictionaries with 'url', 'title', 'hash_id', and 'summary' keys.
    """
    parsed_items = []
    for item in items:
        # Extract title, link, and summary from each string
        title_start = item.find('Title: ') + len('Title: ')
        link_start = item.find('Link: ') + len('Link: ')
        summary_start = item.find('Summary: ') + len('Summary: ')
        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        summary_end = len(item)
        title = item[title_start:title_end]
        link = item[link_start:link_end]
        summary = item[summary_start:summary_end]
        # Use the hash_text function for the hash_id
        hash_id = hash_text(link)
        # Construct the dictionary for each item
        parsed_item = {
            "url": link,
            "title": title,
            "hash_id": hash_id,
            "summary": summary
        }
        parsed_items.append(parsed_item)
    return parsed_items


def hash_text(text: str) -> str:
    """Return the hex MD5 digest of the text (used as a stable ID, not for security)."""
    return hashlib.md5(text.encode()).hexdigest()


def convert_timestamp_to_datetime(timestamp: str) -> str:
    """Convert a Unix timestamp string to a 'YYYY-MM-DD HH:MM:SS' datetime string."""
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")


def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder if it doesn't already exist.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        logger.info(f"Folder '{folder_path}' created.")
    else:
        logger.info(f"Folder '{folder_path}' already exists.")


def generate_uuid() -> str:
    """
    Generate a UUID (Universally Unique Identifier) and return it as a string.

    Returns:
    str: A UUID string.
    """
    return str(uuid.uuid4())
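

# A minimal smoke test for the text helpers above. The sample input is made up
# for illustration only; it mimics the "Page:/Summary:" shape these functions
# expect and is not a real API response.
if __name__ == "__main__":
    sample = (
        "Page: Alan Turing\nSummary: English mathematician.\n\n"
        "Page: Ada Lovelace\nSummary: English mathematician and writer."
    )
    formatted = format_wiki_summaries(sample)
    print(formatted)
    # Round-trip the formatted strings back into dictionaries
    print(parse_list_to_dicts(formatted))
    print(create_wikipedia_urls_from_text(sample))
    print(convert_timestamp_to_datetime("0"))  # Unix epoch, rendered in local time
    print(generate_uuid())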