diff --git a/App_Function_Libraries/Article_Summarization_Lib.py b/App_Function_Libraries/Article_Summarization_Lib.py index 73fe9541ad9fac7047921a81e3ffd222edab3c02..6197e8af25ac477d25b987945b54af423694a5b9 100644 --- a/App_Function_Libraries/Article_Summarization_Lib.py +++ b/App_Function_Libraries/Article_Summarization_Lib.py @@ -1,292 +1,221 @@ -# Article_Summarization_Lib.py -######################################### -# Article Summarization Library -# This library is used to handle summarization of articles. - -# -#### -# -#################### -# Function List -# -# 1. -# -#################### -# -# Import necessary libraries -import datetime -from datetime import datetime -import gradio as gr -import json -import os -import logging -import requests -# 3rd-Party Imports -from tqdm import tqdm - -from App_Function_Libraries.Utils import sanitize_filename -# Local Imports -from Article_Extractor_Lib import scrape_article -from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \ - summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm -from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \ - summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \ - summarize_with_mistral -from SQLite_DB import Database, create_tables, add_media_with_keywords -# -####################################################################################################################### -# Function Definitions -# - -def ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt): - try: - # Check if content is not empty or whitespace - if not content.strip(): - raise ValueError("Content is empty.") - - db = Database() - create_tables() - keyword_list = keywords.split(',') if keywords else ["default"] - keyword_str = ', '.join(keyword_list) - - # Set default values for missing fields - url = url or 'Unknown' - title = title or 'Unknown' - author = author or 'Unknown' - keywords = keywords or 'default' - summary = summary or 'No summary available' - ingestion_date = ingestion_date or datetime.datetime.now().strftime('%Y-%m-%d') - - # Log the values of all fields before calling add_media_with_keywords - logging.debug(f"URL: {url}") - logging.debug(f"Title: {title}") - logging.debug(f"Author: {author}") - logging.debug(f"Content: {content[:50]}... 
(length: {len(content)})") # Log first 50 characters of content - logging.debug(f"Keywords: {keywords}") - logging.debug(f"Summary: {summary}") - logging.debug(f"Ingestion Date: {ingestion_date}") - logging.debug(f"Custom Prompt: {custom_prompt}") - - # Check if any required field is empty and log the specific missing field - if not url: - logging.error("URL is missing.") - raise ValueError("URL is missing.") - if not title: - logging.error("Title is missing.") - raise ValueError("Title is missing.") - if not content: - logging.error("Content is missing.") - raise ValueError("Content is missing.") - if not keywords: - logging.error("Keywords are missing.") - raise ValueError("Keywords are missing.") - if not summary: - logging.error("Summary is missing.") - raise ValueError("Summary is missing.") - if not ingestion_date: - logging.error("Ingestion date is missing.") - raise ValueError("Ingestion date is missing.") - if not custom_prompt: - logging.error("Custom prompt is missing.") - raise ValueError("Custom prompt is missing.") - - # Add media with keywords to the database - result = add_media_with_keywords( - url=url, - title=title, - media_type='article', - content=content, - keywords=keyword_str or "article_default", - prompt=custom_prompt or None, - summary=summary or "No summary generated", - transcription_model=None, # or some default value if applicable - author=author or 'Unknown', - ingestion_date=ingestion_date - ) - return result - except Exception as e: - logging.error(f"Failed to ingest article to the database: {e}") - return str(e) - - -def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None): - urls = [url.strip() for url in urls.split('\n') if url.strip()] - custom_titles = custom_article_titles.split('\n') if custom_article_titles else [] - - results = [] - errors = [] - - # Create a progress bar - progress = gr.Progress() - - for i, url in tqdm(enumerate(urls), total=len(urls), desc="Processing URLs"): - custom_title = custom_titles[i] if i < len(custom_titles) else None - try: - result = scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_title, system_message) - results.append(f"Results for URL {i + 1}:\n{result}") - except Exception as e: - error_message = f"Error processing URL {i + 1} ({url}): {str(e)}" - errors.append(error_message) - results.append(f"Failed to process URL {i + 1}: {url}") - - # Update progress - progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs") - - # Combine results and errors - combined_output = "\n".join(results) - if errors: - combined_output += "\n\nErrors encountered:\n" + "\n".join(errors) - - return combined_output - - -def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None): - try: - # Step 1: Scrape the article - article_data = scrape_article(url) - print(f"Scraped Article Data: {article_data}") # Debugging statement - if not article_data: - return "Failed to scrape the article." 
- - # Use the custom title if provided, otherwise use the scraped title - title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled') - author = article_data.get('author', 'Unknown') - content = article_data.get('content', '') - ingestion_date = datetime.now().strftime('%Y-%m-%d') - - print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement - - # Custom system prompt for the article - system_message = system_message or "Act as a professional summarizer and summarize this article." - # Custom prompt for the article - article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article." - - # Step 2: Summarize the article - summary = None - if api_name: - logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}") - - # Sanitize filename for saving the JSON file - sanitized_title = sanitize_filename(title) - json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json") - - with open(json_file_path, 'w') as json_file: - json.dump([{'text': content}], json_file, indent=2) - - # FIXME - Swap out this if/else to use the dedicated function.... - try: - if api_name.lower() == 'openai': - # def summarize_with_openai(api_key, input_data, custom_prompt_arg) - summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "anthropic": - # def summarize_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5): - summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message) - elif api_name.lower() == "cohere": - # def summarize_with_cohere(api_key, input_data, model, custom_prompt_arg) - summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "groq": - logging.debug(f"MAIN: Trying to summarize with groq") - # def summarize_with_groq(api_key, input_data, model, custom_prompt_arg): - summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "openrouter": - logging.debug(f"MAIN: Trying to summarize with OpenRouter") - # def summarize_with_openrouter(api_key, input_data, custom_prompt_arg): - summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "deepseek": - logging.debug(f"MAIN: Trying to summarize with DeepSeek") - # def summarize_with_deepseek(api_key, input_data, custom_prompt_arg): - summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "mistral": - summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "llama.cpp": - logging.debug(f"MAIN: Trying to summarize with Llama.cpp") - # def summarize_with_llama(api_url, file_path, token, custom_prompt) - summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "kobold": - logging.debug(f"MAIN: Trying to summarize with Kobold.cpp") - # def summarize_with_kobold(input_data, kobold_api_token, custom_prompt_input, api_url): - summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message) - - elif api_name.lower() == "ooba": - # def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url): - summary = 
summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message) - - elif api_name.lower() == "tabbyapi": - # def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP): - summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "vllm": - logging.debug(f"MAIN: Trying to summarize with VLLM") - # def summarize_with_vllm(api_key, input_data, custom_prompt_input): - summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "local-llm": - logging.debug(f"MAIN: Trying to summarize with Local LLM") - summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "huggingface": - logging.debug(f"MAIN: Trying to summarize with huggingface") - # def summarize_with_huggingface(api_key, input_data, custom_prompt_arg): - summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message) - # Add additional API handlers here... - except requests.exceptions.ConnectionError as e: - logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}") - - if summary: - logging.info(f"Article_Summarizer: Summary generated using {api_name} API") - save_summary_to_file(summary, json_file_path) - else: - summary = "Summary not available" - logging.warning(f"Failed to generate summary using {api_name} API") - - else: - summary = "Article Summarization: No API provided for summarization." - - print(f"Summary: {summary}") # Debugging statement - - # Step 3: Ingest the article into the database - ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, - article_custom_prompt) - - return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}" - except Exception as e: - logging.error(f"Error processing URL {url}: {str(e)}") - return f"Failed to process URL {url}: {str(e)}" - - -def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None): - title = custom_article_title.strip() if custom_article_title else "Unstructured Text" - author = "Unknown" - ingestion_date = datetime.now().strftime('%Y-%m-%d') - - # Summarize the unstructured text - if api_name: - json_file_path = f"Results/{title.replace(' ', '_')}_segments.json" - with open(json_file_path, 'w') as json_file: - json.dump([{'text': text}], json_file, indent=2) - - if api_name.lower() == 'openai': - summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message) - # Add other APIs as needed - else: - summary = "Unsupported API." - else: - summary = "No API provided for summarization." - - # Ingest the unstructured text into the database - ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date, - custom_prompt) - return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}" - - - -# -# +# Article_Summarization_Lib.py +######################################### +# Article Summarization Library +# This library is used to handle summarization of articles. + +# +#### +# +#################### +# Function List +# +# 1. 
scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None)
+# 2. scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None)
+# 3. ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None)
+#
+####################
+#
+# Import necessary libraries
+from datetime import datetime
+import gradio as gr
+import json
+import os
+import logging
+import requests
+# 3rd-Party Imports
+from tqdm import tqdm
+
+from App_Function_Libraries.Utils import sanitize_filename
+# Local Imports
+from Article_Extractor_Lib import scrape_article
+from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
+    summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
+from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
+    summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
+    summarize_with_mistral
+from App_Function_Libraries.DB_Manager import ingest_article_to_db
+#
+#######################################################################################################################
+# Function Definitions
+#
+
+def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None):
+    urls = [url.strip() for url in urls.split('\n') if url.strip()]
+    custom_titles = custom_article_titles.split('\n') if custom_article_titles else []
+
+    results = []
+    errors = []
+
+    # Create a progress bar
+    progress = gr.Progress()
+
+    for i, url in tqdm(enumerate(urls), total=len(urls), desc="Processing URLs"):
+        custom_title = custom_titles[i] if i < len(custom_titles) else None
+        try:
+            result = scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_title, system_message)
+            results.append(f"Results for URL {i + 1}:\n{result}")
+        except Exception as e:
+            error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
+            errors.append(error_message)
+            results.append(f"Failed to process URL {i + 1}: {url}")
+
+        # Update progress
+        progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")
+
+    # Combine results and errors
+    combined_output = "\n".join(results)
+    if errors:
+        combined_output += "\n\nErrors encountered:\n" + "\n".join(errors)
+
+    return combined_output
+
+
+def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
+    try:
+        # Step 1: Scrape the article
+        article_data = scrape_article(url)
+        print(f"Scraped Article Data: {article_data}")  # Debugging statement
+        if not article_data:
+            return "Failed to scrape the article."
+
+        # Use the custom title if provided, otherwise use the scraped title
+        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
+        author = article_data.get('author', 'Unknown')
+        content = article_data.get('content', '')
+        ingestion_date = datetime.now().strftime('%Y-%m-%d')
+
+        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement
+
+        # Custom system prompt for the article
+        system_message = system_message or "Act as a professional summarizer and summarize this article."
+        # Custom prompt for the article
+        article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."
+ + # Step 2: Summarize the article + summary = None + if api_name: + logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}") + + # Sanitize filename for saving the JSON file + sanitized_title = sanitize_filename(title) + json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json") + + with open(json_file_path, 'w') as json_file: + json.dump([{'text': content}], json_file, indent=2) + + # FIXME - Swap out this if/else to use the dedicated function.... + try: + if api_name.lower() == 'openai': + # def summarize_with_openai(api_key, input_data, custom_prompt_arg) + summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "anthropic": + # def summarize_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5): + summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message) + elif api_name.lower() == "cohere": + # def summarize_with_cohere(api_key, input_data, model, custom_prompt_arg) + summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "groq": + logging.debug(f"MAIN: Trying to summarize with groq") + # def summarize_with_groq(api_key, input_data, model, custom_prompt_arg): + summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "openrouter": + logging.debug(f"MAIN: Trying to summarize with OpenRouter") + # def summarize_with_openrouter(api_key, input_data, custom_prompt_arg): + summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "deepseek": + logging.debug(f"MAIN: Trying to summarize with DeepSeek") + # def summarize_with_deepseek(api_key, input_data, custom_prompt_arg): + summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "mistral": + summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "llama.cpp": + logging.debug(f"MAIN: Trying to summarize with Llama.cpp") + # def summarize_with_llama(api_url, file_path, token, custom_prompt) + summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "kobold": + logging.debug(f"MAIN: Trying to summarize with Kobold.cpp") + # def summarize_with_kobold(input_data, kobold_api_token, custom_prompt_input, api_url): + summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message) + + elif api_name.lower() == "ooba": + # def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url): + summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message) + + elif api_name.lower() == "tabbyapi": + # def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP): + summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "vllm": + logging.debug(f"MAIN: Trying to summarize with VLLM") + # def summarize_with_vllm(api_key, input_data, custom_prompt_input): + summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message) + + elif api_name.lower() == "local-llm": + logging.debug(f"MAIN: Trying to summarize with Local LLM") + summary = summarize_with_local_llm(json_file_path, 
article_custom_prompt, system_message)
+
+                elif api_name.lower() == "huggingface":
+                    logging.debug(f"MAIN: Trying to summarize with huggingface")
+                    # def summarize_with_huggingface(api_key, input_data, custom_prompt_arg):
+                    summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message)
+            # Add additional API handlers here...
+            except requests.exceptions.ConnectionError as e:
+                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")
+
+            if summary:
+                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
+                save_summary_to_file(summary, json_file_path)
+            else:
+                summary = "Summary not available"
+                logging.warning(f"Failed to generate summary using {api_name} API")
+
+        else:
+            summary = "Article Summarization: No API provided for summarization."
+
+        print(f"Summary: {summary}")  # Debugging statement
+
+        # Step 3: Ingest the article into the database
+        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
+                                                article_custom_prompt)
+
+        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
+    except Exception as e:
+        logging.error(f"Error processing URL {url}: {str(e)}")
+        return f"Failed to process URL {url}: {str(e)}"
+
+
+def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
+    title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
+    author = "Unknown"
+    ingestion_date = datetime.now().strftime('%Y-%m-%d')
+
+    # Summarize the unstructured text
+    if api_name:
+        json_file_path = f"Results/{title.replace(' ', '_')}_segments.json"
+        with open(json_file_path, 'w') as json_file:
+            json.dump([{'text': text}], json_file, indent=2)
+
+        if api_name.lower() == 'openai':
+            summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
+        # Add other APIs as needed
+        else:
+            summary = "Unsupported API."
+    else:
+        summary = "No API provided for summarization."
+
+    # Ingest the unstructured text into the database
+    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date,
+                                            custom_prompt)
+    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
+
+
+
+#
+#
 #######################################################################################################################
\ No newline at end of file
diff --git a/App_Function_Libraries/Audio_Files.py b/App_Function_Libraries/Audio_Files.py
index 63213e38c402ae8231812e86bf157ef96b1b4529..a1c3fb1cad57e208b919b12bd39424047fe87e36 100644
--- a/App_Function_Libraries/Audio_Files.py
+++ b/App_Function_Libraries/Audio_Files.py
@@ -1,692 +1,692 @@
-# Audio_Files.py
-#########################################
-# Audio Processing Library
-# This library is used to download or load audio files from a local directory.
-# -#### -# -# Functions: -# -# download_audio_file(url, save_path) -# process_audio( -# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None) -# -# -######################################### -# Imports -import json -import logging -import os -import subprocess -import tempfile -import uuid -from datetime import datetime -from pathlib import Path - -import requests -import yt_dlp - -from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text -from App_Function_Libraries.Chunk_Lib import improved_chunking_process -# -# Local Imports -from App_Function_Libraries.SQLite_DB import add_media_to_database, add_media_with_keywords, \ - check_media_and_whisper_model -from App_Function_Libraries.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \ - perform_summarization -from App_Function_Libraries.Utils import create_download_directory, save_segments_to_json, downloaded_files, \ - sanitize_filename -from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata - -# -####################################################################################################################### -# Function Definitions -# - -MAX_FILE_SIZE = 500 * 1024 * 1024 - - -def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None): - try: - # Check if media already exists in the database and compare whisper models - should_download, reason = check_media_and_whisper_model( - url=url, - current_whisper_model=current_whisper_model - ) - - if not should_download: - logging.info(f"Skipping audio download: {reason}") - return None - - logging.info(f"Proceeding with audio download: {reason}") - - # Set up the request headers - headers = {} - if use_cookies and cookies: - try: - cookie_dict = json.loads(cookies) - headers['Cookie'] = '; '.join([f'{k}={v}' for k, v in cookie_dict.items()]) - except json.JSONDecodeError: - logging.warning("Invalid cookie format. 
Proceeding without cookies.") - - # Make the request - response = requests.get(url, headers=headers, stream=True) - # Raise an exception for bad status codes - response.raise_for_status() - - # Get the file size - file_size = int(response.headers.get('content-length', 0)) - if file_size > 500 * 1024 * 1024: # 500 MB limit - raise ValueError("File size exceeds the 500MB limit.") - - # Generate a unique filename - file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3" - save_path = os.path.join('downloads', file_name) - - # Ensure the downloads directory exists - os.makedirs('downloads', exist_ok=True) - - - # Download the file - with open(save_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - logging.info(f"Audio file downloaded successfully: {save_path}") - return save_path - - except requests.RequestException as e: - logging.error(f"Error downloading audio file: {str(e)}") - raise - except ValueError as e: - logging.error(str(e)) - raise - except Exception as e: - logging.error(f"Unexpected error downloading audio file: {str(e)}") - raise - - -def process_audio( - audio_file_path, - num_speakers=2, - whisper_model="small.en", - custom_prompt_input=None, - offset=0, - api_name=None, - api_key=None, - vad_filter=False, - rolling_summarization=False, - detail_level=0.01, - keywords="default,no_keyword_set", - chunk_text_by_words=False, - max_words=0, - chunk_text_by_sentences=False, - max_sentences=0, - chunk_text_by_paragraphs=False, - max_paragraphs=0, - chunk_text_by_tokens=False, - max_tokens=0 -): - try: - - # Perform transcription - audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter) - - if audio_file_path is None or segments is None: - logging.error("Process_Audio: Transcription failed or segments not available.") - return "Process_Audio: Transcription failed.", None, None, None, None, None - - logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}") - logging.debug(f"Process_Audio: Transcription segments: {segments}") - - transcription_text = {'audio_file': audio_file_path, 'transcription': segments} - logging.debug(f"Process_Audio: Transcription text: {transcription_text}") - - # Save segments to JSON - segments_json_path = save_segments_to_json(segments) - - # Perform summarization - summary_text = None - if api_name: - if rolling_summarization is not None: - pass - # FIXME rolling summarization - # summary_text = rolling_summarize_function( - # transcription_text, - # detail=detail_level, - # api_name=api_name, - # api_key=api_key, - # custom_prompt=custom_prompt_input, - # chunk_by_words=chunk_text_by_words, - # max_words=max_words, - # chunk_by_sentences=chunk_text_by_sentences, - # max_sentences=max_sentences, - # chunk_by_paragraphs=chunk_text_by_paragraphs, - # max_paragraphs=max_paragraphs, - # chunk_by_tokens=chunk_text_by_tokens, - # max_tokens=max_tokens - # ) - else: - summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key) - - if summary_text is None: - logging.error("Summary text is None. 
Check summarization function.") - summary_file_path = None - else: - summary_text = 'Summary not available' - summary_file_path = None - - # Save transcription and summary - download_path = create_download_directory("Audio_Processing") - json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text, - download_path) - - # Update function call to add_media_to_database so that it properly applies the title, author and file type - # Add to database - add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords, - custom_prompt_input, whisper_model) - - return transcription_text, summary_text, json_file_path, summary_file_path, None, None - - except Exception as e: - logging.error(f"Error in process_audio: {str(e)}") - return str(e), None, None, None, None, None - - -def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original,custom_keywords, source, - custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking, - use_multi_level_chunking, chunk_language): - progress = [] - transcription = "" - summary = "" - - def update_progress(message): - progress.append(message) - return "\n".join(progress) - - try: - # Check file size before processing - file_size = os.path.getsize(audio_file_path) - if file_size > MAX_FILE_SIZE: - update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.") - return "\n".join(progress), "", "" - - # Perform transcription - update_progress("Starting transcription...") - segments = speech_to_text(audio_file_path, whisper_model=whisper_model) - transcription = " ".join([segment['Text'] for segment in segments]) - update_progress("Audio transcribed successfully.") - - # Perform summarization if API is provided - if api_name and api_key: - update_progress("Starting summarization...") - summary = perform_summarization(api_name, transcription, "Summarize the following audio transcript", - api_key) - update_progress("Audio summarized successfully.") - else: - summary = "No summary available" - - # Prepare keywords - keywords = "audio,transcription" - if custom_keywords: - keywords += f",{custom_keywords}" - - # Add to database - add_media_with_keywords( - url=source, - title=os.path.basename(audio_file_path), - media_type='audio', - content=transcription, - keywords=keywords, - prompt="Summarize the following audio transcript", - summary=summary, - transcription_model=whisper_model, - author="Unknown", - ingestion_date=None # This will use the current date - ) - update_progress("Audio file added to database successfully.") - - if not keep_original and source != "Uploaded File": - os.remove(audio_file_path) - update_progress(f"Temporary file {audio_file_path} removed.") - elif keep_original and source != "Uploaded File": - update_progress(f"Original audio file kept at: {audio_file_path}") - - except Exception as e: - update_progress(f"Error processing {source}: {str(e)}") - transcription = f"Error: {str(e)}" - summary = "No summary due to error" - - return "\n".join(progress), transcription, summary - - -def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original, - custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, - use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize): - progress = [] - temp_files = [] - all_transcriptions = [] - 
all_summaries = [] - - def update_progress(message): - progress.append(message) - return "\n".join(progress) - - def cleanup_files(): - for file in temp_files: - try: - if os.path.exists(file): - os.remove(file) - update_progress(f"Temporary file {file} removed.") - except Exception as e: - update_progress(f"Failed to remove temporary file {file}: {str(e)}") - - def reencode_mp3(mp3_file_path): - try: - reencoded_mp3_path = mp3_file_path.replace(".mp3", "_reencoded.mp3") - subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path], check=True) - update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.") - return reencoded_mp3_path - except subprocess.CalledProcessError as e: - update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}") - raise - - def convert_mp3_to_wav(mp3_file_path): - try: - wav_file_path = mp3_file_path.replace(".mp3", ".wav") - subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, wav_file_path], check=True) - update_progress(f"Converted {mp3_file_path} to {wav_file_path}.") - return wav_file_path - except subprocess.CalledProcessError as e: - update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}") - raise - - try: - # Check and set the ffmpeg command - global ffmpeg_cmd - if os.name == "nt": - logging.debug("Running on Windows") - ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe") - else: - ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems - - # Ensure ffmpeg is accessible - if not os.path.exists(ffmpeg_cmd) and os.name == "nt": - raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}") - - # Define chunk options early to avoid undefined errors - chunk_options = { - 'method': chunk_method, - 'max_size': max_chunk_size, - 'overlap': chunk_overlap, - 'adaptive': use_adaptive_chunking, - 'multi_level': use_multi_level_chunking, - 'language': chunk_language - } - - # Process multiple URLs - urls = [url.strip() for url in audio_urls.split('\n') if url.strip()] - - for i, url in enumerate(urls): - update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}") - - # Download and process audio file - audio_file_path = download_audio_file(url, use_cookies, cookies) - if not os.path.exists(audio_file_path): - update_progress(f"Downloaded file not found: {audio_file_path}") - continue - - temp_files.append(audio_file_path) - update_progress("Audio file downloaded successfully.") - - # Re-encode MP3 to fix potential issues - reencoded_mp3_path = reencode_mp3(audio_file_path) - if not os.path.exists(reencoded_mp3_path): - update_progress(f"Re-encoded file not found: {reencoded_mp3_path}") - continue - - temp_files.append(reencoded_mp3_path) - - # Convert re-encoded MP3 to WAV - wav_file_path = convert_mp3_to_wav(reencoded_mp3_path) - if not os.path.exists(wav_file_path): - update_progress(f"Converted WAV file not found: {wav_file_path}") - continue - - temp_files.append(wav_file_path) - - # Initialize transcription - transcription = "" - - # Transcribe audio - if diarize: - segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True) - else: - segments = speech_to_text(wav_file_path, whisper_model=whisper_model) - - # Handle segments nested under 'segments' key - if isinstance(segments, dict) and 'segments' in segments: - segments = segments['segments'] - - if isinstance(segments, list): - transcription = " ".join([segment.get('Text', '') for segment in segments]) - update_progress("Audio transcribed successfully.") - else: - 
update_progress("Unexpected segments format received from speech_to_text.") - logging.error(f"Unexpected segments format: {segments}") - continue - - if not transcription.strip(): - update_progress("Transcription is empty.") - else: - # Apply chunking - chunked_text = improved_chunking_process(transcription, chunk_options) - - # Summarize - if api_name: - try: - summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key) - update_progress("Audio summarized successfully.") - except Exception as e: - logging.error(f"Error during summarization: {str(e)}") - summary = "Summary generation failed" - else: - summary = "No summary available (API not provided)" - - all_transcriptions.append(transcription) - all_summaries.append(summary) - - # Add to database - add_media_with_keywords( - url=url, - title=os.path.basename(wav_file_path), - media_type='audio', - content=transcription, - keywords=custom_keywords, - prompt=custom_prompt_input, - summary=summary, - transcription_model=whisper_model, - author="Unknown", - ingestion_date=datetime.now().strftime('%Y-%m-%d') - ) - update_progress("Audio file processed and added to database.") - - # Process uploaded file if provided - if audio_file: - if os.path.getsize(audio_file.name) > MAX_FILE_SIZE: - update_progress( - f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.") - else: - # Re-encode MP3 to fix potential issues - reencoded_mp3_path = reencode_mp3(audio_file.name) - if not os.path.exists(reencoded_mp3_path): - update_progress(f"Re-encoded file not found: {reencoded_mp3_path}") - return update_progress("Processing failed: Re-encoded file not found"), "", "" - - temp_files.append(reencoded_mp3_path) - - # Convert re-encoded MP3 to WAV - wav_file_path = convert_mp3_to_wav(reencoded_mp3_path) - if not os.path.exists(wav_file_path): - update_progress(f"Converted WAV file not found: {wav_file_path}") - return update_progress("Processing failed: Converted WAV file not found"), "", "" - - temp_files.append(wav_file_path) - - # Initialize transcription - transcription = "" - - if diarize: - segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True) - else: - segments = speech_to_text(wav_file_path, whisper_model=whisper_model) - - # Handle segments nested under 'segments' key - if isinstance(segments, dict) and 'segments' in segments: - segments = segments['segments'] - - if isinstance(segments, list): - transcription = " ".join([segment.get('Text', '') for segment in segments]) - else: - update_progress("Unexpected segments format received from speech_to_text.") - logging.error(f"Unexpected segments format: {segments}") - - chunked_text = improved_chunking_process(transcription, chunk_options) - - if api_name and api_key: - try: - summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key) - update_progress("Audio summarized successfully.") - except Exception as e: - logging.error(f"Error during summarization: {str(e)}") - summary = "Summary generation failed" - else: - summary = "No summary available (API not provided)" - - all_transcriptions.append(transcription) - all_summaries.append(summary) - - add_media_with_keywords( - url="Uploaded File", - title=os.path.basename(wav_file_path), - media_type='audio', - content=transcription, - keywords=custom_keywords, - prompt=custom_prompt_input, - summary=summary, - transcription_model=whisper_model, - author="Unknown", - ingestion_date=datetime.now().strftime('%Y-%m-%d') - ) 
- update_progress("Uploaded file processed and added to database.") - - # Final cleanup - if not keep_original: - cleanup_files() - - final_progress = update_progress("All processing complete.") - final_transcriptions = "\n\n".join(all_transcriptions) - final_summaries = "\n\n".join(all_summaries) - - return final_progress, final_transcriptions, final_summaries - - except Exception as e: - logging.error(f"Error processing audio files: {str(e)}") - cleanup_files() - return update_progress(f"Processing failed: {str(e)}"), "", "" - - -def download_youtube_audio(url): - try: - # Determine ffmpeg path based on the operating system. - ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg' - - # Create a temporary directory - with tempfile.TemporaryDirectory() as temp_dir: - # Extract information about the video - with yt_dlp.YoutubeDL({'quiet': True}) as ydl: - info_dict = ydl.extract_info(url, download=False) - sanitized_title = sanitize_filename(info_dict['title']) - - # Setup the temporary filenames - temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4" - temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3" - - # Initialize yt-dlp with options for downloading - ydl_opts = { - 'format': 'bestaudio[ext=m4a]/best[height<=480]', # Prefer best audio, or video up to 480p - 'ffmpeg_location': ffmpeg_path, - 'outtmpl': str(temp_video_path), - 'noplaylist': True, - 'quiet': True - } - - # Execute yt-dlp to download the video/audio - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) - - # Check if the file exists - if not temp_video_path.exists(): - raise FileNotFoundError(f"Expected file was not found: {temp_video_path}") - - # Use ffmpeg to extract audio - ffmpeg_command = [ - ffmpeg_path, - '-i', str(temp_video_path), - '-vn', # No video - '-acodec', 'libmp3lame', - '-b:a', '192k', - str(temp_audio_path) - ] - subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - - # Check if the audio file was created - if not temp_audio_path.exists(): - raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}") - - # Create a persistent directory for the download if it doesn't exist - persistent_dir = Path("downloads") - persistent_dir.mkdir(exist_ok=True) - - # Move the file from the temporary directory to the persistent directory - persistent_file_path = persistent_dir / f"{sanitized_title}.mp3" - os.replace(str(temp_audio_path), str(persistent_file_path)) - - # Add the file to the list of downloaded files - downloaded_files.append(str(persistent_file_path)) - - return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3" - except Exception as e: - return None, f"Error downloading audio: {str(e)}" - - -def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model, - keep_original=False, enable_diarization=False, use_cookies=False, cookies=None, - chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False, - use_multi_level_chunking=False, chunk_language='english'): - progress = [] - error_message = "" - temp_files = [] - - def update_progress(message): - progress.append(message) - return "\n".join(progress) - - def cleanup_files(): - if not keep_original: - for file in temp_files: - try: - if os.path.exists(file): - os.remove(file) - update_progress(f"Temporary file {file} removed.") - except Exception as e: - update_progress(f"Failed to remove temporary file {file}: {str(e)}") - - try: - # Download podcast - 
audio_file = download_audio_file(url, use_cookies, cookies) - temp_files.append(audio_file) - update_progress("Podcast downloaded successfully.") - - # Extract metadata - metadata = extract_metadata(url) - title = title or metadata.get('title', 'Unknown Podcast') - author = author or metadata.get('uploader', 'Unknown Author') - - # Format metadata for storage - metadata_text = f""" -Metadata: -Title: {title} -Author: {author} -Series: {metadata.get('series', 'N/A')} -Episode: {metadata.get('episode', 'N/A')} -Season: {metadata.get('season', 'N/A')} -Upload Date: {metadata.get('upload_date', 'N/A')} -Duration: {metadata.get('duration', 'N/A')} seconds -Description: {metadata.get('description', 'N/A')} -""" - - # Update keywords - new_keywords = [] - if metadata.get('series'): - new_keywords.append(f"series:{metadata['series']}") - if metadata.get('episode'): - new_keywords.append(f"episode:{metadata['episode']}") - if metadata.get('season'): - new_keywords.append(f"season:{metadata['season']}") - - keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords) - - update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}") - - # Transcribe the podcast - try: - if enable_diarization: - segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True) - else: - segments = speech_to_text(audio_file, whisper_model=whisper_model) - transcription = " ".join([segment['Text'] for segment in segments]) - update_progress("Podcast transcribed successfully.") - except Exception as e: - error_message = f"Transcription failed: {str(e)}" - raise - - # Apply chunking - chunk_options = { - 'method': chunk_method, - 'max_size': max_chunk_size, - 'overlap': chunk_overlap, - 'adaptive': use_adaptive_chunking, - 'multi_level': use_multi_level_chunking, - 'language': chunk_language - } - chunked_text = improved_chunking_process(transcription, chunk_options) - - # Combine metadata and transcription - full_content = metadata_text + "\n\nTranscription:\n" + transcription - - # Summarize if API is provided - summary = None - if api_name and api_key: - try: - summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key) - update_progress("Podcast summarized successfully.") - except Exception as e: - error_message = f"Summarization failed: {str(e)}" - raise - - # Add to database - try: - add_media_with_keywords( - url=url, - title=title, - media_type='podcast', - content=full_content, - keywords=keywords, - prompt=custom_prompt, - summary=summary or "No summary available", - transcription_model=whisper_model, - author=author, - ingestion_date=datetime.now().strftime('%Y-%m-%d') - ) - update_progress("Podcast added to database successfully.") - except Exception as e: - error_message = f"Error adding podcast to database: {str(e)}" - raise - - # Cleanup - cleanup_files() - - return (update_progress("Processing complete."), full_content, summary or "No summary generated.", - title, author, keywords, error_message) - - except Exception as e: - logging.error(f"Error processing podcast: {str(e)}") - cleanup_files() - return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", str(e) - - -# -# +# Audio_Files.py +######################################### +# Audio Processing Library +# This library is used to download or load audio files from a local directory. 
+# +#### +# +# Functions: +# +# download_audio_file(url, save_path) +# process_audio( +# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None) +# +# +######################################### +# Imports +import json +import logging +import os +import subprocess +import tempfile +import uuid +from datetime import datetime +from pathlib import Path + +import requests +import yt_dlp + +from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text +from App_Function_Libraries.Chunk_Lib import improved_chunking_process +# +# Local Imports +from App_Function_Libraries.DB_Manager import add_media_to_database, add_media_with_keywords, \ + check_media_and_whisper_model +from App_Function_Libraries.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \ + perform_summarization +from App_Function_Libraries.Utils import create_download_directory, save_segments_to_json, downloaded_files, \ + sanitize_filename +from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata + +# +####################################################################################################################### +# Function Definitions +# + +MAX_FILE_SIZE = 500 * 1024 * 1024 + + +def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None): + try: + # Check if media already exists in the database and compare whisper models + should_download, reason = check_media_and_whisper_model( + url=url, + current_whisper_model=current_whisper_model + ) + + if not should_download: + logging.info(f"Skipping audio download: {reason}") + return None + + logging.info(f"Proceeding with audio download: {reason}") + + # Set up the request headers + headers = {} + if use_cookies and cookies: + try: + cookie_dict = json.loads(cookies) + headers['Cookie'] = '; '.join([f'{k}={v}' for k, v in cookie_dict.items()]) + except json.JSONDecodeError: + logging.warning("Invalid cookie format. 
Proceeding without cookies.") + + # Make the request + response = requests.get(url, headers=headers, stream=True) + # Raise an exception for bad status codes + response.raise_for_status() + + # Get the file size + file_size = int(response.headers.get('content-length', 0)) + if file_size > 500 * 1024 * 1024: # 500 MB limit + raise ValueError("File size exceeds the 500MB limit.") + + # Generate a unique filename + file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3" + save_path = os.path.join('downloads', file_name) + + # Ensure the downloads directory exists + os.makedirs('downloads', exist_ok=True) + + + # Download the file + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + logging.info(f"Audio file downloaded successfully: {save_path}") + return save_path + + except requests.RequestException as e: + logging.error(f"Error downloading audio file: {str(e)}") + raise + except ValueError as e: + logging.error(str(e)) + raise + except Exception as e: + logging.error(f"Unexpected error downloading audio file: {str(e)}") + raise + + +def process_audio( + audio_file_path, + num_speakers=2, + whisper_model="small.en", + custom_prompt_input=None, + offset=0, + api_name=None, + api_key=None, + vad_filter=False, + rolling_summarization=False, + detail_level=0.01, + keywords="default,no_keyword_set", + chunk_text_by_words=False, + max_words=0, + chunk_text_by_sentences=False, + max_sentences=0, + chunk_text_by_paragraphs=False, + max_paragraphs=0, + chunk_text_by_tokens=False, + max_tokens=0 +): + try: + + # Perform transcription + audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter) + + if audio_file_path is None or segments is None: + logging.error("Process_Audio: Transcription failed or segments not available.") + return "Process_Audio: Transcription failed.", None, None, None, None, None + + logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}") + logging.debug(f"Process_Audio: Transcription segments: {segments}") + + transcription_text = {'audio_file': audio_file_path, 'transcription': segments} + logging.debug(f"Process_Audio: Transcription text: {transcription_text}") + + # Save segments to JSON + segments_json_path = save_segments_to_json(segments) + + # Perform summarization + summary_text = None + if api_name: + if rolling_summarization is not None: + pass + # FIXME rolling summarization + # summary_text = rolling_summarize_function( + # transcription_text, + # detail=detail_level, + # api_name=api_name, + # api_key=api_key, + # custom_prompt=custom_prompt_input, + # chunk_by_words=chunk_text_by_words, + # max_words=max_words, + # chunk_by_sentences=chunk_text_by_sentences, + # max_sentences=max_sentences, + # chunk_by_paragraphs=chunk_text_by_paragraphs, + # max_paragraphs=max_paragraphs, + # chunk_by_tokens=chunk_text_by_tokens, + # max_tokens=max_tokens + # ) + else: + summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key) + + if summary_text is None: + logging.error("Summary text is None. 
Check summarization function.") + summary_file_path = None + else: + summary_text = 'Summary not available' + summary_file_path = None + + # Save transcription and summary + download_path = create_download_directory("Audio_Processing") + json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text, + download_path) + + # Update function call to add_media_to_database so that it properly applies the title, author and file type + # Add to database + add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords, + custom_prompt_input, whisper_model) + + return transcription_text, summary_text, json_file_path, summary_file_path, None, None + + except Exception as e: + logging.error(f"Error in process_audio: {str(e)}") + return str(e), None, None, None, None, None + + +def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original,custom_keywords, source, + custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking, + use_multi_level_chunking, chunk_language): + progress = [] + transcription = "" + summary = "" + + def update_progress(message): + progress.append(message) + return "\n".join(progress) + + try: + # Check file size before processing + file_size = os.path.getsize(audio_file_path) + if file_size > MAX_FILE_SIZE: + update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.") + return "\n".join(progress), "", "" + + # Perform transcription + update_progress("Starting transcription...") + segments = speech_to_text(audio_file_path, whisper_model=whisper_model) + transcription = " ".join([segment['Text'] for segment in segments]) + update_progress("Audio transcribed successfully.") + + # Perform summarization if API is provided + if api_name and api_key: + update_progress("Starting summarization...") + summary = perform_summarization(api_name, transcription, "Summarize the following audio transcript", + api_key) + update_progress("Audio summarized successfully.") + else: + summary = "No summary available" + + # Prepare keywords + keywords = "audio,transcription" + if custom_keywords: + keywords += f",{custom_keywords}" + + # Add to database + add_media_with_keywords( + url=source, + title=os.path.basename(audio_file_path), + media_type='audio', + content=transcription, + keywords=keywords, + prompt="Summarize the following audio transcript", + summary=summary, + transcription_model=whisper_model, + author="Unknown", + ingestion_date=None # This will use the current date + ) + update_progress("Audio file added to database successfully.") + + if not keep_original and source != "Uploaded File": + os.remove(audio_file_path) + update_progress(f"Temporary file {audio_file_path} removed.") + elif keep_original and source != "Uploaded File": + update_progress(f"Original audio file kept at: {audio_file_path}") + + except Exception as e: + update_progress(f"Error processing {source}: {str(e)}") + transcription = f"Error: {str(e)}" + summary = "No summary due to error" + + return "\n".join(progress), transcription, summary + + +def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original, + custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, + use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize): + progress = [] + temp_files = [] + all_transcriptions = [] + 
all_summaries = [] + + def update_progress(message): + progress.append(message) + return "\n".join(progress) + + def cleanup_files(): + for file in temp_files: + try: + if os.path.exists(file): + os.remove(file) + update_progress(f"Temporary file {file} removed.") + except Exception as e: + update_progress(f"Failed to remove temporary file {file}: {str(e)}") + + def reencode_mp3(mp3_file_path): + try: + reencoded_mp3_path = mp3_file_path.replace(".mp3", "_reencoded.mp3") + subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path], check=True) + update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.") + return reencoded_mp3_path + except subprocess.CalledProcessError as e: + update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}") + raise + + def convert_mp3_to_wav(mp3_file_path): + try: + wav_file_path = mp3_file_path.replace(".mp3", ".wav") + subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, wav_file_path], check=True) + update_progress(f"Converted {mp3_file_path} to {wav_file_path}.") + return wav_file_path + except subprocess.CalledProcessError as e: + update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}") + raise + + try: + # Check and set the ffmpeg command + global ffmpeg_cmd + if os.name == "nt": + logging.debug("Running on Windows") + ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe") + else: + ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems + + # Ensure ffmpeg is accessible + if not os.path.exists(ffmpeg_cmd) and os.name == "nt": + raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}") + + # Define chunk options early to avoid undefined errors + chunk_options = { + 'method': chunk_method, + 'max_size': max_chunk_size, + 'overlap': chunk_overlap, + 'adaptive': use_adaptive_chunking, + 'multi_level': use_multi_level_chunking, + 'language': chunk_language + } + + # Process multiple URLs + urls = [url.strip() for url in audio_urls.split('\n') if url.strip()] + + for i, url in enumerate(urls): + update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}") + + # Download and process audio file + audio_file_path = download_audio_file(url, use_cookies, cookies) + if not os.path.exists(audio_file_path): + update_progress(f"Downloaded file not found: {audio_file_path}") + continue + + temp_files.append(audio_file_path) + update_progress("Audio file downloaded successfully.") + + # Re-encode MP3 to fix potential issues + reencoded_mp3_path = reencode_mp3(audio_file_path) + if not os.path.exists(reencoded_mp3_path): + update_progress(f"Re-encoded file not found: {reencoded_mp3_path}") + continue + + temp_files.append(reencoded_mp3_path) + + # Convert re-encoded MP3 to WAV + wav_file_path = convert_mp3_to_wav(reencoded_mp3_path) + if not os.path.exists(wav_file_path): + update_progress(f"Converted WAV file not found: {wav_file_path}") + continue + + temp_files.append(wav_file_path) + + # Initialize transcription + transcription = "" + + # Transcribe audio + if diarize: + segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True) + else: + segments = speech_to_text(wav_file_path, whisper_model=whisper_model) + + # Handle segments nested under 'segments' key + if isinstance(segments, dict) and 'segments' in segments: + segments = segments['segments'] + + if isinstance(segments, list): + transcription = " ".join([segment.get('Text', '') for segment in segments]) + update_progress("Audio transcribed successfully.") + else: + 
update_progress("Unexpected segments format received from speech_to_text.") + logging.error(f"Unexpected segments format: {segments}") + continue + + if not transcription.strip(): + update_progress("Transcription is empty.") + else: + # Apply chunking + chunked_text = improved_chunking_process(transcription, chunk_options) + + # Summarize + if api_name: + try: + summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key) + update_progress("Audio summarized successfully.") + except Exception as e: + logging.error(f"Error during summarization: {str(e)}") + summary = "Summary generation failed" + else: + summary = "No summary available (API not provided)" + + all_transcriptions.append(transcription) + all_summaries.append(summary) + + # Add to database + add_media_with_keywords( + url=url, + title=os.path.basename(wav_file_path), + media_type='audio', + content=transcription, + keywords=custom_keywords, + prompt=custom_prompt_input, + summary=summary, + transcription_model=whisper_model, + author="Unknown", + ingestion_date=datetime.now().strftime('%Y-%m-%d') + ) + update_progress("Audio file processed and added to database.") + + # Process uploaded file if provided + if audio_file: + if os.path.getsize(audio_file.name) > MAX_FILE_SIZE: + update_progress( + f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.") + else: + # Re-encode MP3 to fix potential issues + reencoded_mp3_path = reencode_mp3(audio_file.name) + if not os.path.exists(reencoded_mp3_path): + update_progress(f"Re-encoded file not found: {reencoded_mp3_path}") + return update_progress("Processing failed: Re-encoded file not found"), "", "" + + temp_files.append(reencoded_mp3_path) + + # Convert re-encoded MP3 to WAV + wav_file_path = convert_mp3_to_wav(reencoded_mp3_path) + if not os.path.exists(wav_file_path): + update_progress(f"Converted WAV file not found: {wav_file_path}") + return update_progress("Processing failed: Converted WAV file not found"), "", "" + + temp_files.append(wav_file_path) + + # Initialize transcription + transcription = "" + + if diarize: + segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True) + else: + segments = speech_to_text(wav_file_path, whisper_model=whisper_model) + + # Handle segments nested under 'segments' key + if isinstance(segments, dict) and 'segments' in segments: + segments = segments['segments'] + + if isinstance(segments, list): + transcription = " ".join([segment.get('Text', '') for segment in segments]) + else: + update_progress("Unexpected segments format received from speech_to_text.") + logging.error(f"Unexpected segments format: {segments}") + + chunked_text = improved_chunking_process(transcription, chunk_options) + + if api_name and api_key: + try: + summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key) + update_progress("Audio summarized successfully.") + except Exception as e: + logging.error(f"Error during summarization: {str(e)}") + summary = "Summary generation failed" + else: + summary = "No summary available (API not provided)" + + all_transcriptions.append(transcription) + all_summaries.append(summary) + + add_media_with_keywords( + url="Uploaded File", + title=os.path.basename(wav_file_path), + media_type='audio', + content=transcription, + keywords=custom_keywords, + prompt=custom_prompt_input, + summary=summary, + transcription_model=whisper_model, + author="Unknown", + ingestion_date=datetime.now().strftime('%Y-%m-%d') + ) 
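+            # Note: add_media_with_keywords expects `keywords` as a single
+            # comma-separated string; e.g. custom_keywords="meeting,q3-review"
+            # would be stored as two keywords (example values are illustrative only).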
+ update_progress("Uploaded file processed and added to database.") + + # Final cleanup + if not keep_original: + cleanup_files() + + final_progress = update_progress("All processing complete.") + final_transcriptions = "\n\n".join(all_transcriptions) + final_summaries = "\n\n".join(all_summaries) + + return final_progress, final_transcriptions, final_summaries + + except Exception as e: + logging.error(f"Error processing audio files: {str(e)}") + cleanup_files() + return update_progress(f"Processing failed: {str(e)}"), "", "" + + +def download_youtube_audio(url): + try: + # Determine ffmpeg path based on the operating system. + ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg' + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Extract information about the video + with yt_dlp.YoutubeDL({'quiet': True}) as ydl: + info_dict = ydl.extract_info(url, download=False) + sanitized_title = sanitize_filename(info_dict['title']) + + # Setup the temporary filenames + temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4" + temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3" + + # Initialize yt-dlp with options for downloading + ydl_opts = { + 'format': 'bestaudio[ext=m4a]/best[height<=480]', # Prefer best audio, or video up to 480p + 'ffmpeg_location': ffmpeg_path, + 'outtmpl': str(temp_video_path), + 'noplaylist': True, + 'quiet': True + } + + # Execute yt-dlp to download the video/audio + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + + # Check if the file exists + if not temp_video_path.exists(): + raise FileNotFoundError(f"Expected file was not found: {temp_video_path}") + + # Use ffmpeg to extract audio + ffmpeg_command = [ + ffmpeg_path, + '-i', str(temp_video_path), + '-vn', # No video + '-acodec', 'libmp3lame', + '-b:a', '192k', + str(temp_audio_path) + ] + subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + # Check if the audio file was created + if not temp_audio_path.exists(): + raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}") + + # Create a persistent directory for the download if it doesn't exist + persistent_dir = Path("downloads") + persistent_dir.mkdir(exist_ok=True) + + # Move the file from the temporary directory to the persistent directory + persistent_file_path = persistent_dir / f"{sanitized_title}.mp3" + os.replace(str(temp_audio_path), str(persistent_file_path)) + + # Add the file to the list of downloaded files + downloaded_files.append(str(persistent_file_path)) + + return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3" + except Exception as e: + return None, f"Error downloading audio: {str(e)}" + + +def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model, + keep_original=False, enable_diarization=False, use_cookies=False, cookies=None, + chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False, + use_multi_level_chunking=False, chunk_language='english'): + progress = [] + error_message = "" + temp_files = [] + + def update_progress(message): + progress.append(message) + return "\n".join(progress) + + def cleanup_files(): + if not keep_original: + for file in temp_files: + try: + if os.path.exists(file): + os.remove(file) + update_progress(f"Temporary file {file} removed.") + except Exception as e: + update_progress(f"Failed to remove temporary file {file}: {str(e)}") + + try: + # Download podcast + 
audio_file = download_audio_file(url, use_cookies, cookies) + temp_files.append(audio_file) + update_progress("Podcast downloaded successfully.") + + # Extract metadata + metadata = extract_metadata(url) + title = title or metadata.get('title', 'Unknown Podcast') + author = author or metadata.get('uploader', 'Unknown Author') + + # Format metadata for storage + metadata_text = f""" +Metadata: +Title: {title} +Author: {author} +Series: {metadata.get('series', 'N/A')} +Episode: {metadata.get('episode', 'N/A')} +Season: {metadata.get('season', 'N/A')} +Upload Date: {metadata.get('upload_date', 'N/A')} +Duration: {metadata.get('duration', 'N/A')} seconds +Description: {metadata.get('description', 'N/A')} +""" + + # Update keywords + new_keywords = [] + if metadata.get('series'): + new_keywords.append(f"series:{metadata['series']}") + if metadata.get('episode'): + new_keywords.append(f"episode:{metadata['episode']}") + if metadata.get('season'): + new_keywords.append(f"season:{metadata['season']}") + + keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords) + + update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}") + + # Transcribe the podcast + try: + if enable_diarization: + segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True) + else: + segments = speech_to_text(audio_file, whisper_model=whisper_model) + transcription = " ".join([segment['Text'] for segment in segments]) + update_progress("Podcast transcribed successfully.") + except Exception as e: + error_message = f"Transcription failed: {str(e)}" + raise + + # Apply chunking + chunk_options = { + 'method': chunk_method, + 'max_size': max_chunk_size, + 'overlap': chunk_overlap, + 'adaptive': use_adaptive_chunking, + 'multi_level': use_multi_level_chunking, + 'language': chunk_language + } + chunked_text = improved_chunking_process(transcription, chunk_options) + + # Combine metadata and transcription + full_content = metadata_text + "\n\nTranscription:\n" + transcription + + # Summarize if API is provided + summary = None + if api_name and api_key: + try: + summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key) + update_progress("Podcast summarized successfully.") + except Exception as e: + error_message = f"Summarization failed: {str(e)}" + raise + + # Add to database + try: + add_media_with_keywords( + url=url, + title=title, + media_type='podcast', + content=full_content, + keywords=keywords, + prompt=custom_prompt, + summary=summary or "No summary available", + transcription_model=whisper_model, + author=author, + ingestion_date=datetime.now().strftime('%Y-%m-%d') + ) + update_progress("Podcast added to database successfully.") + except Exception as e: + error_message = f"Error adding podcast to database: {str(e)}" + raise + + # Cleanup + cleanup_files() + + return (update_progress("Processing complete."), full_content, summary or "No summary generated.", + title, author, keywords, error_message) + + except Exception as e: + logging.error(f"Error processing podcast: {str(e)}") + cleanup_files() + return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", str(e) + + +# +# ####################################################################################################################### \ No newline at end of file diff --git a/App_Function_Libraries/Audio_Transcription_Lib.py b/App_Function_Libraries/Audio_Transcription_Lib.py index 
315b13f7e03e0587c1e1a04bce343ee974ecff20..2b454161698a440856c7104092d7191a8ddf9736 100644 --- a/App_Function_Libraries/Audio_Transcription_Lib.py +++ b/App_Function_Libraries/Audio_Transcription_Lib.py @@ -1,192 +1,192 @@ -# Audio_Transcription_Lib.py -######################################### -# Transcription Library -# This library is used to perform transcription of audio files. -# Currently, uses faster_whisper for transcription. -# -#### -import configparser -#################### -# Function List -# -# 1. convert_to_wav(video_file_path, offset=0, overwrite=False) -# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False) -# -#################### -# -# Import necessary libraries to run solo for testing -import gc -import json -import logging -import os -import sys -import subprocess -import time - -# DEBUG Imports -#from memory_profiler import profile - -# Import Local -# -####################################################################################################################### -# Function Definitions -# - -# Convert video .m4a into .wav using ffmpeg -# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav" -# https://www.gyan.dev/ffmpeg/builds/ -# - - -whisper_model_instance = None -# Retrieve processing choice from the configuration file -config = configparser.ConfigParser() -config.read('config.txt') -processing_choice = config.get('Processing', 'processing_choice', fallback='cpu') - - -# FIXME: This is a temporary solution. -# This doesn't clear older models, which means potentially a lot of memory is being used... -def get_whisper_model(model_name, device): - global whisper_model_instance - if whisper_model_instance is None: - from faster_whisper import WhisperModel - logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}") - whisper_model_instance = WhisperModel(model_name, device=device) - return whisper_model_instance - - -# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"') -#DEBUG -#@profile -def convert_to_wav(video_file_path, offset=0, overwrite=False): - out_path = os.path.splitext(video_file_path)[0] + ".wav" - - if os.path.exists(out_path) and not overwrite: - print(f"File '{out_path}' already exists. 
Skipping conversion.") - logging.info(f"Skipping conversion as file already exists: {out_path}") - return out_path - print("Starting conversion process of .m4a to .WAV") - out_path = os.path.splitext(video_file_path)[0] + ".wav" - - try: - if os.name == "nt": - logging.debug("ffmpeg being ran on windows") - - if sys.platform.startswith('win'): - ffmpeg_cmd = ".\\Bin\\ffmpeg.exe" - logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}") - else: - ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems - - command = [ - ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists - "-ss", "00:00:00", # Start at the beginning of the video - "-i", video_file_path, - "-ar", "16000", # Audio sample rate - "-ac", "1", # Number of audio channels - "-c:a", "pcm_s16le", # Audio codec - out_path - ] - try: - # Redirect stdin from null device to prevent ffmpeg from waiting for input - with open(os.devnull, 'rb') as null_file: - result = subprocess.run(command, stdin=null_file, text=True, capture_output=True) - if result.returncode == 0: - logging.info("FFmpeg executed successfully") - logging.debug("FFmpeg output: %s", result.stdout) - else: - logging.error("Error in running FFmpeg") - logging.error("FFmpeg stderr: %s", result.stderr) - raise RuntimeError(f"FFmpeg error: {result.stderr}") - except Exception as e: - logging.error("Error occurred - ffmpeg doesn't like windows") - raise RuntimeError("ffmpeg failed") - elif os.name == "posix": - os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"') - else: - raise RuntimeError("Unsupported operating system") - logging.info("Conversion to WAV completed: %s", out_path) - except subprocess.CalledProcessError as e: - logging.error("Error executing FFmpeg command: %s", str(e)) - raise RuntimeError("Error converting video file to WAV") - except Exception as e: - logging.error("speech-to-text: Error transcribing audio: %s", str(e)) - return {"error": str(e)} - gc.collect() - return out_path - - -# Transcribe .wav into .segments.json -#DEBUG -#@profile -def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False): - global whisper_model_instance, processing_choice - logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model) - - time_start = time.time() - if audio_file_path is None: - raise ValueError("speech-to-text: No audio file provided") - logging.info("speech-to-text: Audio file path: %s", audio_file_path) - - try: - _, file_ending = os.path.splitext(audio_file_path) - out_file = audio_file_path.replace(file_ending, ".segments.json") - prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json") - if os.path.exists(out_file): - logging.info("speech-to-text: Segments file already exists: %s", out_file) - with open(out_file) as f: - global segments - segments = json.load(f) - return segments - - logging.info('speech-to-text: Starting transcription...') - options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter) - transcribe_options = dict(task="transcribe", **options) - # use function and config at top of file - whisper_model_instance = get_whisper_model(whisper_model, processing_choice) - segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options) - - segments = [] - for segment_chunk in segments_raw: - chunk = { - "Time_Start": segment_chunk.start, - "Time_End": segment_chunk.end, - "Text": segment_chunk.text - } - 
logging.debug("Segment: %s", chunk) - segments.append(chunk) - - if segments: - segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"] - - if not segments: - raise RuntimeError("No transcription produced. The audio file may be invalid or empty.") - logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start) - - # Save the segments to a JSON file - prettified and non-prettified - # FIXME so this is an optional flag to save either the prettified json file or the normal one - save_json = True - if save_json: - logging.info("speech-to-text: Saving segments to JSON file") - output_data = {'segments': segments} - - logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file) - with open(prettified_out_file, 'w') as f: - json.dump(output_data, f, indent=2) - - logging.info("speech-to-text: Saving JSON to %s", out_file) - with open(out_file, 'w') as f: - json.dump(output_data, f) - - logging.debug(f"speech-to-text: returning {segments[:500]}") - gc.collect() - return segments - - except Exception as e: - logging.error("speech-to-text: Error transcribing audio: %s", str(e)) - raise RuntimeError("speech-to-text: Error transcribing audio") - -# -# +# Audio_Transcription_Lib.py +######################################### +# Transcription Library +# This library is used to perform transcription of audio files. +# Currently, uses faster_whisper for transcription. +# +#### +import configparser +#################### +# Function List +# +# 1. convert_to_wav(video_file_path, offset=0, overwrite=False) +# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False) +# +#################### +# +# Import necessary libraries to run solo for testing +import gc +import json +import logging +import os +import sys +import subprocess +import time + +# DEBUG Imports +#from memory_profiler import profile + +# Import Local +# +####################################################################################################################### +# Function Definitions +# + +# Convert video .m4a into .wav using ffmpeg +# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav" +# https://www.gyan.dev/ffmpeg/builds/ +# + + +whisper_model_instance = None +# Retrieve processing choice from the configuration file +config = configparser.ConfigParser() +config.read('config.txt') +processing_choice = config.get('Processing', 'processing_choice', fallback='cpu') + + +# FIXME: This is a temporary solution. +# This doesn't clear older models, which means potentially a lot of memory is being used... +def get_whisper_model(model_name, device): + global whisper_model_instance + if whisper_model_instance is None: + from faster_whisper import WhisperModel + logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}") + whisper_model_instance = WhisperModel(model_name, device=device) + return whisper_model_instance + + +# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"') +#DEBUG +#@profile +def convert_to_wav(video_file_path, offset=0, overwrite=False): + out_path = os.path.splitext(video_file_path)[0] + ".wav" + + if os.path.exists(out_path) and not overwrite: + print(f"File '{out_path}' already exists. 
Skipping conversion.")
+        logging.info(f"Skipping conversion as file already exists: {out_path}")
+        return out_path
+    print("Starting conversion process of .m4a to .wav")
+
+    try:
+        if os.name == "nt":
+            logging.debug("ffmpeg being run on Windows")
+
+            if sys.platform.startswith('win'):
+                ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
+                logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
+            else:
+                ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
+
+            command = [
+                ffmpeg_cmd,  # Assuming the working directory is correctly set where .\Bin exists
+                "-ss", "00:00:00",  # Start at the beginning of the video
+                "-i", video_file_path,
+                "-ar", "16000",  # Audio sample rate
+                "-ac", "1",  # Number of audio channels
+                "-c:a", "pcm_s16le",  # Audio codec
+                out_path
+            ]
+            try:
+                # Redirect stdin from null device to prevent ffmpeg from waiting for input
+                with open(os.devnull, 'rb') as null_file:
+                    result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
+                if result.returncode == 0:
+                    logging.info("FFmpeg executed successfully")
+                    logging.debug("FFmpeg output: %s", result.stdout)
+                else:
+                    logging.error("Error in running FFmpeg")
+                    logging.error("FFmpeg stderr: %s", result.stderr)
+                    raise RuntimeError(f"FFmpeg error: {result.stderr}")
+            except Exception as e:
+                logging.error("Error occurred while running ffmpeg on Windows: %s", str(e))
+                raise RuntimeError("ffmpeg failed") from e
+        elif os.name == "posix":
+            os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
+        else:
+            raise RuntimeError("Unsupported operating system")
+        logging.info("Conversion to WAV completed: %s", out_path)
+    except subprocess.CalledProcessError as e:
+        logging.error("Error executing FFmpeg command: %s", str(e))
+        raise RuntimeError("Error converting video file to WAV")
+    except Exception as e:
+        logging.error("convert_to_wav: Error converting file: %s", str(e))
+        return {"error": str(e)}
+    gc.collect()
+    return out_path
+
+
+# Transcribe .wav into .segments.json
+#DEBUG
+#@profile
+def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
+    global whisper_model_instance, processing_choice
+    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
+
+    time_start = time.time()
+    if audio_file_path is None:
+        raise ValueError("speech-to-text: No audio file provided")
+    logging.info("speech-to-text: Audio file path: %s", audio_file_path)
+
+    try:
+        _, file_ending = os.path.splitext(audio_file_path)
+        out_file = audio_file_path.replace(file_ending, ".segments.json")
+        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
+        if os.path.exists(out_file):
+            logging.info("speech-to-text: Segments file already exists: %s", out_file)
+            with open(out_file) as f:
+                global segments
+                segments = json.load(f)
+            return segments
+
+        logging.info('speech-to-text: Starting transcription...')
+        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
+        transcribe_options = dict(task="transcribe", **options)
+        # use function and config at top of file
+        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
+        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
+
+        segments = []
+        for segment_chunk in segments_raw:
+            chunk = {
+                "Time_Start": segment_chunk.start,
+                "Time_End": segment_chunk.end,
+                "Text": segment_chunk.text
+            }
+
logging.debug("Segment: %s", chunk) + segments.append(chunk) + + if segments: + segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"] + + if not segments: + raise RuntimeError("No transcription produced. The audio file may be invalid or empty.") + logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start) + + # Save the segments to a JSON file - prettified and non-prettified + # FIXME so this is an optional flag to save either the prettified json file or the normal one + save_json = True + if save_json: + logging.info("speech-to-text: Saving segments to JSON file") + output_data = {'segments': segments} + + logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file) + with open(prettified_out_file, 'w') as f: + json.dump(output_data, f, indent=2) + + logging.info("speech-to-text: Saving JSON to %s", out_file) + with open(out_file, 'w') as f: + json.dump(output_data, f) + + logging.debug(f"speech-to-text: returning {segments[:500]}") + gc.collect() + return segments + + except Exception as e: + logging.error("speech-to-text: Error transcribing audio: %s", str(e)) + raise RuntimeError("speech-to-text: Error transcribing audio") + +# +# ####################################################################################################################### \ No newline at end of file diff --git a/App_Function_Libraries/Chat.py b/App_Function_Libraries/Chat.py new file mode 100644 index 0000000000000000000000000000000000000000..d46a658cbc60dda03976b530243492c6fae4672d --- /dev/null +++ b/App_Function_Libraries/Chat.py @@ -0,0 +1,273 @@ +# Chat.py +# Chat functions for interacting with the LLMs as chatbots + +# Imports +import json +import logging +import os +import re +import tempfile +from datetime import datetime + +from App_Function_Libraries.DB_Manager import get_conversation_name, save_chat_history_to_database +from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \ + chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface, chat_with_vllm +from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \ + chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi +from App_Function_Libraries.SQLite_DB import load_media_content +from App_Function_Libraries.Utils import generate_unique_filename + + +# +# External Imports +# +# Local Imports +# + +#################################################################################################### +def chat(message, history, media_content, selected_parts, api_endpoint, api_key, prompt, temperature, + system_message=None): + try: + logging.info(f"Debug - Chat Function - Message: {message}") + logging.info(f"Debug - Chat Function - Media Content: {media_content}") + logging.info(f"Debug - Chat Function - Selected Parts: {selected_parts}") + logging.info(f"Debug - Chat Function - API Endpoint: {api_endpoint}") + # logging.info(f"Debug - Chat Function - Prompt: {prompt}") + + # Ensure selected_parts is a list + if not isinstance(selected_parts, (list, tuple)): + selected_parts = [selected_parts] if selected_parts else [] + + # logging.debug(f"Debug - Chat Function - Selected Parts (after check): {selected_parts}") + + # Combine the selected parts of the media content + combined_content = "\n\n".join( + [f"{part.capitalize()}: {media_content.get(part, '')}" for part in 
selected_parts if part in media_content])
+        # Print first 500 chars
+        # logging.debug(f"Debug - Chat Function - Combined Content: {combined_content[:500]}...")
+
+        # Prepare the input for the API
+        if not history:
+            input_data = f"{combined_content}\n\nUser: {message}\n"
+        else:
+            input_data = f"User: {message}\n"
+        # Print first 500 chars
+        # logging.info(f"Debug - Chat Function - Input Data: {input_data[:500]}...")
+
+        if system_message:
+            print(f"System message: {system_message}")
+            logging.debug(f"Debug - Chat Function - System Message: {system_message}")
+        temperature = float(temperature) if temperature else 0.7
+        temp = temperature
+
+        logging.debug(f"Debug - Chat Function - Temperature: {temperature}")
+        # Log only a short prefix of the API key so the full secret never lands in the logs
+        logging.debug(f"Debug - Chat Function - API Key: {api_key[:10]}")
+        logging.debug(f"Debug - Chat Function - Prompt: {prompt}")
+
+        # Use the existing API request code based on the selected endpoint
+        logging.info(f"Debug - Chat Function - API Endpoint: {api_endpoint}")
+        if api_endpoint.lower() == 'openai':
+            response = chat_with_openai(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "anthropic":
+            response = chat_with_anthropic(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "cohere":
+            response = chat_with_cohere(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "groq":
+            response = chat_with_groq(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "openrouter":
+            response = chat_with_openrouter(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "deepseek":
+            response = chat_with_deepseek(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "mistral":
+            response = chat_with_mistral(api_key, input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "llama.cpp":
+            response = chat_with_llama(input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "kobold":
+            response = chat_with_kobold(input_data, api_key, prompt, temp, system_message)
+        elif api_endpoint.lower() == "ooba":
+            response = chat_with_oobabooga(input_data, api_key, prompt, temp, system_message)
+        elif api_endpoint.lower() == "tabbyapi":
+            response = chat_with_tabbyapi(input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "vllm":
+            response = chat_with_vllm(input_data, prompt, system_message)
+        elif api_endpoint.lower() == "local-llm":
+            response = chat_with_local_llm(input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "huggingface":
+            response = chat_with_huggingface(api_key, input_data, prompt, temp)  # , system_message)
+        elif api_endpoint.lower() == "ollama":
+            response = chat_with_ollama(input_data, prompt, temp, system_message)
+        elif api_endpoint.lower() == "aphrodite":
+            response = chat_with_aphrodite(input_data, prompt, temp, system_message)
+        else:
+            raise ValueError(f"Unsupported API endpoint: {api_endpoint}")
+
+        return response
+
+    except Exception as e:
+        logging.error(f"Error in chat function: {str(e)}")
+        return f"An error occurred: {str(e)}"
+
+
+def save_chat_history_to_db_wrapper(chatbot, conversation_id, media_content):
+    logging.info(f"Attempting to save chat history. 
Media content type: {type(media_content)}") + try: + # Extract the media_id and media_name from the media_content + media_id = None + media_name = None + if isinstance(media_content, dict): + logging.debug(f"Media content keys: {media_content.keys()}") + if 'content' in media_content: + try: + content = media_content['content'] + if isinstance(content, str): + content_json = json.loads(content) + elif isinstance(content, dict): + content_json = content + else: + raise ValueError(f"Unexpected content type: {type(content)}") + + # Use the webpage_url as the media_id + media_id = content_json.get('webpage_url') + # Use the title as the media_name + media_name = content_json.get('title') + + logging.info(f"Extracted media_id: {media_id}, media_name: {media_name}") + except json.JSONDecodeError: + logging.error("Failed to decode JSON from media_content['content']") + except Exception as e: + logging.error(f"Error processing media_content: {str(e)}") + else: + logging.warning("'content' key not found in media_content") + else: + logging.warning(f"media_content is not a dictionary. Type: {type(media_content)}") + + if media_id is None: + # If we couldn't find a media_id, we'll use a placeholder + media_id = "unknown_media" + logging.warning(f"Unable to extract media_id from media_content. Using placeholder: {media_id}") + + if media_name is None: + media_name = "Unnamed Media" + logging.warning(f"Unable to extract media_name from media_content. Using placeholder: {media_name}") + + # Generate a unique conversation name using media_id and current timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + conversation_name = f"Chat_{media_id}_{timestamp}" + + new_conversation_id = save_chat_history_to_database(chatbot, conversation_id, media_id, media_name, + conversation_name) + return new_conversation_id, f"Chat history saved successfully as {conversation_name}!" 
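+        # Minimal usage sketch (hypothetical caller; `chatbot` is the Gradio
+        # chat history and `conversation_id` may be None for a new conversation):
+        #   new_id, status = save_chat_history_to_db_wrapper(chatbot, None, media_content)
+        #   logging.info(status)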
+    except Exception as e:
+        error_message = f"Failed to save chat history: {str(e)}"
+        logging.error(error_message, exc_info=True)
+        return conversation_id, error_message
+
+
+def save_chat_history(history, conversation_id, media_content):
+    try:
+        content, conversation_name = generate_chat_history_content(history, conversation_id, media_content)
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        safe_conversation_name = re.sub(r'[^a-zA-Z0-9_-]', '_', conversation_name)
+        base_filename = f"{safe_conversation_name}_{timestamp}.json"
+
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
+            temp_file.write(content)
+            temp_file_path = temp_file.name
+
+        # Generate a unique filename
+        unique_filename = generate_unique_filename(os.path.dirname(temp_file_path), base_filename)
+        final_path = os.path.join(os.path.dirname(temp_file_path), unique_filename)
+
+        # Rename the temporary file to the unique filename
+        os.rename(temp_file_path, final_path)
+
+        return final_path
+    except Exception as e:
+        logging.error(f"Error saving chat history: {str(e)}")
+        return None
+
+
+def generate_chat_history_content(history, conversation_id, media_content):
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    conversation_name = get_conversation_name(conversation_id)
+
+    if not conversation_name:
+        media_name = extract_media_name(media_content)
+        if media_name:
+            conversation_name = f"{media_name}-chat"
+        else:
+            conversation_name = f"chat-{timestamp}"  # Fallback name
+
+    # Normalize the history into role/content entries. Gradio-style histories
+    # store each turn as a (user_message, bot_message) pair; flat histories
+    # alternate roles by position.
+    history_entries = []
+    for i, msg in enumerate(history):
+        if isinstance(msg, (tuple, list)) and len(msg) == 2:
+            user_msg, bot_msg = msg
+            history_entries.append({"role": "user", "content": user_msg})
+            if bot_msg is not None:
+                history_entries.append({"role": "bot", "content": bot_msg})
+        else:
+            history_entries.append({"role": "user" if i % 2 == 0 else "bot", "content": msg})
+
+    chat_data = {
+        "conversation_id": conversation_id,
+        "conversation_name": conversation_name,
+        "timestamp": timestamp,
+        "history": history_entries
+    }
+
+    return json.dumps(chat_data, indent=2), conversation_name
+
+
+def extract_media_name(media_content):
+    if isinstance(media_content, dict):
+        content = media_content.get('content', {})
+        if isinstance(content, str):
+            try:
+                content = json.loads(content)
+            except json.JSONDecodeError:
+                logging.warning("Failed to parse media_content JSON string")
+                return None
+
+        # Try to extract title from the content
+        if isinstance(content, dict):
+            return content.get('title') or content.get('name')
+
+    logging.warning(f"Unexpected media_content format: {type(media_content)}")
+    return None
+
+
+def update_chat_content(selected_item, use_content, use_summary, use_prompt, item_mapping):
+    logging.debug(f"Debug - Update Chat Content - Selected Item: {selected_item}\n")
+    logging.debug(f"Debug - Update Chat Content - Use Content: {use_content}\n\n\n\n")
+    logging.debug(f"Debug - Update Chat Content - Use Summary: {use_summary}\n\n")
+    logging.debug(f"Debug - Update Chat Content - Use Prompt: {use_prompt}\n\n")
+    logging.debug(f"Debug - Update Chat Content - Item Mapping: {item_mapping}\n\n")
+
+    if selected_item and selected_item in item_mapping:
+        media_id = item_mapping[selected_item]
+        content = load_media_content(media_id)
+        selected_parts = []
+        if use_content and "content" in content:
+            selected_parts.append("content")
+        if use_summary and "summary" in content:
+            selected_parts.append("summary")
+        if use_prompt and "prompt" in content:
+            selected_parts.append("prompt")
+
+        # Modified debug print
+        if isinstance(content, dict):
+            print(f"Debug - Update Chat Content - Content keys: {list(content.keys())}")
+            for key, value in content.items():
+                print(f"Debug - Update Chat Content - {key} (first 500 char): {str(value)[:500]}\n\n\n\n")
+        else:
+            print(f"Debug - Update Chat Content - Content (first 500 char): {str(content)[:500]}\n\n\n\n")
+
+        print(f"Debug - Update Chat Content - Selected Parts: {selected_parts}")
+        return content, selected_parts
+    else:
+        print("Debug - Update Chat Content - No item selected or item not in mapping")
+        return {}, []
+
+
+#
+# End of Chat.py
+##########################################################################################################################
\ No newline at end of file
diff --git a/App_Function_Libraries/Chat_related_functions.py b/App_Function_Libraries/Chat_related_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..38c6e71766fe87a35964cda60b2b4947f68a96e6
--- /dev/null
+++ b/App_Function_Libraries/Chat_related_functions.py
@@ -0,0 +1,41 @@
+# Chat_related_functions.py
+# Contains functions related to chat
+# WIP.
+#
+# Importing required libraries
+import json
+import os
+from pathlib import Path
+#
+########################################################################################################################
+# Set globals
+CHARACTERS_FILE = Path('.', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
+
+def save_character(character_data):
+    if CHARACTERS_FILE.exists():
+        with CHARACTERS_FILE.open('r') as f:
+            characters = json.load(f)
+    else:
+        characters = {}
+
+    characters[character_data['name']] = character_data
+
+    with CHARACTERS_FILE.open('w') as f:
+        json.dump(characters, f, indent=2)
+
+
+def load_characters():
+    if CHARACTERS_FILE.exists():
+        with CHARACTERS_FILE.open('r') as f:
+            return json.load(f)
+    return {}
+
+
+def get_character_names():
+    characters = load_characters()
+    return list(characters.keys())
+
+
+
+
diff --git a/App_Function_Libraries/ChromaDB_Library.py b/App_Function_Libraries/ChromaDB_Library.py
new file mode 100644
index 0000000000000000000000000000000000000000..49f79927368c6f1474065677063565cd73f13ec2
--- /dev/null
+++ b/App_Function_Libraries/ChromaDB_Library.py
@@ -0,0 +1,225 @@
+import configparser
+import logging
+import sqlite3
+from typing import List, Dict, Any
+
+import chromadb
+import requests
+
+from App_Function_Libraries.Chunk_Lib import improved_chunking_process
+
+#######################################################################################################################
+#
+# Functions for ChromaDB

+# Get ChromaDB settings
+# Load configuration
+config = configparser.ConfigParser()
+config.read('config.txt')
+chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
+chroma_client = chromadb.PersistentClient(path=chroma_db_path)
+
+# Get embedding settings
+embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
+embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
+embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
+embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
+
+# Get chunking options
+chunk_options = {
+    'method': config.get('Chunking', 'method', fallback='words'),
+    'max_size': config.getint('Chunking', 'max_size', fallback=400),
+    'overlap': config.getint('Chunking', 'overlap', fallback=200),
+    'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
+    'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
+    'language': config.get('Chunking', 'language', fallback='english')
+}
+
+
+def auto_update_chroma_embeddings(media_id: int, content: str):
""" + Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database. + + :param media_id: The ID of the newly ingested media item + :param content: The content of the newly ingested media item + """ + collection_name = f"media_{media_id}" + + # Initialize or get the ChromaDB collection + collection = chroma_client.get_or_create_collection(name=collection_name) + + # Check if embeddings already exist for this media_id + existing_embeddings = collection.get(ids=[f"{media_id}_chunk_{i}" for i in range(len(content))]) + + if existing_embeddings and len(existing_embeddings) > 0: + logging.info(f"Embeddings already exist for media ID {media_id}, skipping...") + else: + # Process and store content if embeddings do not already exist + process_and_store_content(content, collection_name, media_id) + logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}") + + +# Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite +def process_and_store_content(content: str, collection_name: str, media_id: int): + # Process the content into chunks + chunks = improved_chunking_process(content, chunk_options) + texts = [chunk['text'] for chunk in chunks] + + # Generate embeddings for each chunk + embeddings = [create_embedding(text) for text in texts] + + # Create unique IDs for each chunk using the media_id and chunk index + ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))] + + # Store the texts, embeddings, and IDs in ChromaDB + store_in_chroma(collection_name, texts, embeddings, ids) + + # Store the chunks in SQLite FTS as well + from App_Function_Libraries.DB_Manager import db + with db.get_connection() as conn: + cursor = conn.cursor() + for text in texts: + cursor.execute("INSERT INTO media_fts (content) VALUES (?)", (text,)) + conn.commit() + + +# Function to store documents and their embeddings in ChromaDB +def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]): + collection = chroma_client.get_or_create_collection(name=collection_name) + collection.add( + documents=texts, + embeddings=embeddings, + ids=ids + ) + +# Function to perform vector search using ChromaDB +def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]: + query_embedding = create_embedding(query) + collection = chroma_client.get_collection(name=collection_name) + results = collection.query( + query_embeddings=[query_embedding], + n_results=k + ) + return results['documents'][0] + + +def create_embedding(text: str) -> List[float]: + if embedding_provider == 'openai': + import openai + openai.api_key = embedding_api_key + response = openai.Embedding.create(input=text, model=embedding_model) + return response['data'][0]['embedding'] + elif embedding_provider == 'local': + # FIXME - This is a placeholder for API calls to a local embedding model + response = requests.post( + embedding_api_url, + json={"text": text, "model": embedding_model}, + headers={"Authorization": f"Bearer {embedding_api_key}"} + ) + return response.json()['embedding'] + # FIXME - this seems correct, but idk.... 
+    elif embedding_provider == 'huggingface':
+        from transformers import AutoTokenizer, AutoModel
+        import torch
+
+        # NOTE: this reloads the tokenizer and model weights on every call;
+        # callers that embed many texts may want to cache these objects.
+        tokenizer = AutoTokenizer.from_pretrained(embedding_model)
+        model = AutoModel.from_pretrained(embedding_model)
+
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Use the mean of the last hidden state as the sentence embedding
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        return embeddings[0].tolist()  # Convert to list for consistency
+    else:
+        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
+
+
+def create_all_embeddings(api_choice: str) -> str:
+    try:
+        global embedding_provider
+        embedding_provider = api_choice
+
+        all_content = get_all_content_from_database()
+
+        if not all_content:
+            return "No content found in the database."
+
+        texts_to_embed = []
+        embeddings_to_store = []
+        ids_to_store = []
+        collection_name = "all_content_embeddings"
+
+        # Initialize or get the ChromaDB collection
+        collection = chroma_client.get_or_create_collection(name=collection_name)
+
+        for content_item in all_content:
+            media_id = content_item['id']
+            text = content_item['content']
+
+            # Check if the embedding already exists in ChromaDB. get() always
+            # returns a dict, so test its 'ids' list rather than the dict itself.
+            embedding_exists = collection.get(ids=[f"doc_{media_id}"])
+
+            if embedding_exists.get('ids'):
+                logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
+                continue  # Skip if embedding already exists
+
+            # Create the embedding
+            embedding = create_embedding(text)
+
+            # Collect the text, embedding, and ID for batch storage
+            texts_to_embed.append(text)
+            embeddings_to_store.append(embedding)
+            ids_to_store.append(f"doc_{media_id}")
+
+        # Store all new embeddings in ChromaDB
+        if texts_to_embed and embeddings_to_store:
+            store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)
+
+        return "Embeddings created and stored successfully for all new content."
+    except Exception as e:
+        logging.error(f"Error during embedding creation: {str(e)}")
+        return f"Error: {str(e)}"
+
+
+def get_all_content_from_database() -> List[Dict[str, Any]]:
+    """
+    Retrieve all media content from the database that requires embedding.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries, each containing the media ID, content, title, and other relevant fields.
+ """ + try: + from App_Function_Libraries.DB_Manager import db + with db.get_connection() as conn: + cursor = conn.cursor() + cursor.execute(""" + SELECT id, content, title, author, type + FROM Media + WHERE is_trash = 0 -- Exclude items marked as trash + """) + media_items = cursor.fetchall() + + # Convert the results into a list of dictionaries + all_content = [ + { + 'id': item[0], + 'content': item[1], + 'title': item[2], + 'author': item[3], + 'type': item[4] + } + for item in media_items + ] + + return all_content + + except sqlite3.Error as e: + logging.error(f"Error retrieving all content from database: {e}") + from App_Function_Libraries.SQLite_DB import DatabaseError + raise DatabaseError(f"Error retrieving all content from database: {e}") + +# +# End of Functions for ChromaDB +####################################################################################################################### \ No newline at end of file diff --git a/App_Function_Libraries/Chunk_Lib.py b/App_Function_Libraries/Chunk_Lib.py index 837441e4dd15ad058186eec34b474bc229a29ffa..dbb052f8e16b079e75e31ecbbdff7971eb25a188 100644 --- a/App_Function_Libraries/Chunk_Lib.py +++ b/App_Function_Libraries/Chunk_Lib.py @@ -1,583 +1,587 @@ -# Chunk_Lib.py -######################################### -# Chunking Library -# This library is used to perform chunking of input files. -# Currently, uses naive approaches. Nothing fancy. -# -#### -# Import necessary libraries -import logging -import re - -from typing import List, Optional, Tuple, Dict, Any - -from openai import OpenAI -from tqdm import tqdm -# -# Import 3rd party -from transformers import GPT2Tokenizer -import nltk -from nltk.tokenize import sent_tokenize, word_tokenize -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity -# -# Import Local -from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize -from App_Function_Libraries.Utils import load_comprehensive_config - - -# -####################################################################################################################### -# Function Definitions -# - -# FIXME - Make sure it only downloads if it already exists, and does a check first. 
-# Ensure NLTK data is downloaded -def ntlk_prep(): - nltk.download('punkt') - -# Load GPT2 tokenizer -tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - -# Load Config file for API keys -config = load_comprehensive_config() -openai_api_key = config.get('API', 'openai_api_key', fallback=None) - -def load_document(file_path): - with open(file_path, 'r') as file: - text = file.read() - return re.sub('\\s+', ' ', text).strip() - - -def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]: - chunk_method = chunk_options.get('method', 'words') - max_chunk_size = chunk_options.get('max_size', 300) - overlap = chunk_options.get('overlap', 0) - language = chunk_options.get('language', 'english') - adaptive = chunk_options.get('adaptive', False) - multi_level = chunk_options.get('multi_level', False) - - if adaptive: - max_chunk_size = adaptive_chunk_size(text, max_chunk_size) - - if multi_level: - chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language) - else: - if chunk_method == 'words': - chunks = chunk_text_by_words(text, max_chunk_size, overlap) - elif chunk_method == 'sentences': - chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language) - elif chunk_method == 'paragraphs': - chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap) - elif chunk_method == 'tokens': - chunks = chunk_text_by_tokens(text, max_chunk_size, overlap) - elif chunk_method == 'chapters': - return chunk_ebook_by_chapters(text, chunk_options) - else: - # No chunking applied - chunks = [text] - - return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks] - - -def adaptive_chunk_size(text: str, base_size: int) -> int: - # Simple adaptive logic: adjust chunk size based on text complexity - avg_word_length = sum(len(word) for word in text.split()) / len(text.split()) - if avg_word_length > 6: # Arbitrary threshold for "complex" text - return int(base_size * 0.8) # Reduce chunk size for complex text - return base_size - - -def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]: - # First level: chunk by paragraphs - paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap) - - # Second level: chunk each paragraph further - chunks = [] - for para in paragraphs: - if method == 'words': - chunks.extend(chunk_text_by_words(para, max_size, overlap)) - elif method == 'sentences': - chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language)) - else: - chunks.append(para) - - return chunks - - -def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]: - words = text.split() - chunks = [] - for i in range(0, len(words), max_words - overlap): - chunk = ' '.join(words[i:i + max_words]) - chunks.append(chunk) - return post_process_chunks(chunks) - - -def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[ - str]: - nltk.download('punkt', quiet=True) - sentences = nltk.sent_tokenize(text, language=language) - chunks = [] - for i in range(0, len(sentences), max_sentences - overlap): - chunk = ' '.join(sentences[i:i + max_sentences]) - chunks.append(chunk) - return post_process_chunks(chunks) - - -def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]: - paragraphs = re.split(r'\n\s*\n', text) - chunks = [] - for i in range(0, len(paragraphs), max_paragraphs - overlap): - chunk = '\n\n'.join(paragraphs[i:i + 
max_paragraphs]) - chunks.append(chunk) - return post_process_chunks(chunks) - - -def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]: - # This is a simplified token-based chunking. For more accurate tokenization, - # consider using a proper tokenizer like GPT-2 TokenizerFast - words = text.split() - chunks = [] - current_chunk = [] - current_token_count = 0 - - for word in words: - word_token_count = len(word) // 4 + 1 # Rough estimate of token count - if current_token_count + word_token_count > max_tokens and current_chunk: - chunks.append(' '.join(current_chunk)) - current_chunk = current_chunk[-overlap:] if overlap > 0 else [] - current_token_count = sum(len(w) // 4 + 1 for w in current_chunk) - - current_chunk.append(word) - current_token_count += word_token_count - - if current_chunk: - chunks.append(' '.join(current_chunk)) - - return post_process_chunks(chunks) - - -def post_process_chunks(chunks: List[str]) -> List[str]: - return [chunk.strip() for chunk in chunks if chunk.strip()] - - -def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]: - start_index = full_text.index(chunk) - metadata = { - 'start_index': start_index, - 'end_index': start_index + len(chunk), - 'word_count': len(chunk.split()), - 'char_count': len(chunk), - 'chunk_type': chunk_type - } - if chunk_type == "chapter": - metadata['chapter_number'] = chapter_number - metadata['chapter_pattern'] = chapter_pattern - return metadata - - -# Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number -def chunk_text_hybrid(text, max_tokens=1000): - sentences = nltk.tokenize.sent_tokenize(text) - chunks = [] - current_chunk = [] - current_length = 0 - - for sentence in sentences: - tokens = tokenizer.encode(sentence) - if current_length + len(tokens) <= max_tokens: - current_chunk.append(sentence) - current_length += len(tokens) - else: - chunks.append(' '.join(current_chunk)) - current_chunk = [sentence] - current_length = len(tokens) - - if current_chunk: - chunks.append(' '.join(current_chunk)) - - return chunks - -# Thanks openai -def chunk_on_delimiter(input_string: str, - max_tokens: int, - delimiter: str) -> List[str]: - chunks = input_string.split(delimiter) - combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum( - chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True) - if dropped_chunk_count > 0: - print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.") - combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks] - return combined_chunks - -# ????FIXME -def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None): - summarized_chunks = [] - current_summary = "" - - logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...") - logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}") - for i, chunk in enumerate(chunks): - if i == 0: - current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt) - else: - combined_text = current_summary + "\n\n" + chunk - current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt) - - summarized_chunks.append(current_summary) - - return summarized_chunks - - -# Sample text for testing -sample_text = """ -Natural language processing (NLP) is a subfield of 
linguistics, computer science, and artificial intelligence -concerned with the interactions between computers and human language, in particular how to program computers -to process and analyze large amounts of natural language data. The result is a computer capable of "understanding" -the contents of documents, including the contextual nuances of the language within them. The technology can then -accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. - -Challenges in natural language processing frequently involve speech recognition, natural language understanding, -and natural language generation. - -Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled -"Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. -""" - -# Example usage of different chunking methods -# print("Chunking by words:") -# print(chunk_text_by_words(sample_text, max_words=50)) -# -# print("\nChunking by sentences:") -# print(chunk_text_by_sentences(sample_text, max_sentences=2)) -# -# print("\nChunking by paragraphs:") -# print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1)) -# -# print("\nChunking by tokens:") -# print(chunk_text_by_tokens(sample_text, max_tokens=50)) -# -# print("\nHybrid chunking:") -# print(chunk_text_hybrid(sample_text, max_tokens=50)) - - - -####################################################################################################################### -# -# Experimental Semantic Chunking -# - -# Chunk text into segments based on semantic similarity -def count_units(text, unit='tokens'): - if unit == 'words': - return len(text.split()) - elif unit == 'tokens': - return len(word_tokenize(text)) - elif unit == 'characters': - return len(text) - else: - raise ValueError("Invalid unit. 
Choose 'words', 'tokens', or 'characters'.") - - -def semantic_chunking(text, max_chunk_size=2000, unit='words'): - nltk.download('punkt', quiet=True) - sentences = sent_tokenize(text) - vectorizer = TfidfVectorizer() - sentence_vectors = vectorizer.fit_transform(sentences) - - chunks = [] - current_chunk = [] - current_size = 0 - - for i, sentence in enumerate(sentences): - sentence_size = count_units(sentence, unit) - if current_size + sentence_size > max_chunk_size and current_chunk: - chunks.append(' '.join(current_chunk)) - overlap_size = count_units(' '.join(current_chunk[-3:]), unit) # Use last 3 sentences for overlap - current_chunk = current_chunk[-3:] # Keep last 3 sentences for overlap - current_size = overlap_size - - current_chunk.append(sentence) - current_size += sentence_size - - if i + 1 < len(sentences): - current_vector = sentence_vectors[i] - next_vector = sentence_vectors[i + 1] - similarity = cosine_similarity(current_vector, next_vector)[0][0] - if similarity < 0.5 and current_size >= max_chunk_size // 2: - chunks.append(' '.join(current_chunk)) - overlap_size = count_units(' '.join(current_chunk[-3:]), unit) - current_chunk = current_chunk[-3:] - current_size = overlap_size - - if current_chunk: - chunks.append(' '.join(current_chunk)) - - return chunks - - -def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100): - try: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - - chunks = semantic_chunking(content, max_chunk_size, overlap) - return chunks - except Exception as e: - logging.error(f"Error chunking text file: {str(e)}") - return None -####################################################################################################################### - - - - - - -####################################################################################################################### -# -# OpenAI Rolling Summarization -# - -client = OpenAI(api_key=openai_api_key) -def get_chat_completion(messages, model='gpt-4-turbo'): - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=0, - ) - return response.choices[0].message.content - - -# This function combines text chunks into larger blocks without exceeding a specified token count. -# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow. 
-def combine_chunks_with_no_minimum( - chunks: List[str], - max_tokens: int, - chunk_delimiter="\n\n", - header: Optional[str] = None, - add_ellipsis_for_overflow=False, -) -> Tuple[List[str], List[int]]: - dropped_chunk_count = 0 - output = [] # list to hold the final combined chunks - output_indices = [] # list to hold the indices of the final combined chunks - candidate = ( - [] if header is None else [header] - ) # list to hold the current combined chunk candidate - candidate_indices = [] - for chunk_i, chunk in enumerate(chunks): - chunk_with_header = [chunk] if header is None else [header, chunk] - # FIXME MAKE NOT OPENAI SPECIFIC - if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens: - print(f"warning: chunk overflow") - if ( - add_ellipsis_for_overflow - # FIXME MAKE NOT OPENAI SPECIFIC - and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens - ): - candidate.append("...") - dropped_chunk_count += 1 - continue # this case would break downstream assumptions - # estimate token count with the current chunk added - # FIXME MAKE NOT OPENAI SPECIFIC - extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk]))) - # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate - if extended_candidate_token_count > max_tokens: - output.append(chunk_delimiter.join(candidate)) - output_indices.append(candidate_indices) - candidate = chunk_with_header # re-initialize candidate - candidate_indices = [chunk_i] - # otherwise keep extending the candidate - else: - candidate.append(chunk) - candidate_indices.append(chunk_i) - # add the remaining candidate to output if it's not empty - if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0): - output.append(chunk_delimiter.join(candidate)) - output_indices.append(candidate_indices) - return output, output_indices, dropped_chunk_count - - -def rolling_summarize(text: str, - detail: float = 0, - model: str = 'gpt-4-turbo', - additional_instructions: Optional[str] = None, - minimum_chunk_size: Optional[int] = 500, - chunk_delimiter: str = ".", - summarize_recursively=False, - verbose=False): - """ - Summarizes a given text by splitting it into chunks, each of which is summarized individually. - The level of detail in the summary can be adjusted, and the process can optionally be made recursive. - - Parameters: - - text (str): The text to be summarized. - - detail (float, optional): A value between 0 and 1 - indicating the desired level of detail in the summary. 0 leads to a higher level summary, and 1 results in a more - detailed summary. Defaults to 0. - - additional_instructions (Optional[str], optional): Additional instructions to provide to the - model for customizing summaries. - minimum_chunk_size (Optional[int], optional): The minimum size for text - chunks. Defaults to 500. - - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".". - - summarize_recursively (bool, optional): If True, summaries are generated recursively, using previous summaries for context. - - verbose (bool, optional): If True, prints detailed information about the chunking process. - Returns: - - str: The final compiled summary of the text. - - The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count - based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. 
If - `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the - summarization process. The function returns a compiled summary of all chunks. - """ - - # check detail is set correctly - assert 0 <= detail <= 1 - - # interpolate the number of chunks based to get specified level of detail - max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter)) - min_chunks = 1 - num_chunks = int(min_chunks + detail * (max_chunks - min_chunks)) - - # adjust chunk_size based on interpolated number of chunks - # FIXME MAKE NOT OPENAI SPECIFIC - document_length = len(openai_tokenize(text)) - chunk_size = max(minimum_chunk_size, document_length // num_chunks) - text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter) - if verbose: - print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.") - # FIXME MAKE NOT OPENAI SPECIFIC - print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}") - - # set system message - FIXME - system_message_content = "Rewrite this text in summarized form." - if additional_instructions is not None: - system_message_content += f"\n\n{additional_instructions}" - - accumulated_summaries = [] - for i, chunk in enumerate(tqdm(text_chunks)): - if summarize_recursively and accumulated_summaries: - # Combine previous summary with current chunk for recursive summarization - combined_text = accumulated_summaries[-1] + "\n\n" + chunk - user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}" - else: - user_message_content = chunk - - messages = [ - {"role": "system", "content": system_message_content}, - {"role": "user", "content": user_message_content} - ] - - response = get_chat_completion(messages, model=model) - accumulated_summaries.append(response) - - final_summary = '\n\n'.join(accumulated_summaries) - return final_summary - -# -# -####################################################################################################################### -# -# Ebook Chapter Chunking - - -def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]: - max_chunk_size = chunk_options.get('max_size', 300) - overlap = chunk_options.get('overlap', 0) - custom_pattern = chunk_options.get('custom_chapter_pattern', None) - - # List of chapter heading patterns to try, in order - chapter_patterns = [ - custom_pattern, - r'^#{1,2}\s+', # Markdown style: '# ' or '## ' - r'^Chapter\s+\d+', # 'Chapter ' followed by numbers - r'^\d+\.\s+', # Numbered chapters: '1. ', '2. ', etc. 
- r'^[A-Z\s]+$' # All caps headings - ] - - chapter_positions = [] - used_pattern = None - - for pattern in chapter_patterns: - if pattern is None: - continue - chapter_regex = re.compile(pattern, re.MULTILINE | re.IGNORECASE) - chapter_positions = [match.start() for match in chapter_regex.finditer(text)] - if chapter_positions: - used_pattern = pattern - break - - # If no chapters found, return the entire content as one chunk - if not chapter_positions: - return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}] - - # Split content into chapters - chunks = [] - for i in range(len(chapter_positions)): - start = chapter_positions[i] - end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None - chapter = text[start:end] - - # Apply overlap if specified - if overlap > 0 and i > 0: - overlap_start = max(0, start - overlap) - chapter = text[overlap_start:end] - - chunks.append(chapter) - - # Post-process chunks - processed_chunks = post_process_chunks(chunks) - - # Add metadata to chunks - return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1, - chapter_pattern=used_pattern)} - for i, chunk in enumerate(processed_chunks)] - - -# # Example usage -# if __name__ == "__main__": -# sample_ebook_content = """ -# # Chapter 1: Introduction -# -# This is the introduction. -# -# ## Section 1.1 -# -# Some content here. -# -# # Chapter 2: Main Content -# -# This is the main content. -# -# ## Section 2.1 -# -# More content here. -# -# CHAPTER THREE -# -# This is the third chapter. -# -# 4. Fourth Chapter -# -# This is the fourth chapter. -# """ -# -# chunk_options = { -# 'method': 'chapters', -# 'max_size': 500, -# 'overlap': 50, -# 'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+' # Custom pattern for 'CHAPTER THREE' style -# } -# -# chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options) -# -# for i, chunk in enumerate(chunked_chapters, 1): -# print(f"Chunk {i}:") -# print(chunk['text']) -# print(f"Metadata: {chunk['metadata']}\n") - - - - -# -# End of Chunking Library +# Chunk_Lib.py +######################################### +# Chunking Library +# This library is used to perform chunking of input files. +# Currently, uses naive approaches. Nothing fancy. +# +#### +# Import necessary libraries +import logging +import re + +from typing import List, Optional, Tuple, Dict, Any + +from openai import OpenAI +from tqdm import tqdm +# +# Import 3rd party +from transformers import GPT2Tokenizer +import nltk +from nltk.tokenize import sent_tokenize, word_tokenize +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +# +# Import Local +from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize +from App_Function_Libraries.Utils import load_comprehensive_config + + +# +####################################################################################################################### +# Function Definitions +# + +# FIXME - Make sure it only downloads if it already exists, and does a check first. 
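+# A minimal sketch of that guard, in the spirit of the commented examples further
+# below (the helper name ensure_punkt is illustrative, not existing API): probe for
+# the tokenizer data with nltk.data.find() and download only when the lookup fails.
+#
+# def ensure_punkt():
+#     try:
+#         nltk.data.find('tokenizers/punkt')
+#     except LookupError:
+#         nltk.download('punkt', quiet=True)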
+
+# Ensure NLTK data is downloaded
+def nltk_prep():
+    nltk.download('punkt')
+
+# Load GPT2 tokenizer
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+
+# Load Config file for API keys
+config = load_comprehensive_config()
+openai_api_key = config.get('API', 'openai_api_key', fallback=None)
+
+def load_document(file_path):
+    with open(file_path, 'r') as file:
+        text = file.read()
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
+    chunk_method = chunk_options.get('method', 'words')
+    max_chunk_size = chunk_options.get('max_size', 300)
+    overlap = chunk_options.get('overlap', 0)
+    language = chunk_options.get('language', 'english')
+    adaptive = chunk_options.get('adaptive', False)
+    multi_level = chunk_options.get('multi_level', False)
+
+    if adaptive:
+        max_chunk_size = adaptive_chunk_size(text, max_chunk_size)
+
+    if multi_level:
+        chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
+    else:
+        if chunk_method == 'words':
+            chunks = chunk_text_by_words(text, max_chunk_size, overlap)
+        elif chunk_method == 'sentences':
+            chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language)
+        elif chunk_method == 'paragraphs':
+            chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap)
+        elif chunk_method == 'tokens':
+            chunks = chunk_text_by_tokens(text, max_chunk_size, overlap)
+        elif chunk_method == 'chapters':
+            return chunk_ebook_by_chapters(text, chunk_options)
+        else:
+            # No chunking applied
+            chunks = [text]
+
+    return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks]
+
+
+def adaptive_chunk_size(text: str, base_size: int) -> int:
+    # Simple adaptive logic: adjust chunk size based on text complexity
+    words = text.split()
+    if not words:
+        return base_size  # Avoid division by zero on empty input
+    avg_word_length = sum(len(word) for word in words) / len(words)
+    if avg_word_length > 6:  # Arbitrary threshold for "complex" text
+        return int(base_size * 0.8)  # Reduce chunk size for complex text
+    return base_size
+
+
+def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
+    # First level: chunk by paragraphs
+    paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
+
+    # Second level: chunk each paragraph further
+    chunks = []
+    for para in paragraphs:
+        if method == 'words':
+            chunks.extend(chunk_text_by_words(para, max_size, overlap))
+        elif method == 'sentences':
+            chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
+        else:
+            chunks.append(para)
+
+    return chunks
+
+
+def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
+    words = text.split()
+    step = max(1, max_words - overlap)  # Guard against a zero or negative step when overlap >= max_words
+    chunks = []
+    for i in range(0, len(words), step):
+        chunk = ' '.join(words[i:i + max_words])
+        chunks.append(chunk)
+    return post_process_chunks(chunks)
+
+
+def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[str]:
+    nltk.download('punkt', quiet=True)
+    sentences = nltk.sent_tokenize(text, language=language)
+    step = max(1, max_sentences - overlap)  # Guard against a zero or negative step
+    chunks = []
+    for i in range(0, len(sentences), step):
+        chunk = ' '.join(sentences[i:i + max_sentences])
+        chunks.append(chunk)
+    return post_process_chunks(chunks)
+
+
+def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
+    paragraphs = re.split(r'\n\s*\n', text)
+    step = max(1, max_paragraphs - overlap)  # Guard against a zero or negative step
+    chunks = []
+    for i in range(0, len(paragraphs), step):
+        chunk = '\n\n'.join(paragraphs[i:i + 
max_paragraphs]) + chunks.append(chunk) + return post_process_chunks(chunks) + + +def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]: + # This is a simplified token-based chunking. For more accurate tokenization, + # consider using a proper tokenizer like GPT-2 TokenizerFast + words = text.split() + chunks = [] + current_chunk = [] + current_token_count = 0 + + for word in words: + word_token_count = len(word) // 4 + 1 # Rough estimate of token count + if current_token_count + word_token_count > max_tokens and current_chunk: + chunks.append(' '.join(current_chunk)) + current_chunk = current_chunk[-overlap:] if overlap > 0 else [] + current_token_count = sum(len(w) // 4 + 1 for w in current_chunk) + + current_chunk.append(word) + current_token_count += word_token_count + + if current_chunk: + chunks.append(' '.join(current_chunk)) + + return post_process_chunks(chunks) + + +def post_process_chunks(chunks: List[str]) -> List[str]: + return [chunk.strip() for chunk in chunks if chunk.strip()] + + +def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]: + try: + start_index = full_text.index(chunk) + metadata = { + 'start_index': start_index, + 'end_index': start_index + len(chunk), + 'word_count': len(chunk.split()), + 'char_count': len(chunk), + 'chunk_type': chunk_type + } + if chunk_type == "chapter": + metadata['chapter_number'] = chapter_number + metadata['chapter_pattern'] = chapter_pattern + return metadata + except ValueError as e: + logging.error(f"Chunk not found in full_text: {chunk[:50]}... Full text length: {len(full_text)}") + raise + + +# Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number +def chunk_text_hybrid(text, max_tokens=1000): + sentences = nltk.tokenize.sent_tokenize(text) + chunks = [] + current_chunk = [] + current_length = 0 + + for sentence in sentences: + tokens = tokenizer.encode(sentence) + if current_length + len(tokens) <= max_tokens: + current_chunk.append(sentence) + current_length += len(tokens) + else: + chunks.append(' '.join(current_chunk)) + current_chunk = [sentence] + current_length = len(tokens) + + if current_chunk: + chunks.append(' '.join(current_chunk)) + + return chunks + +# Thanks openai +def chunk_on_delimiter(input_string: str, + max_tokens: int, + delimiter: str) -> List[str]: + chunks = input_string.split(delimiter) + combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum( + chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True) + if dropped_chunk_count > 0: + print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.") + combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks] + return combined_chunks + +# ????FIXME +def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None): + summarized_chunks = [] + current_summary = "" + + logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...") + logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}") + for i, chunk in enumerate(chunks): + if i == 0: + current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt) + else: + combined_text = current_summary + "\n\n" + chunk + current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt) + + summarized_chunks.append(current_summary) 
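+        # Each pass folds the previous summary into the next chunk, so entry i of
+        # summarized_chunks condenses chunks 0..i and the final entry covers all input.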
+ + return summarized_chunks + + +# Sample text for testing +sample_text = """ +Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence +concerned with the interactions between computers and human language, in particular how to program computers +to process and analyze large amounts of natural language data. The result is a computer capable of "understanding" +the contents of documents, including the contextual nuances of the language within them. The technology can then +accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. + +Challenges in natural language processing frequently involve speech recognition, natural language understanding, +and natural language generation. + +Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled +"Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. +""" + +# Example usage of different chunking methods +# print("Chunking by words:") +# print(chunk_text_by_words(sample_text, max_words=50)) +# +# print("\nChunking by sentences:") +# print(chunk_text_by_sentences(sample_text, max_sentences=2)) +# +# print("\nChunking by paragraphs:") +# print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1)) +# +# print("\nChunking by tokens:") +# print(chunk_text_by_tokens(sample_text, max_tokens=50)) +# +# print("\nHybrid chunking:") +# print(chunk_text_hybrid(sample_text, max_tokens=50)) + + + +####################################################################################################################### +# +# Experimental Semantic Chunking +# + +# Chunk text into segments based on semantic similarity +def count_units(text, unit='tokens'): + if unit == 'words': + return len(text.split()) + elif unit == 'tokens': + return len(word_tokenize(text)) + elif unit == 'characters': + return len(text) + else: + raise ValueError("Invalid unit. 
Choose 'words', 'tokens', or 'characters'.")
+
+
+def semantic_chunking(text, max_chunk_size=2000, unit='words'):
+    nltk.download('punkt', quiet=True)
+    sentences = sent_tokenize(text)
+    vectorizer = TfidfVectorizer()
+    sentence_vectors = vectorizer.fit_transform(sentences)
+
+    chunks = []
+    current_chunk = []
+    current_size = 0
+
+    for i, sentence in enumerate(sentences):
+        sentence_size = count_units(sentence, unit)
+        if current_size + sentence_size > max_chunk_size and current_chunk:
+            chunks.append(' '.join(current_chunk))
+            overlap_size = count_units(' '.join(current_chunk[-3:]), unit)  # Use last 3 sentences for overlap
+            current_chunk = current_chunk[-3:]  # Keep last 3 sentences for overlap
+            current_size = overlap_size
+
+        current_chunk.append(sentence)
+        current_size += sentence_size
+
+        if i + 1 < len(sentences):
+            current_vector = sentence_vectors[i]
+            next_vector = sentence_vectors[i + 1]
+            similarity = cosine_similarity(current_vector, next_vector)[0][0]
+            if similarity < 0.5 and current_size >= max_chunk_size // 2:
+                chunks.append(' '.join(current_chunk))
+                overlap_size = count_units(' '.join(current_chunk[-3:]), unit)
+                current_chunk = current_chunk[-3:]
+                current_size = overlap_size
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+
+def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
+    # NOTE: semantic_chunking() manages its own overlap by carrying the last three
+    # sentences between chunks, so `overlap` is accepted for API compatibility but
+    # is not forwarded (it must not be passed as semantic_chunking's `unit` argument).
+    try:
+        with open(file_path, 'r', encoding='utf-8') as file:
+            content = file.read()
+
+        chunks = semantic_chunking(content, max_chunk_size)
+        return chunks
+    except Exception as e:
+        logging.error(f"Error chunking text file: {str(e)}")
+        return None
+#######################################################################################################################
+
+
+
+
+
+
+#######################################################################################################################
+#
+# OpenAI Rolling Summarization
+#
+
+client = OpenAI(api_key=openai_api_key)
+def get_chat_completion(messages, model='gpt-4-turbo'):
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        temperature=0,
+    )
+    return response.choices[0].message.content
+
+
+# This function combines text chunks into larger blocks without exceeding a specified token count.
+# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
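+#
+# A small usage sketch (the inputs and token budget are illustrative, not from the
+# original code); callers normally reach it through chunk_on_delimiter() above:
+#
+#     pieces, piece_indices, dropped = combine_chunks_with_no_minimum(
+#         ["First sentence", "Second sentence", "Third sentence"],
+#         max_tokens=512,
+#         chunk_delimiter=". ",
+#     )
+#     # pieces: the combined strings; piece_indices: which input chunks landed in
+#     # each piece; dropped: how many single chunks alone exceeded max_tokens.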
+
+def combine_chunks_with_no_minimum(
+    chunks: List[str],
+    max_tokens: int,
+    chunk_delimiter="\n\n",
+    header: Optional[str] = None,
+    add_ellipsis_for_overflow=False,
+) -> Tuple[List[str], List[List[int]], int]:
+    dropped_chunk_count = 0
+    output = []  # list to hold the final combined chunks
+    output_indices = []  # list to hold the indices of the final combined chunks
+    candidate = (
+        [] if header is None else [header]
+    )  # list to hold the current combined chunk candidate
+    candidate_indices = []
+    for chunk_i, chunk in enumerate(chunks):
+        chunk_with_header = [chunk] if header is None else [header, chunk]
+        # FIXME MAKE NOT OPENAI SPECIFIC
+        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
+            print("Warning: chunk overflow")
+            if (
+                add_ellipsis_for_overflow
+                # FIXME MAKE NOT OPENAI SPECIFIC
+                and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
+            ):
+                candidate.append("...")
+            dropped_chunk_count += 1
+            continue  # this case would break downstream assumptions
+        # estimate token count with the current chunk added
+        # FIXME MAKE NOT OPENAI SPECIFIC
+        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
+        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
+        if extended_candidate_token_count > max_tokens:
+            output.append(chunk_delimiter.join(candidate))
+            output_indices.append(candidate_indices)
+            candidate = chunk_with_header  # re-initialize candidate
+            candidate_indices = [chunk_i]
+        # otherwise keep extending the candidate
+        else:
+            candidate.append(chunk)
+            candidate_indices.append(chunk_i)
+    # add the remaining candidate to output if it's not empty
+    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
+        output.append(chunk_delimiter.join(candidate))
+        output_indices.append(candidate_indices)
+    return output, output_indices, dropped_chunk_count
+
+
+def rolling_summarize(text: str,
+                      detail: float = 0,
+                      model: str = 'gpt-4-turbo',
+                      additional_instructions: Optional[str] = None,
+                      minimum_chunk_size: Optional[int] = 500,
+                      chunk_delimiter: str = ".",
+                      summarize_recursively=False,
+                      verbose=False):
+    """
+    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
+    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
+
+    Parameters:
+    - text (str): The text to be summarized.
+    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
+      0 leads to a higher-level summary, and 1 results in a more detailed summary. Defaults to 0.
+    - additional_instructions (Optional[str], optional): Additional instructions to provide to the model for
+      customizing summaries.
+    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
+    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
+    - summarize_recursively (bool, optional): If True, summaries are generated recursively, using previous summaries for context.
+    - verbose (bool, optional): If True, prints detailed information about the chunking process.
+
+    Returns:
+    - str: The final compiled summary of the text.
+
+    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
+    based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk.
If + `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the + summarization process. The function returns a compiled summary of all chunks. + """ + + # check detail is set correctly + assert 0 <= detail <= 1 + + # interpolate the number of chunks based to get specified level of detail + max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter)) + min_chunks = 1 + num_chunks = int(min_chunks + detail * (max_chunks - min_chunks)) + + # adjust chunk_size based on interpolated number of chunks + # FIXME MAKE NOT OPENAI SPECIFIC + document_length = len(openai_tokenize(text)) + chunk_size = max(minimum_chunk_size, document_length // num_chunks) + text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter) + if verbose: + print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.") + # FIXME MAKE NOT OPENAI SPECIFIC + print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}") + + # set system message - FIXME + system_message_content = "Rewrite this text in summarized form." + if additional_instructions is not None: + system_message_content += f"\n\n{additional_instructions}" + + accumulated_summaries = [] + for i, chunk in enumerate(tqdm(text_chunks)): + if summarize_recursively and accumulated_summaries: + # Combine previous summary with current chunk for recursive summarization + combined_text = accumulated_summaries[-1] + "\n\n" + chunk + user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}" + else: + user_message_content = chunk + + messages = [ + {"role": "system", "content": system_message_content}, + {"role": "user", "content": user_message_content} + ] + + response = get_chat_completion(messages, model=model) + accumulated_summaries.append(response) + + final_summary = '\n\n'.join(accumulated_summaries) + return final_summary + +# +# +####################################################################################################################### +# +# Ebook Chapter Chunking + + +def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]: + max_chunk_size = chunk_options.get('max_size', 300) + overlap = chunk_options.get('overlap', 0) + custom_pattern = chunk_options.get('custom_chapter_pattern', None) + + # List of chapter heading patterns to try, in order + chapter_patterns = [ + custom_pattern, + r'^#{1,2}\s+', # Markdown style: '# ' or '## ' + r'^Chapter\s+\d+', # 'Chapter ' followed by numbers + r'^\d+\.\s+', # Numbered chapters: '1. ', '2. ', etc. 
+ r'^[A-Z\s]+$' # All caps headings + ] + + chapter_positions = [] + used_pattern = None + + for pattern in chapter_patterns: + if pattern is None: + continue + chapter_regex = re.compile(pattern, re.MULTILINE | re.IGNORECASE) + chapter_positions = [match.start() for match in chapter_regex.finditer(text)] + if chapter_positions: + used_pattern = pattern + break + + # If no chapters found, return the entire content as one chunk + if not chapter_positions: + return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}] + + # Split content into chapters + chunks = [] + for i in range(len(chapter_positions)): + start = chapter_positions[i] + end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None + chapter = text[start:end] + + # Apply overlap if specified + if overlap > 0 and i > 0: + overlap_start = max(0, start - overlap) + chapter = text[overlap_start:end] + + chunks.append(chapter) + + # Post-process chunks + processed_chunks = post_process_chunks(chunks) + + # Add metadata to chunks + return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1, + chapter_pattern=used_pattern)} + for i, chunk in enumerate(processed_chunks)] + + +# # Example usage +# if __name__ == "__main__": +# sample_ebook_content = """ +# # Chapter 1: Introduction +# +# This is the introduction. +# +# ## Section 1.1 +# +# Some content here. +# +# # Chapter 2: Main Content +# +# This is the main content. +# +# ## Section 2.1 +# +# More content here. +# +# CHAPTER THREE +# +# This is the third chapter. +# +# 4. Fourth Chapter +# +# This is the fourth chapter. +# """ +# +# chunk_options = { +# 'method': 'chapters', +# 'max_size': 500, +# 'overlap': 50, +# 'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+' # Custom pattern for 'CHAPTER THREE' style +# } +# +# chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options) +# +# for i, chunk in enumerate(chunked_chapters, 1): +# print(f"Chunk {i}:") +# print(chunk['text']) +# print(f"Metadata: {chunk['metadata']}\n") + + + + +# +# End of Chunking Library ####################################################################################################################### \ No newline at end of file diff --git a/App_Function_Libraries/DB_Manager.py b/App_Function_Libraries/DB_Manager.py new file mode 100644 index 0000000000000000000000000000000000000000..b31a476f8b15655fc7556cd32408d1687371524d --- /dev/null +++ b/App_Function_Libraries/DB_Manager.py @@ -0,0 +1,472 @@ +import configparser +import logging +import os +from contextlib import contextmanager +from time import sleep +from typing import Tuple +import sqlite3 +# 3rd-Party Libraries +from elasticsearch import Elasticsearch + +############################################################################################################ +# +# This file contains the DatabaseManager class, which is responsible for managing the database connection, i.e. either SQLite or Elasticsearch. 
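+#
+# Backend selection is driven by config.txt (read in get_db_config() below). A
+# sample [Database] section might look like this; the values shown are illustrative
+# defaults, not required settings:
+#
+#   [Database]
+#   type = sqlite
+#   sqlite_path = media_summary.db
+#   elasticsearch_host = localhost
+#   elasticsearch_port = 9200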
+ +#### +# The DatabaseManager class provides the following methods: +# - add_media: Add a new media item to the database +# - fetch_items_by_keyword: Fetch media items from the database based on a keyword +# - fetch_item_details: Fetch details of a specific media item from the database +# - update_media_content: Update the content of a specific media item in the database +# - search_and_display_items: Search for media items in the database and display the results +# - close_connection: Close the database connection +#### + +# Import your existing SQLite functions +from SQLite_DB import ( + update_media_content as sqlite_update_media_content, + list_prompts as sqlite_list_prompts, + search_and_display as sqlite_search_and_display, + fetch_prompt_details as sqlite_fetch_prompt_details, + keywords_browser_interface as sqlite_keywords_browser_interface, + add_keyword as sqlite_add_keyword, + delete_keyword as sqlite_delete_keyword, + export_keywords_to_csv as sqlite_export_keywords_to_csv, + ingest_article_to_db as sqlite_ingest_article_to_db, + add_media_to_database as sqlite_add_media_to_database, + import_obsidian_note_to_db as sqlite_import_obsidian_note_to_db, + add_prompt as sqlite_add_prompt, + delete_chat_message as sqlite_delete_chat_message, + update_chat_message as sqlite_update_chat_message, + add_chat_message as sqlite_add_chat_message, + get_chat_messages as sqlite_get_chat_messages, + search_chat_conversations as sqlite_search_chat_conversations, + create_chat_conversation as sqlite_create_chat_conversation, + save_chat_history_to_database as sqlite_save_chat_history_to_database, + view_database as sqlite_view_database, + get_transcripts as sqlite_get_transcripts, + get_trashed_items as sqlite_get_trashed_items, + user_delete_item as sqlite_user_delete_item, + empty_trash as sqlite_empty_trash, + create_automated_backup as sqlite_create_automated_backup, + add_or_update_prompt as sqlite_add_or_update_prompt, + load_prompt_details as sqlite_load_prompt_details, + load_preset_prompts as sqlite_load_preset_prompts, + insert_prompt_to_db as sqlite_insert_prompt_to_db, + delete_prompt as sqlite_delete_prompt, + search_and_display_items as sqlite_search_and_display_items, + get_conversation_name as sqlite_get_conversation_name, + add_media_with_keywords as sqlite_add_media_with_keywords, + check_media_and_whisper_model as sqlite_check_media_and_whisper_model, + DatabaseError +) + +class Database: + def __init__(self, db_path=None): + self.db_path = db_path or os.getenv('DB_NAME', 'media_summary.db') + self.pool = [] + self.pool_size = 10 + + @contextmanager + def get_connection(self): + retry_count = 5 + retry_delay = 1 + conn = None + while retry_count > 0: + try: + conn = self.pool.pop() if self.pool else sqlite3.connect(self.db_path, check_same_thread=False) + yield conn + self.pool.append(conn) + return + except sqlite3.OperationalError as e: + if 'database is locked' in str(e): + logging.warning(f"Database is locked, retrying in {retry_delay} seconds...") + retry_count -= 1 + sleep(retry_delay) + else: + raise DatabaseError(f"Database error: {e}") + except Exception as e: + raise DatabaseError(f"Unexpected error: {e}") + finally: + # Ensure the connection is returned to the pool even on failure + if conn and conn not in self.pool: + self.pool.append(conn) + raise DatabaseError("Database is locked and retries have been exhausted") + + def execute_query(self, query: str, params: Tuple = ()) -> None: + with self.get_connection() as conn: + try: + cursor = conn.cursor() + 
cursor.execute(query, params) + conn.commit() + except sqlite3.Error as e: + raise DatabaseError(f"Database error: {e}, Query: {query}") + + def close_all_connections(self): + for conn in self.pool: + conn.close() + self.pool.clear() + +def get_db_config(): + config = configparser.ConfigParser() + config.read('config.txt') + return { + 'type': config['Database']['type'], + 'sqlite_path': config.get('Database', 'sqlite_path', fallback='media_summary.db'), + 'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'), + 'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200) + } + +db_config = get_db_config() +db_type = db_config['type'] + +if db_type == 'sqlite': + # Use the config path if provided, otherwise fall back to default + db = Database(db_config.get('sqlite_path')) +elif db_type == 'elasticsearch': + es = Elasticsearch([{ + 'host': db_config['elasticsearch_host'], + 'port': db_config['elasticsearch_port'] + }]) +else: + raise ValueError(f"Unsupported database type: {db_type}") + +db_path = db_config['sqlite_path'] + +# Update this path to the directory where you want to store the database backups +backup_dir = os.environ.get('DB_BACKUP_DIR', 'path/to/backup/directory') + + + + +if db_type == 'sqlite': + conn = sqlite3.connect(db_config['sqlite_path']) + cursor = conn.cursor() +elif db_type == 'elasticsearch': + es = Elasticsearch([{ + 'host': db_config['elasticsearch_host'], + 'port': db_config['elasticsearch_port'] + }]) +else: + raise ValueError(f"Unsupported database type: {db_type}") + +############################################################################################################ +# +# DB-Searching functions + +def view_database(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_view_database(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def search_and_display_items(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_search_and_display_items(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def search_and_display(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_search_and_display(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of DB-Searching functions +############################################################################################################ + +############################################################################################################ +# +# Transcript-related Functions + +def get_transcripts(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_get_transcripts(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of Transcript-related Functions +############################################################################################################ + +############################################################################################################ +# +# DB-Ingestion functions + +def add_media_to_database(*args, **kwargs): + if db_type == 'sqlite': + 
return sqlite_add_media_to_database(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + + +def import_obsidian_note_to_db(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_import_obsidian_note_to_db(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def update_media_content(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_update_media_content(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def add_media_with_keywords(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_add_media_with_keywords(*args, **kwargs) + elif db_type == 'elasticsearch': + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def check_media_and_whisper_model(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_check_media_and_whisper_model(*args, **kwargs) + elif db_type == 'elasticsearch': + raise NotImplementedError("Elasticsearch version of check_media_and_whisper_model not yet implemented") + +def ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt): + if db_type == 'sqlite': + return sqlite_ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of ingest_article_to_db not yet implemented") + else: + raise ValueError(f"Unsupported database type: {db_type}") + +# +# End of DB-Ingestion functions +############################################################################################################ + + +############################################################################################################ +# +# Prompt-related functions + +def list_prompts(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_list_prompts(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + + +def fetch_prompt_details(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_fetch_prompt_details(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def add_prompt(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_add_prompt(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + + +def add_or_update_prompt(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_add_or_update_prompt(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def load_prompt_details(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_load_prompt_details(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise 
NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def load_preset_prompts(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_load_preset_prompts(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def insert_prompt_to_db(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_insert_prompt_to_db(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def delete_prompt(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_delete_prompt(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of Prompt-related functions +############################################################################################################ + +############################################################################################################ +# +# Keywords-related Functions + +def keywords_browser_interface(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_keywords_browser_interface(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def add_keyword(*args, **kwargs): + if db_type == 'sqlite': + with db.get_connection() as conn: + cursor = conn.cursor() + return sqlite_add_keyword(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def delete_keyword(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_delete_keyword(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def export_keywords_to_csv(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_export_keywords_to_csv(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of Keywords-related Functions +############################################################################################################ + +############################################################################################################ +# +# Chat-related Functions + +def delete_chat_message(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_delete_chat_message(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def update_chat_message(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_update_chat_message(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def add_chat_message(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_add_chat_message(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement 
Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def get_chat_messages(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_get_chat_messages(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def search_chat_conversations(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_search_chat_conversations(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def create_chat_conversation(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_create_chat_conversation(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def save_chat_history_to_database(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_save_chat_history_to_database(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def get_conversation_name(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_get_conversation_name(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of Chat-related Functions +############################################################################################################ + +############################################################################################################ +# +# Trash-related Functions + +def get_trashed_items(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_get_trashed_items(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def user_delete_item(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_user_delete_item(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +def empty_trash(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_empty_trash(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of Trash-related Functions +############################################################################################################ + +############################################################################################################ +# +# DB-Backup Functions + +def create_automated_backup(*args, **kwargs): + if db_type == 'sqlite': + return sqlite_create_automated_backup(*args, **kwargs) + elif db_type == 'elasticsearch': + # Implement Elasticsearch version + raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented") + +# +# End of DB-Backup Functions +############################################################################################################ + 
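+############################################################################################################
+#
+# Example usage (sketch)
+#
+# Callers import the wrapper functions from this module and stay agnostic of the
+# configured backend; with an Elasticsearch backend most wrappers currently raise
+# NotImplementedError. The argument values below are illustrative only:
+#
+#   from App_Function_Libraries.DB_Manager import ingest_article_to_db, close_connection
+#
+#   try:
+#       ingest_article_to_db(
+#           url="https://example.com/post",
+#           title="Example Article",
+#           author="Unknown",
+#           content="Article body text...",
+#           keywords="example,demo",
+#           summary="One-line summary of the article.",
+#           ingestion_date="2024-01-01",
+#           custom_prompt="Summarize the key points.",
+#       )
+#   finally:
+#       close_connection()
+#
+# End of Example usage
+############################################################################################################
+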
+############################################################################################################ +# +# Function to close the database connection for SQLite + +def close_connection(): + if db_type == 'sqlite': + db.close_all_connections() + # Elasticsearch doesn't need explicit closing + +# +# End of file +############################################################################################################ diff --git a/App_Function_Libraries/Diarization_Lib.py b/App_Function_Libraries/Diarization_Lib.py index d4dc035c06123adf5dcc220bca09ff34da6284cd..6b665934c9325f799de0c285d2dcbbc37e18edd2 100644 --- a/App_Function_Libraries/Diarization_Lib.py +++ b/App_Function_Libraries/Diarization_Lib.py @@ -33,7 +33,7 @@ import yaml def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization: path_to_config = Path(path_to_config).resolve() - print(f"Loading pyannote pipeline from {path_to_config}...") + logging.debug(f"Loading pyannote pipeline from {path_to_config}...") if not path_to_config.exists(): raise FileNotFoundError(f"Config file not found: {path_to_config}") @@ -45,11 +45,6 @@ def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarizat # Store current working directory cwd = Path.cwd().resolve() - # Change to the directory containing the config file - cd_to = path_to_config.parent.resolve() - print(f"Changing working directory to {cd_to}") - os.chdir(cd_to) - try: # Create a SpeakerDiarization pipeline pipeline = SpeakerDiarization() diff --git a/App_Function_Libraries/Gradio_Related.py b/App_Function_Libraries/Gradio_Related.py index 04d34b7e113a525fc222d5a26adcdfab20e189a1..c384581439dbb2a55bdb739bf0e4bb2bf141a465 100644 --- a/App_Function_Libraries/Gradio_Related.py +++ b/App_Function_Libraries/Gradio_Related.py @@ -1,5448 +1,183 @@ # Gradio_Related.py ######################################### # Gradio UI Functions Library -# This library is used to hold all UI-related functions for Gradio. # I fucking hate Gradio. 
# -##### -# Functions: -# -# download_audio_file(url, save_path) -# process_audio( -# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None) -# -# -######################################### -# -# Built-In Imports -import glob -import html -import math -import re -import shutil -import tempfile -import uuid -import zipfile -from datetime import datetime -import json -import logging -import os.path -from pathlib import Path -import sqlite3 -from time import sleep -from typing import Dict, List, Tuple, Optional -import traceback -from functools import wraps - -# -# Import 3rd-Party Libraries -import pypandoc -import yt_dlp -import gradio as gr -# -# Local Imports -from App_Function_Libraries.Article_Summarization_Lib import scrape_and_summarize_multiple -from App_Function_Libraries.Audio_Files import process_audio_files, process_podcast, download_youtube_audio -from App_Function_Libraries.Chunk_Lib import improved_chunking_process -from App_Function_Libraries.PDF_Ingestion_Lib import process_and_cleanup_pdf, extract_text_and_format_from_pdf, \ - extract_metadata_from_pdf -from App_Function_Libraries.Local_LLM_Inference_Engine_Lib import local_llm_gui_function -from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \ - summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm, \ - summarize_with_ollama -from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai, summarize_with_cohere, \ - summarize_with_anthropic, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, \ - summarize_with_huggingface, perform_summarization, save_transcription_and_summary, \ - perform_transcription, summarize_chunk -from App_Function_Libraries.SQLite_DB import update_media_content, list_prompts, search_and_display, db, DatabaseError, \ - fetch_prompt_details, keywords_browser_interface, add_keyword, delete_keyword, \ - export_keywords_to_csv, add_media_to_database, import_obsidian_note_to_db, add_prompt, \ - delete_chat_message, update_chat_message, add_chat_message, get_chat_messages, search_chat_conversations, \ - create_chat_conversation, save_chat_history_to_database, view_database, get_transcripts, get_trashed_items, \ - user_delete_item, empty_trash, create_automated_backup, backup_dir, db_path, add_or_update_prompt, \ - load_prompt_details, load_preset_prompts, insert_prompt_to_db, delete_prompt, search_and_display_items, \ - get_conversation_name -from App_Function_Libraries.Utils import sanitize_filename, extract_text_from_segments, create_download_directory, \ - convert_to_seconds, load_comprehensive_config, safe_read_file, downloaded_files, generate_unique_identifier, \ - generate_unique_filename -from App_Function_Libraries.Video_DL_Ingestion_Lib import parse_and_expand_urls, \ - generate_timestamped_url, extract_metadata, download_video - -# -####################################################################################################################### -# Function Definitions -# - -whisper_models = ["small", "medium", "small.en", "medium.en", "medium", "large", "large-v1", "large-v2", "large-v3", - "distil-large-v2", "distil-medium.en", "distil-small.en"] -custom_prompt_input = None -server_mode = False -share_public = False -custom_prompt_summarize_bulleted_notes = (""" - You are a bulleted notes specialist. 
[INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST] - **Bulleted Note Creation Guidelines** - - **Headings**: - - Based on referenced topics, not categories like quotes or terms - - Surrounded by **bold** formatting - - Not listed as bullet points - - No space between headings and list items underneath - - **Emphasis**: - - **Important terms** set in bold font - - **Text ending in a colon**: also bolded - - **Review**: - - Ensure adherence to specified format - - Do not reference these instructions in your response.[INST] {{ .Prompt }} [/INST] - """) - - -def gradio_download_youtube_video(url): - try: - # Determine ffmpeg path based on the operating system. - ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg' - - # Create a temporary directory - with tempfile.TemporaryDirectory() as temp_dir: - # Extract information about the video - with yt_dlp.YoutubeDL({'quiet': True}) as ydl: - info_dict = ydl.extract_info(url, download=False) - sanitized_title = sanitize_filename(info_dict['title']) - original_ext = info_dict['ext'] - - # Setup the temporary filename - temp_file_path = Path(temp_dir) / f"{sanitized_title}.{original_ext}" - - # Initialize yt-dlp with generic options and the output template - ydl_opts = { - 'format': 'bestvideo+bestaudio/best', - 'ffmpeg_location': ffmpeg_path, - 'outtmpl': str(temp_file_path), - 'noplaylist': True, - 'quiet': True - } - - # Execute yt-dlp to download the video - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) - - # Final check to ensure file exists - if not temp_file_path.exists(): - raise FileNotFoundError(f"Expected file was not found: {temp_file_path}") - - # Create a persistent directory for the download if it doesn't exist - persistent_dir = Path("downloads") - persistent_dir.mkdir(exist_ok=True) - - # Move the file from the temporary directory to the persistent directory - persistent_file_path = persistent_dir / f"{sanitized_title}.{original_ext}" - shutil.move(str(temp_file_path), str(persistent_file_path)) - - # Add the file to the list of downloaded files - downloaded_files.append(str(persistent_file_path)) - - return str(persistent_file_path), f"Video downloaded successfully: {sanitized_title}.{original_ext}" - except Exception as e: - return None, f"Error downloading video: {str(e)}" - - -def format_transcription(content): - # Replace '\n' with actual line breaks - content = content.replace('\\n', '\n') - # Split the content by newlines first - lines = content.split('\n') - formatted_lines = [] - for line in lines: - # Add extra space after periods for better readability - line = line.replace('.', '. ').replace('. ', '. 
<br>')
-
-        # Split into sentences using a more comprehensive regex
-        sentences = re.split('(?<=[.!?]) +', line)
-
-        # Trim whitespace from each sentence and add a line break
-        formatted_sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
-
-        # Join the formatted sentences
-        formatted_lines.append(' '.join(formatted_sentences))
-
-    # Join the lines with HTML line breaks
-    formatted_content = '<br>'.join(formatted_lines)
-
-
-def format_file_path(file_path, fallback_path=None):
-    if file_path and os.path.exists(file_path):
-        logging.debug(f"File exists: {file_path}")
-        return file_path
-    elif fallback_path and os.path.exists(fallback_path):
-        logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
-        return fallback_path
-    else:
-        logging.debug(f"File does not exist: {file_path}. No fallback path available.")
-        return None
-
-
-def search_media(query, fields, keyword, page):
-    try:
-        results = search_and_display(query, fields, keyword, page)
-        return results
-    except Exception as e:
-        logger = logging.getLogger()
-        logger.error(f"Error searching media: {e}")
-        return str(e)
-
-
-# Sample data
-prompts_category_1 = [
-    "What are the key points discussed in the video?",
-    "Summarize the main arguments made by the speaker.",
-    "Describe the conclusions of the study presented."
-]
-
-prompts_category_2 = [
-    "How does the proposed solution address the problem?",
-    "What are the implications of the findings?",
-    "Can you explain the theory behind the observed phenomenon?"
-]
-
-all_prompts = prompts_category_1 + prompts_category_2
-
-
-# Handle prompt selection
-def handle_prompt_selection(prompt):
-    return f"You selected: {prompt}"
-
-
-# FIXME - Dead code?
-# def display_details(media_id):
-#     if media_id:
-#         details = display_item_details(media_id)
-#         details_html = ""
-#         for detail in details:
-#             details_html += f"<h4>Prompt:</h4><p>{detail[0]}</p>"
-#             details_html += f"<h4>Summary:</h4><p>{detail[1]}</p>"
-#
-#             # Format the transcription
-#             formatted_transcription = format_transcription(detail[2])
-#
-#             # Use <pre> tag with style for better formatting
-#             details_html += f"<h4>Transcription:</h4><pre>{formatted_transcription}</pre>"
-#
-#         return details_html
-#     return "No details available."
-
-
-def fetch_items_by_title_or_url(search_query: str, search_type: str):
-    try:
-        with db.get_connection() as conn:
-            cursor = conn.cursor()
-            if search_type == 'Title':
-                cursor.execute("SELECT id, title, url FROM Media WHERE title LIKE ?", (f'%{search_query}%',))
-            elif search_type == 'URL':
-                cursor.execute("SELECT id, title, url FROM Media WHERE url LIKE ?", (f'%{search_query}%',))
-            results = cursor.fetchall()
-            return results
-    except sqlite3.Error as e:
-        raise DatabaseError(f"Error fetching items by {search_type}: {e}")
-
-
-def fetch_items_by_keyword(search_query: str):
-    try:
-        with db.get_connection() as conn:
-            cursor = conn.cursor()
-            cursor.execute("""
-                SELECT m.id, m.title, m.url
-                FROM Media m
-                JOIN MediaKeywords mk ON m.id = mk.media_id
-                JOIN Keywords k ON mk.keyword_id = k.id
-                WHERE k.keyword LIKE ?
-            """, (f'%{search_query}%',))
-            results = cursor.fetchall()
-            return results
-    except sqlite3.Error as e:
-        raise DatabaseError(f"Error fetching items by keyword: {e}")
-
-
-def fetch_items_by_content(search_query: str):
-    try:
-        with db.get_connection() as conn:
-            cursor = conn.cursor()
-            cursor.execute("SELECT id, title, url FROM Media WHERE content LIKE ?", (f'%{search_query}%',))
-            results = cursor.fetchall()
-            return results
-    except sqlite3.Error as e:
-        raise DatabaseError(f"Error fetching items by content: {e}")
-
-
-def fetch_item_details_single(media_id: int):
-    try:
-        with db.get_connection() as conn:
-            cursor = conn.cursor()
-            cursor.execute("""
-                SELECT prompt, summary
-                FROM MediaModifications
-                WHERE media_id = ?
-                ORDER BY modification_date DESC
-                LIMIT 1
-            """, (media_id,))
-            prompt_summary_result = cursor.fetchone()
-            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
-            content_result = cursor.fetchone()
-
-            prompt = prompt_summary_result[0] if prompt_summary_result else ""
-            summary = prompt_summary_result[1] if prompt_summary_result else ""
-            content = content_result[0] if content_result else ""
-
-            return prompt, summary, content
-    except sqlite3.Error as e:
-        raise Exception(f"Error fetching item details: {e}")
-
-
-def fetch_item_details(media_id: int):
-    try:
-        with db.get_connection() as conn:
-            cursor = conn.cursor()
-            cursor.execute("""
-                SELECT prompt, summary
-                FROM MediaModifications
-                WHERE media_id = ?
-                ORDER BY modification_date DESC
-                LIMIT 1
-            """, (media_id,))
-            prompt_summary_result = cursor.fetchone()
-            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
-            content_result = cursor.fetchone()
-
-            prompt = prompt_summary_result[0] if prompt_summary_result else ""
-            summary = prompt_summary_result[1] if prompt_summary_result else ""
-            content = content_result[0] if content_result else ""
-
-            return content, prompt, summary
-    except sqlite3.Error as e:
-        logging.error(f"Error fetching item details: {e}")
-        return "", "", ""  # Return empty strings if there's an error
-
-
-def browse_items(search_query, search_type):
-    if search_type == 'Keyword':
-        results = fetch_items_by_keyword(search_query)
-    elif search_type == 'Content':
-        results = fetch_items_by_content(search_query)
-    else:
-        results = fetch_items_by_title_or_url(search_query, search_type)
-    return results
-
-
-def update_dropdown(search_query, search_type):
-    results = browse_items(search_query, search_type)
-    item_options = [f"{item[1]} ({item[2]})" for item in results]
-    new_item_mapping = {f"{item[1]} ({item[2]})": item[0] for item in results}
-    print(f"Debug - Update Dropdown - New Item Mapping: {new_item_mapping}")
-    return gr.update(choices=item_options), new_item_mapping
-
-
-def get_media_id(selected_item, item_mapping):
-    return item_mapping.get(selected_item)
-
-
-def update_detailed_view(item, item_mapping):
-    # Function to update the detailed view based on the selected item
-    if item:
-        item_id = item_mapping.get(item)
-        if item_id:
-            content, prompt, summary = fetch_item_details(item_id)
-            if content or prompt or summary:
-                details_html = "<h4>Details:</h4>"
-                if prompt:
-                    formatted_prompt = format_transcription(prompt)
-                    details_html += f"<h4>Prompt:</h4><p>{formatted_prompt}</p>"
-                if summary:
-                    formatted_summary = format_transcription(summary)
-                    details_html += f"<h4>Summary:</h4><p>{formatted_summary}</p>"
-                # Format the transcription content for better readability
-                formatted_content = format_transcription(content)
-                #content_html = f"<h4>Transcription:</h4><p>{content}</p>"
-                content_html = f"<h4>Transcription:</h4><p>{formatted_content}</p>"
-                return details_html, content_html
-            else:
-                return "No details available.", "No details available."
-        else:
-            return "No item selected", "No item selected"
-    else:
-        return "No item selected", "No item selected"
-
-
-def format_content(content):
-    # Format content using markdown
-    formatted_content = f"```\n{content}\n```"
-    return formatted_content
-
-
-def update_prompt_dropdown():
-    prompt_names = list_prompts()
-    return gr.update(choices=prompt_names)

-
-
-def display_prompt_details(selected_prompt):
-    if selected_prompt:
-        prompts = update_user_prompt(selected_prompt)
-        if prompts["title"]:  # Check if we have any details
-            details_str = f"<h4>Details:</h4><p>{prompts['details']}</p>"
-            system_str = f"<h4>System:</h4><p>{prompts['system_prompt']}</p>"
-            user_str = f"<h4>User:</h4><p>{prompts['user_prompt']}</p>" if prompts['user_prompt'] else ""
-            return details_str + system_str + user_str
-    return "No details available."
-
-
-def search_media_database(query: str) -> List[Tuple[int, str, str]]:
-    return browse_items(query, 'Title')
-
-
-def load_media_content(media_id: int) -> dict:
-    try:
-        print(f"Debug - Load Media Content - Media ID: {media_id}")
-        item_details = fetch_item_details(media_id)
-        print(f"Debug - Load Media Content - Item Details: \n\n{item_details}\n\n\n\n")
-
-        if isinstance(item_details, tuple) and len(item_details) == 3:
-            content, prompt, summary = item_details
-        else:
-            print(f"Debug - Load Media Content - Unexpected item_details format: \n\n{item_details}\n\n\n\n")
-            content, prompt, summary = "", "", ""
-
-        return {
-            "content": content or "No content available",
-            "prompt": prompt or "No prompt available",
-            "summary": summary or "No summary available"
-        }
-    except Exception as e:
-        print(f"Debug - Load Media Content - Error: {str(e)}")
-        return {"content": "", "prompt": "", "summary": ""}
-
-
-def error_handler(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except Exception as e:
-            error_message = f"Error in {func.__name__}: {str(e)}"
-            logging.error(f"{error_message}\n{traceback.format_exc()}")
-            return {"error": error_message, "details": traceback.format_exc()}
-    return wrapper

-
-def create_chunking_inputs():
-    chunk_text_by_words_checkbox = gr.Checkbox(label="Chunk Text by Words", value=False, visible=True)
-    max_words_input = gr.Number(label="Max Words", value=300, precision=0, visible=True)
-    chunk_text_by_sentences_checkbox = gr.Checkbox(label="Chunk Text by Sentences", value=False, visible=True)
-    max_sentences_input = gr.Number(label="Max Sentences", value=10, precision=0, visible=True)
-    chunk_text_by_paragraphs_checkbox = gr.Checkbox(label="Chunk Text by Paragraphs", value=False, visible=True)
-    max_paragraphs_input = gr.Number(label="Max Paragraphs", value=5, precision=0, visible=True)
-    chunk_text_by_tokens_checkbox = gr.Checkbox(label="Chunk Text by Tokens", value=False, visible=True)
-    max_tokens_input = gr.Number(label="Max Tokens", value=1000, precision=0, visible=True)
-    gr_semantic_chunk_long_file = gr.Checkbox(label="Semantic Chunking by Sentence similarity", value=False, visible=True)
-    gr_semantic_chunk_long_file_size = gr.Number(label="Max Chunk Size", value=2000, visible=True)
-    gr_semantic_chunk_long_file_overlap = gr.Number(label="Max Chunk Overlap Size", value=100, visible=True)
-    # Note: the three semantic-chunking components above are created but not
-    # included in the returned list, so callers never receive them.
-    return [chunk_text_by_words_checkbox, max_words_input, chunk_text_by_sentences_checkbox, max_sentences_input,
-            chunk_text_by_paragraphs_checkbox, max_paragraphs_input, chunk_text_by_tokens_checkbox, max_tokens_input]
-
-
-#
-# End of miscellaneous unsorted functions
-#######################################################################################################################
-#
-# Start of Video/Audio Transcription and Summarization Functions
-
-def create_introduction_tab():
-    with gr.TabItem("Introduction"):
-        gr.Markdown("# tldw: Your LLM-powered Research Multi-tool")
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("""### What can it do?
-                - Transcribe and summarize videos from URLs/Local files
-                - Transcribe and summarize audio files/podcasts (URL/local file)
-                - Summarize articles from URLs/Local notes
-                - Ingest and summarize books (epub/PDF)
-                - Ingest and summarize research papers (PDFs - WIP)
-                - Search and display ingested content + summaries
-                - Create and manage custom prompts
-                - Chat with an LLM of your choice to generate content using the selected item + Prompts
-                - Keyword support for content search and display
-                - Export keywords/items to markdown/CSV (CSV is WIP)
-                - Import existing notes from Obsidian to the database (Markdown/txt files or a zip containing a collection of files)
-                - View and manage chat history
-                - Writing Tools: Grammar & Style check, Tone Analyzer & Editor, more planned...
-                - RAG (Retrieval-Augmented Generation) support for content generation (think of asking questions about your entire library of items)
-                - More features planned...
-                - All powered by your choice of LLM.
-                    - Currently supports: Local-LLM (llamafile-server), OpenAI, Anthropic, Cohere, Groq, DeepSeek, OpenRouter, Llama.cpp, Kobold, Ooba, TabbyAPI, vLLM and more to come...
-                - All data is stored locally in a SQLite database for easy access and management.
-                - No trackers (Gradio has some analytics, but it's disabled here...)
-                - No ads, no tracking, no BS. Just you and your content.
-                - Open-source and free to use. Contributions welcome!
-                - If you have any thoughts or feedback, please let me know on GitHub or via email.
-                """)
-                gr.Markdown("""Follow this project at [tl/dw: Too Long, Didn't Watch - Your Personal Research Multi-Tool - GitHub](https://github.com/rmusser01/tldw)""")
-            with gr.Column():
-                gr.Markdown("""### How to use:
-                ##### Quick Start: Just click on the appropriate tab for what you're trying to do and fill in the required fields. Click "Process