# Gradio_Related.py ######################################### # Gradio UI Functions Library # This library is used to hold all UI-related functions for Gradio. # I fucking hate Gradio. # ##### # Functions: # # download_audio_file(url, save_path) # process_audio( # process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None) # # ######################################### # # Built-In Imports import glob import html import math import re import shutil import tempfile import uuid import zipfile from datetime import datetime import json import logging import os.path from pathlib import Path import sqlite3 from time import sleep from typing import Dict, List, Tuple, Optional import traceback from functools import wraps # # Import 3rd-Party Libraries import pypandoc import yt_dlp import gradio as gr # # Local Imports from App_Function_Libraries.Article_Summarization_Lib import scrape_and_summarize_multiple from App_Function_Libraries.Audio_Files import process_audio_files, process_podcast, download_youtube_audio from App_Function_Libraries.Chunk_Lib import improved_chunking_process from App_Function_Libraries.PDF_Ingestion_Lib import process_and_cleanup_pdf, extract_text_and_format_from_pdf, \ extract_metadata_from_pdf from App_Function_Libraries.Local_LLM_Inference_Engine_Lib import local_llm_gui_function from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \ summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm, \ summarize_with_ollama from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai, summarize_with_cohere, \ summarize_with_anthropic, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, \ summarize_with_huggingface, perform_summarization, save_transcription_and_summary, \ perform_transcription, summarize_chunk from App_Function_Libraries.SQLite_DB import update_media_content, list_prompts, search_and_display, db, DatabaseError, \ fetch_prompt_details, keywords_browser_interface, add_keyword, delete_keyword, \ export_keywords_to_csv, add_media_to_database, import_obsidian_note_to_db, add_prompt, \ delete_chat_message, update_chat_message, add_chat_message, get_chat_messages, search_chat_conversations, \ create_chat_conversation, save_chat_history_to_database, view_database, get_transcripts, get_trashed_items, \ user_delete_item, empty_trash, create_automated_backup, backup_dir, db_path, add_or_update_prompt, \ load_prompt_details, load_preset_prompts, insert_prompt_to_db, delete_prompt, search_and_display_items, \ get_conversation_name from App_Function_Libraries.Utils import sanitize_filename, extract_text_from_segments, create_download_directory, \ convert_to_seconds, load_comprehensive_config, safe_read_file, downloaded_files, generate_unique_identifier, \ generate_unique_filename from App_Function_Libraries.Video_DL_Ingestion_Lib import parse_and_expand_urls, \ generate_timestamped_url, extract_metadata, download_video # ####################################################################################################################### # Function Definitions # whisper_models = ["small", "medium", "small.en", "medium.en", "medium", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2", "distil-medium.en", "distil-small.en"] custom_prompt_input = None server_mode = False share_public = False custom_prompt_summarize_bulleted_notes = (""" You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST] **Bulleted Note Creation Guidelines** **Headings**: - Based on referenced topics, not categories like quotes or terms - Surrounded by **bold** formatting - Not listed as bullet points - No space between headings and list items underneath **Emphasis**: - **Important terms** set in bold font - **Text ending in a colon**: also bolded **Review**: - Ensure adherence to specified format - Do not reference these instructions in your response.[INST] {{ .Prompt }} [/INST] """) def gradio_download_youtube_video(url): try: # Determine ffmpeg path based on the operating system. ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg' # Create a temporary directory with tempfile.TemporaryDirectory() as temp_dir: # Extract information about the video with yt_dlp.YoutubeDL({'quiet': True}) as ydl: info_dict = ydl.extract_info(url, download=False) sanitized_title = sanitize_filename(info_dict['title']) original_ext = info_dict['ext'] # Setup the temporary filename temp_file_path = Path(temp_dir) / f"{sanitized_title}.{original_ext}" # Initialize yt-dlp with generic options and the output template ydl_opts = { 'format': 'bestvideo+bestaudio/best', 'ffmpeg_location': ffmpeg_path, 'outtmpl': str(temp_file_path), 'noplaylist': True, 'quiet': True } # Execute yt-dlp to download the video with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url]) # Final check to ensure file exists if not temp_file_path.exists(): raise FileNotFoundError(f"Expected file was not found: {temp_file_path}") # Create a persistent directory for the download if it doesn't exist persistent_dir = Path("downloads") persistent_dir.mkdir(exist_ok=True) # Move the file from the temporary directory to the persistent directory persistent_file_path = persistent_dir / f"{sanitized_title}.{original_ext}" shutil.move(str(temp_file_path), str(persistent_file_path)) # Add the file to the list of downloaded files downloaded_files.append(str(persistent_file_path)) return str(persistent_file_path), f"Video downloaded successfully: {sanitized_title}.{original_ext}" except Exception as e: return None, f"Error downloading video: {str(e)}" def format_transcription(content): # Replace '\n' with actual line breaks content = content.replace('\\n', '\n') # Split the content by newlines first lines = content.split('\n') formatted_lines = [] for line in lines: # Add extra space after periods for better readability line = line.replace('.', '. ').replace('. ', '. ') # Split into sentences using a more comprehensive regex sentences = re.split('(?<=[.!?]) +', line) # Trim whitespace from each sentence and add a line break formatted_sentences = [sentence.strip() for sentence in sentences if sentence.strip()] # Join the formatted sentences formatted_lines.append(' '.join(formatted_sentences)) # Join the lines with HTML line breaks formatted_content = '
'.join(formatted_lines) return formatted_content def format_file_path(file_path, fallback_path=None): if file_path and os.path.exists(file_path): logging.debug(f"File exists: {file_path}") return file_path elif fallback_path and os.path.exists(fallback_path): logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}") return fallback_path else: logging.debug(f"File does not exist: {file_path}. No fallback path available.") return None def search_media(query, fields, keyword, page): try: results = search_and_display(query, fields, keyword, page) return results except Exception as e: logger = logging.getLogger() logger.error(f"Error searching media: {e}") return str(e) # Sample data prompts_category_1 = [ "What are the key points discussed in the video?", "Summarize the main arguments made by the speaker.", "Describe the conclusions of the study presented." ] prompts_category_2 = [ "How does the proposed solution address the problem?", "What are the implications of the findings?", "Can you explain the theory behind the observed phenomenon?" ] all_prompts = prompts_category_1 + prompts_category_2 # Handle prompt selection def handle_prompt_selection(prompt): return f"You selected: {prompt}" # FIXME - Dead code? # def display_details(media_id): # if media_id: # details = display_item_details(media_id) # details_html = "" # for detail in details: # details_html += f"

Prompt:

{detail[0]}

" # details_html += f"

Summary:

{detail[1]}

" # # # Format the transcription # formatted_transcription = format_transcription(detail[2]) # # # Use
 tag with style for better formatting
#             details_html += f"

Transcription:

{formatted_transcription}

" # # return details_html # return "No details available." def fetch_items_by_title_or_url(search_query: str, search_type: str): try: with db.get_connection() as conn: cursor = conn.cursor() if search_type == 'Title': cursor.execute("SELECT id, title, url FROM Media WHERE title LIKE ?", (f'%{search_query}%',)) elif search_type == 'URL': cursor.execute("SELECT id, title, url FROM Media WHERE url LIKE ?", (f'%{search_query}%',)) results = cursor.fetchall() return results except sqlite3.Error as e: raise DatabaseError(f"Error fetching items by {search_type}: {e}") def fetch_items_by_keyword(search_query: str): try: with db.get_connection() as conn: cursor = conn.cursor() cursor.execute(""" SELECT m.id, m.title, m.url FROM Media m JOIN MediaKeywords mk ON m.id = mk.media_id JOIN Keywords k ON mk.keyword_id = k.id WHERE k.keyword LIKE ? """, (f'%{search_query}%',)) results = cursor.fetchall() return results except sqlite3.Error as e: raise DatabaseError(f"Error fetching items by keyword: {e}") def fetch_items_by_content(search_query: str): try: with db.get_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT id, title, url FROM Media WHERE content LIKE ?", (f'%{search_query}%',)) results = cursor.fetchall() return results except sqlite3.Error as e: raise DatabaseError(f"Error fetching items by content: {e}") def fetch_item_details_single(media_id: int): try: with db.get_connection() as conn: cursor = conn.cursor() cursor.execute(""" SELECT prompt, summary FROM MediaModifications WHERE media_id = ? ORDER BY modification_date DESC LIMIT 1 """, (media_id,)) prompt_summary_result = cursor.fetchone() cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,)) content_result = cursor.fetchone() prompt = prompt_summary_result[0] if prompt_summary_result else "" summary = prompt_summary_result[1] if prompt_summary_result else "" content = content_result[0] if content_result else "" return prompt, summary, content except sqlite3.Error as e: raise Exception(f"Error fetching item details: {e}") def fetch_item_details(media_id: int): try: with db.get_connection() as conn: cursor = conn.cursor() cursor.execute(""" SELECT prompt, summary FROM MediaModifications WHERE media_id = ? ORDER BY modification_date DESC LIMIT 1 """, (media_id,)) prompt_summary_result = cursor.fetchone() cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,)) content_result = cursor.fetchone() prompt = prompt_summary_result[0] if prompt_summary_result else "" summary = prompt_summary_result[1] if prompt_summary_result else "" content = content_result[0] if content_result else "" return content, prompt, summary except sqlite3.Error as e: logging.error(f"Error fetching item details: {e}") return "", "", "" # Return empty strings if there's an error def browse_items(search_query, search_type): if search_type == 'Keyword': results = fetch_items_by_keyword(search_query) elif search_type == 'Content': results = fetch_items_by_content(search_query) else: results = fetch_items_by_title_or_url(search_query, search_type) return results def update_dropdown(search_query, search_type): results = browse_items(search_query, search_type) item_options = [f"{item[1]} ({item[2]})" for item in results] new_item_mapping = {f"{item[1]} ({item[2]})": item[0] for item in results} print(f"Debug - Update Dropdown - New Item Mapping: {new_item_mapping}") return gr.update(choices=item_options), new_item_mapping def get_media_id(selected_item, item_mapping): return item_mapping.get(selected_item) def update_detailed_view(item, item_mapping): # Function to update the detailed view based on selected item if item: item_id = item_mapping.get(item) if item_id: content, prompt, summary = fetch_item_details(item_id) if content or prompt or summary: details_html = "

Details:

" if prompt: formatted_prompt = format_transcription(prompt) details_html += f"

Prompt:

{formatted_prompt}

" if summary: formatted_summary = format_transcription(summary) details_html += f"

Summary:

{formatted_summary}

" # Format the transcription content for better readability formatted_content = format_transcription(content) #content_html = f"

Transcription:

{content}
" content_html = f"

Transcription:

{formatted_content}
" return details_html, content_html else: return "No details available.", "No details available." else: return "No item selected", "No item selected" else: return "No item selected", "No item selected" def format_content(content): # Format content using markdown formatted_content = f"```\n{content}\n```" return formatted_content def update_prompt_dropdown(): prompt_names = list_prompts() return gr.update(choices=prompt_names) def display_prompt_details(selected_prompt): if selected_prompt: prompts = update_user_prompt(selected_prompt) if prompts["title"]: # Check if we have any details details_str = f"

Details:

{prompts['details']}

" system_str = f"

System:

{prompts['system_prompt']}

" user_str = f"

User:

{prompts['user_prompt']}

" if prompts['user_prompt'] else "" return details_str + system_str + user_str return "No details available." def search_media_database(query: str) -> List[Tuple[int, str, str]]: return browse_items(query, 'Title') def load_media_content(media_id: int) -> dict: try: print(f"Debug - Load Media Content - Media ID: {media_id}") item_details = fetch_item_details(media_id) print(f"Debug - Load Media Content - Item Details: \n\n{item_details}\n\n\n\n") if isinstance(item_details, tuple) and len(item_details) == 3: content, prompt, summary = item_details else: print(f"Debug - Load Media Content - Unexpected item_details format: \n\n{item_details}\n\n\n\n") content, prompt, summary = "", "", "" return { "content": content or "No content available", "prompt": prompt or "No prompt available", "summary": summary or "No summary available" } except Exception as e: print(f"Debug - Load Media Content - Error: {str(e)}") return {"content": "", "prompt": "", "summary": ""} def error_handler(func): @wraps(func) def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e: error_message = f"Error in {func.__name__}: {str(e)}" logging.error(f"{error_message}\n{traceback.format_exc()}") return {"error": error_message, "details": traceback.format_exc()} return wrapper def create_chunking_inputs(): chunk_text_by_words_checkbox = gr.Checkbox(label="Chunk Text by Words", value=False, visible=True) max_words_input = gr.Number(label="Max Words", value=300, precision=0, visible=True) chunk_text_by_sentences_checkbox = gr.Checkbox(label="Chunk Text by Sentences", value=False, visible=True) max_sentences_input = gr.Number(label="Max Sentences", value=10, precision=0, visible=True) chunk_text_by_paragraphs_checkbox = gr.Checkbox(label="Chunk Text by Paragraphs", value=False, visible=True) max_paragraphs_input = gr.Number(label="Max Paragraphs", value=5, precision=0, visible=True) chunk_text_by_tokens_checkbox = gr.Checkbox(label="Chunk Text by Tokens", value=False, visible=True) max_tokens_input = gr.Number(label="Max Tokens", value=1000, precision=0, visible=True) gr_semantic_chunk_long_file = gr.Checkbox(label="Semantic Chunking by Sentence similarity", value=False, visible=True) gr_semantic_chunk_long_file_size = gr.Number(label="Max Chunk Size", value=2000, visible=True) gr_semantic_chunk_long_file_overlap = gr.Number(label="Max Chunk Overlap Size", value=100, visible=True) return [chunk_text_by_words_checkbox, max_words_input, chunk_text_by_sentences_checkbox, max_sentences_input, chunk_text_by_paragraphs_checkbox, max_paragraphs_input, chunk_text_by_tokens_checkbox, max_tokens_input] # # End of miscellaneous unsorted functions ####################################################################################################################### # # Start of Video/Audio Transcription and Summarization Functions def create_introduction_tab(): with (gr.TabItem("Introduction")): gr.Markdown("# tldw: Your LLM-powered Research Multi-tool") with gr.Row(): with gr.Column(): gr.Markdown("""### What can it do? - Transcribe and summarize videos from URLs/Local files - Transcribe and Summarize Audio files/Podcasts (URL/local file) - Summarize articles from URLs/Local notes - Ingest and summarize books(epub/PDF) - Ingest and summarize research papers (PDFs - WIP) - Search and display ingested content + summaries - Create and manage custom prompts - Chat with an LLM of your choice to generate content using the selected item + Prompts - Keyword support for content search and display - Export keywords/items to markdown/CSV(csv is wip) - Import existing notes from Obsidian to the database (Markdown/txt files or a zip containing a collection of files) - View and manage chat history - Writing Tools: Grammar & Style check, Tone Analyzer & Editor, more planned... - RAG (Retrieval-Augmented Generation) support for content generation(think about asking questions about your entire library of items) - More features planned... - All powered by your choice of LLM. - Currently supports: Local-LLM(llamafile-server), OpenAI, Anthropic, Cohere, Groq, DeepSeek, OpenRouter, Llama.cpp, Kobold, Ooba, Tabbyapi, VLLM and more to come... - All data is stored locally in a SQLite database for easy access and management. - No trackers (Gradio has some analytics but it's disabled here...) - No ads, no tracking, no BS. Just you and your content. - Open-source and free to use. Contributions welcome! - If you have any thoughts or feedback, please let me know on github or via email. """) gr.Markdown("""Follow this project at [tl/dw: Too Long, Didn't Watch - Your Personal Research Multi-Tool - GitHub](https://github.com/rmusser01/tldw)""") with gr.Column(): gr.Markdown("""### How to use: ##### Quick Start: Just click on the appropriate tab for what you're trying to do and fill in the required fields. Click "Process