# Gradio_Related.py
#########################################
# Gradio UI Functions Library
# This library is used to hold all UI-related functions for Gradio.
# I fucking hate Gradio.
#
#####
# Functions:
#
# download_audio_file(url, save_path)
# process_audio(
# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None)
#
#
#########################################
#
# Built-In Imports
import glob
import html
import math
import re
import shutil
import tempfile
import uuid
import zipfile
from datetime import datetime, time
import json
import logging
import os.path
from pathlib import Path
import sqlite3
from typing import Dict, List, Tuple, Optional
import traceback
from functools import wraps
import pypandoc
#
# Import 3rd-Party Libraries
import yt_dlp
import gradio as gr
#
# Local Imports
from App_Function_Libraries.Article_Summarization_Lib import scrape_and_summarize_multiple
from App_Function_Libraries.Audio_Files import process_audio_files, process_podcast, download_youtube_audio
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
from App_Function_Libraries.PDF_Ingestion_Lib import process_and_cleanup_pdf
from App_Function_Libraries.Local_LLM_Inference_Engine_Lib import local_llm_gui_function
from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai, summarize_with_cohere, \
summarize_with_anthropic, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, \
summarize_with_huggingface, perform_summarization, save_transcription_and_summary, \
perform_transcription, summarize_chunk
from App_Function_Libraries.SQLite_DB import update_media_content, list_prompts, search_and_display, db, DatabaseError, \
fetch_prompt_details, keywords_browser_interface, add_keyword, delete_keyword, \
export_keywords_to_csv, add_media_to_database, import_obsidian_note_to_db, add_prompt, \
delete_chat_message, update_chat_message, add_chat_message, get_chat_messages, search_chat_conversations, \
create_chat_conversation, save_chat_history_to_database, view_database, get_transcripts, get_trashed_items, \
user_delete_item, empty_trash, create_automated_backup, backup_dir, db_path, add_or_update_prompt, \
load_prompt_details, load_preset_prompts, insert_prompt_to_db, delete_prompt, search_and_display_items
from App_Function_Libraries.Utils import sanitize_filename, extract_text_from_segments, create_download_directory, \
convert_to_seconds, load_comprehensive_config, safe_read_file, downloaded_files
from App_Function_Libraries.Video_DL_Ingestion_Lib import parse_and_expand_urls, \
generate_timestamped_url, extract_metadata, download_video
#
#######################################################################################################################
# Function Definitions
#
# SECURITY(review): a live API credential is committed to source control.
# The MISTRAL_TOKEN environment variable now takes precedence so a deployment can
# supply its own key without editing this file. The literal is kept only as the
# historical fallback (the original author notes it is set to expire on Aug 19);
# it should be rotated and removed from the repository.
MISTRAL_TOKEN = os.environ.get("MISTRAL_TOKEN") or "p3hw1VRckQl86OjeOtvaOckMfAaernxz"
# Whisper model identifiers. Each name appears exactly once ("medium" used to be
# listed twice, which produced a duplicate entry wherever this list is displayed).
whisper_models = ["small", "medium", "small.en", "medium.en", "large", "large-v1", "large-v2", "large-v3",
                  "distil-large-v2", "distil-medium.en", "distil-small.en"]
# Module-level defaults.
custom_prompt_input = None
server_mode = False
share_public = False
# Prompt template asking the model for bulleted notes.
# The "{{ .Prompt }}" placeholder near the end is part of the template text itself.
custom_prompt_summarize_bulleted_notes = """
You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**
**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath
**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded
**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.[INST] {{ .Prompt }} [/INST]
"""
def gradio_download_youtube_video(url):
    """
    Download the video at `url` with yt-dlp and keep it under ./downloads.

    The file is first written into a temporary directory, then moved into the
    persistent "downloads" directory. The final path is also appended to the
    shared `downloaded_files` list.

    Returns:
        A `(path, message)` tuple: the persistent file path (as a string) and a
        success message, or `(None, error message)` if any step raised.
    """
    try:
        # On Windows the bundled binary is used; elsewhere ffmpeg is resolved by name.
        ffmpeg_location = 'ffmpeg' if os.name != 'nt' else './Bin/ffmpeg.exe'

        with tempfile.TemporaryDirectory() as scratch_dir:
            # Metadata-only pass: the title and extension decide the output file name.
            with yt_dlp.YoutubeDL({'quiet': True}) as probe:
                metadata = probe.extract_info(url, download=False)
                safe_title = sanitize_filename(metadata['title'])
                extension = metadata['ext']

            staged_path = Path(scratch_dir) / f"{safe_title}.{extension}"

            download_options = {
                'format': 'bestvideo+bestaudio/best',
                'ffmpeg_location': ffmpeg_location,
                'outtmpl': str(staged_path),
                'noplaylist': True,
                'quiet': True
            }
            with yt_dlp.YoutubeDL(download_options) as downloader:
                downloader.download([url])

            # The download must have produced exactly the file we asked for.
            if not staged_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {staged_path}")

            # Move the result out of the temporary directory before it is cleaned up.
            target_dir = Path("downloads")
            target_dir.mkdir(exist_ok=True)
            final_path = target_dir / f"{safe_title}.{extension}"
            shutil.move(str(staged_path), str(final_path))

            # Record the file in the shared list of downloads.
            downloaded_files.append(str(final_path))
            return str(final_path), f"Video downloaded successfully: {safe_title}.{extension}"
    except Exception as e:
        return None, f"Error downloading video: {str(e)}"
def format_transcription(content):
    """
    Normalise transcript text for HTML display.

    Steps, per line of input:
      * literal backslash-n sequences are first turned into real newlines;
      * a space is inserted after every period;
      * the line is split after sentence punctuation (., !, ?) followed by
        spaces, each piece is trimmed, empty pieces are dropped and the pieces
        are re-joined with a single space.

    The processed lines are joined with the HTML line break ``<br>``.

    Args:
        content: Transcript text. ``None`` or an empty string yields ``''``.

    Returns:
        The formatted text as a single string.
    """
    if not content:
        return ''

    # Replace '\n' (the two-character escape) with actual line breaks
    content = content.replace('\\n', '\n')

    formatted_lines = []
    for line in content.split('\n'):
        # Guarantee a space after each period; surplus spaces are swallowed by
        # the split below, so no separate collapsing step is needed.
        line = line.replace('.', '. ')
        sentences = re.split(r'(?<=[.!?]) +', line)
        formatted_lines.append(' '.join(part.strip() for part in sentences if part.strip()))

    # Join the lines with HTML line breaks
    return '<br>'.join(formatted_lines)
def format_file_path(file_path, fallback_path=None):
    """
    Pick the first of `file_path` / `fallback_path` that exists on disk.

    Returns `file_path` when it is set and exists, otherwise `fallback_path`
    when that is set and exists, otherwise None. Each outcome is logged at
    debug level.
    """
    if file_path and os.path.exists(file_path):
        logging.debug(f"File exists: {file_path}")
        return file_path

    fallback_usable = bool(fallback_path) and os.path.exists(fallback_path)
    if not fallback_usable:
        logging.debug(f"File does not exist: {file_path}. No fallback path available.")
        return None

    logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
    return fallback_path
def search_media(query, fields, keyword, page):
    """
    Run `search_and_display` with the given arguments and return its result.

    If the search raises, the error is logged on the root logger and the
    exception text is returned in place of the results.
    """
    try:
        return search_and_display(query, fields, keyword, page)
    except Exception as e:
        root_logger = logging.getLogger()
        root_logger.error(f"Error searching media: {e}")
        return str(e)
# Sample data: two categories of example prompts
prompts_category_1 = [
    "What are the key points discussed in the video?",
    "Summarize the main arguments made by the speaker.",
    "Describe the conclusions of the study presented."
]
prompts_category_2 = [
    "How does the proposed solution address the problem?",
    "What are the implications of the findings?",
    "Can you explain the theory behind the observed phenomenon?"
]
# Flat list: category 1 entries followed by category 2 entries
all_prompts = [*prompts_category_1, *prompts_category_2]
# Handle prompt selection
def handle_prompt_selection(prompt):
    """Return the confirmation text for the prompt that was chosen."""
    confirmation = f"You selected: {prompt}"
    return confirmation
# FIXME - Dead code?
# NOTE(review): the HTML tags inside the strings below were lost and have been
# reconstructed; confirm the exact markup before re-enabling this function.
# def display_details(media_id):
#     if media_id:
#         details = display_item_details(media_id)
#         details_html = ""
#         for detail in details:
#             details_html += f"<h4>Prompt:</h4><p>{detail[0]}</p>"
#             details_html += f"<h4>Summary:</h4><p>{detail[1]}</p>"
#
#             # Format the transcription
#             formatted_transcription = format_transcription(detail[2])
#
#             # Use <pre> tag with style for better formatting
#             details_html += f"<h4>Transcription:</h4><pre style='white-space: pre-wrap;'>{formatted_transcription}</pre>"
#
#         return details_html
#     return "No details available."
def fetch_items_by_title_or_url(search_query: str, search_type: str):
    """
    Look up Media rows whose title or URL contains `search_query`.

    Args:
        search_query: Text matched with SQL LIKE, wrapped in '%' on both sides.
        search_type: 'Title' searches the title column, 'URL' the url column.
            For any other value no query is executed and the result is
            whatever `fetchall()` yields on the fresh cursor.

    Returns:
        The rows from `cursor.fetchall()`, selected as (id, title, url).

    Raises:
        DatabaseError: if sqlite3 reports an error.
    """
    statements = {
        'Title': "SELECT id, title, url FROM Media WHERE title LIKE ?",
        'URL': "SELECT id, title, url FROM Media WHERE url LIKE ?",
    }
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            statement = statements.get(search_type)
            if statement is not None:
                cursor.execute(statement, (f'%{search_query}%',))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by {search_type}: {e}")
def fetch_items_by_keyword(search_query: str):
    """
    Look up Media rows linked to a keyword that contains `search_query`.

    Media is joined to Keywords through MediaKeywords and the keyword text is
    matched with SQL LIKE ('%' added on both sides of the query).

    Returns:
        The rows from `cursor.fetchall()`, selected as (id, title, url).

    Raises:
        DatabaseError: if sqlite3 reports an error.
    """
    like_pattern = f'%{search_query}%'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT m.id, m.title, m.url
                FROM Media m
                JOIN MediaKeywords mk ON m.id = mk.media_id
                JOIN Keywords k ON mk.keyword_id = k.id
                WHERE k.keyword LIKE ?
            """, (like_pattern,))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by keyword: {e}")
def fetch_items_by_content(search_query: str):
    """
    Look up Media rows whose content contains `search_query` (SQL LIKE match).

    Returns:
        The rows from `cursor.fetchall()`, selected as (id, title, url).

    Raises:
        DatabaseError: if sqlite3 reports an error.
    """
    like_pattern = f'%{search_query}%'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, title, url FROM Media WHERE content LIKE ?", (like_pattern,))
            matching_rows = cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by content: {e}")
    return matching_rows
def fetch_item_details_single(media_id: int):
    """
    Fetch the newest prompt/summary and the content for one media item.

    The prompt and summary come from the most recent MediaModifications row
    (by modification_date) for `media_id`; the content comes from Media.
    Any piece that has no row is returned as an empty string.

    Returns:
        A `(prompt, summary, content)` tuple.

    Raises:
        Exception: wrapping the message of any sqlite3.Error.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
                WHERE media_id = ?
                ORDER BY modification_date DESC
                LIMIT 1
            """, (media_id,))
            latest_modification = cursor.fetchone()

            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            media_row = cursor.fetchone()

            prompt = ""
            summary = ""
            if latest_modification:
                prompt = latest_modification[0]
                summary = latest_modification[1]

            content = ""
            if media_row:
                content = media_row[0]

            return prompt, summary, content
    except sqlite3.Error as e:
        raise Exception(f"Error fetching item details: {e}")
def fetch_item_details(media_id: int):
    """
    Fetch the content plus the newest prompt/summary for one media item.

    The prompt and summary come from the most recent MediaModifications row
    (by modification_date) for `media_id`; the content comes from Media.
    Any piece that has no row is returned as an empty string.

    Returns:
        A `(content, prompt, summary)` tuple. On a sqlite3 error the error is
        logged and three empty strings are returned instead of raising.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
                WHERE media_id = ?
                ORDER BY modification_date DESC
                LIMIT 1
            """, (media_id,))
            # Fall back to blanks when the item has no modification row.
            modification_row = cursor.fetchone() or ("", "")

            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            media_row = cursor.fetchone()

            content = media_row[0] if media_row else ""
            return content, modification_row[0], modification_row[1]
    except sqlite3.Error as e:
        logging.error(f"Error fetching item details: {e}")
        # Return empty strings if there's an error
        return "", "", ""
def browse_items(search_query, search_type):
    """
    Dispatch a search to the fetch function matching `search_type`.

    'Keyword' and 'Content' have dedicated lookups; every other value is
    passed on to `fetch_items_by_title_or_url` together with the search type.
    """
    if search_type == 'Keyword':
        return fetch_items_by_keyword(search_query)
    if search_type == 'Content':
        return fetch_items_by_content(search_query)
    return fetch_items_by_title_or_url(search_query, search_type)
def update_dropdown(search_query, search_type):
    """
    Search for items and build the dropdown choices plus a label-to-id mapping.

    Each result row is labelled "<row[1]> (<row[2]>)" and mapped to row[0].

    Returns:
        A `(gr.update(choices=...), mapping)` tuple.
    """
    matches = browse_items(search_query, search_type)

    item_options = []
    new_item_mapping = {}
    for row in matches:
        label = f"{row[1]} ({row[2]})"
        item_options.append(label)
        new_item_mapping[label] = row[0]

    print(f"Debug - Update Dropdown - New Item Mapping: {new_item_mapping}")
    return gr.update(choices=item_options), new_item_mapping
def get_media_id(selected_item, item_mapping):
    """Return the id stored for `selected_item` in `item_mapping`, or None if absent."""
    media_id = item_mapping.get(selected_item)
    return media_id
def update_detailed_view(item, item_mapping):
    """
    Build the HTML shown in the detailed view for the selected item.

    Args:
        item: The selected dropdown label (may be empty/None).
        item_mapping: Mapping from dropdown label to media id. ``None`` is
            treated like an empty mapping.

    Returns:
        A `(details_html, content_html)` tuple. When nothing is selected, or
        the label has no id in the mapping, both entries are
        "No item selected"; when the item has no content, prompt or summary
        both entries are "No details available.".

    NOTE(review): the HTML markup in the string literals had been lost (the
    literals were split across lines and did not parse); the tags below are a
    reconstruction and should be checked against the intended layout.
    """
    if not item:
        return "No item selected", "No item selected"

    item_id = (item_mapping or {}).get(item)
    if not item_id:
        return "No item selected", "No item selected"

    content, prompt, summary = fetch_item_details(item_id)
    if not (content or prompt or summary):
        return "No details available.", "No details available."

    details_html = "<h4>Details:</h4>"
    if prompt:
        formatted_prompt = format_transcription(prompt)
        details_html += f"<h4>Prompt:</h4><p>{formatted_prompt}</p>"
    if summary:
        formatted_summary = format_transcription(summary)
        details_html += f"<h4>Summary:</h4><p>{formatted_summary}</p>"

    # Format the transcription content for better readability
    formatted_content = format_transcription(content)
    content_html = f"<h4>Transcription:</h4><div style='white-space: pre-wrap;'>{formatted_content}</div>"
    return details_html, content_html
def format_content(content):
    """Wrap `content` in a fenced markdown code block and return the result."""
    return f"```\n{content}\n```"
def update_prompt_dropdown():
    """Return a Gradio update whose choices are the names from `list_prompts()`."""
    return gr.update(choices=list_prompts())
def display_prompt_details(selected_prompt):
    """
    Render the details of the selected prompt as an HTML string.

    The prompt is looked up through `update_user_prompt`, whose result is
    indexed with the keys "title", "details", "system_prompt" and
    "user_prompt". The "User" section is omitted when the user prompt is empty.

    Returns:
        The concatenated HTML, or "No details available." when nothing is
        selected or the looked-up prompt has no title.

    NOTE(review): the HTML markup in the string literals had been lost (the
    literals were split across lines and did not parse); the tags below are a
    reconstruction and should be checked against the intended layout.
    """
    if not selected_prompt:
        return "No details available."

    prompts = update_user_prompt(selected_prompt)
    if not prompts["title"]:  # No title means there is nothing to show
        return "No details available."

    details_str = f"<h4>Details:</h4><p>{prompts['details']}</p>"
    system_str = f"<h4>System:</h4><p>{prompts['system_prompt']}</p>"
    user_str = f"<h4>User:</h4><p>{prompts['user_prompt']}</p>" if prompts['user_prompt'] else ""
    return details_str + system_str + user_str
def search_media_database(query: str) -> List[Tuple[int, str, str]]:
    """Search media by title; a thin wrapper around `browse_items(query, 'Title')`."""
    title_matches = browse_items(query, 'Title')
    return title_matches
def load_media_content(media_id: int) -> dict:
    """
    Load content, prompt and summary for `media_id` into a dict.

    The values come from `fetch_item_details`. If that does not return a
    3-tuple, all three fields are treated as empty. Empty fields are replaced
    by "No ... available" placeholders. If anything raises, a dict with three
    empty strings is returned instead. Progress is printed for debugging.

    Returns:
        A dict with the keys "content", "prompt" and "summary".
    """
    try:
        print(f"Debug - Load Media Content - Media ID: {media_id}")
        item_details = fetch_item_details(media_id)
        print(f"Debug - Load Media Content - Item Details: \n\n{item_details}\n\n\n\n")

        is_triple = isinstance(item_details, tuple) and len(item_details) == 3
        if not is_triple:
            print(f"Debug - Load Media Content - Unexpected item_details format: \n\n{item_details}\n\n\n\n")
            item_details = ("", "", "")
        content, prompt, summary = item_details

        loaded = {}
        loaded["content"] = content or "No content available"
        loaded["prompt"] = prompt or "No prompt available"
        loaded["summary"] = summary or "No summary available"
        return loaded
    except Exception as e:
        print(f"Debug - Load Media Content - Error: {str(e)}")
        return {"content": "", "prompt": "", "summary": ""}
def error_handler(func):
    """
    Decorator that converts exceptions raised by `func` into an error dict.

    On success the wrapped function's return value is passed through. On any
    Exception the message and traceback are logged and a dict with the keys
    "error" (message naming the function) and "details" (formatted traceback)
    is returned instead of raising.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            outcome = func(*args, **kwargs)
        except Exception as e:
            error_message = f"Error in {func.__name__}: {str(e)}"
            trace_text = traceback.format_exc()
            logging.error(f"{error_message}\n{trace_text}")
            return {"error": error_message, "details": trace_text}
        return outcome
    return wrapper
def create_chunking_inputs():
    """
    Create the Gradio inputs used to configure text chunking.

    For each of words, sentences, paragraphs and tokens a checkbox and a
    numeric limit are created. Three further components for semantic chunking
    are instantiated as well but are not part of the returned list.

    Returns:
        A list of eight components in the order: words checkbox, max words,
        sentences checkbox, max sentences, paragraphs checkbox, max
        paragraphs, tokens checkbox, max tokens.
    """
    by_words = gr.Checkbox(label="Chunk Text by Words", value=False, visible=True)
    word_limit = gr.Number(label="Max Words", value=300, precision=0, visible=True)

    by_sentences = gr.Checkbox(label="Chunk Text by Sentences", value=False, visible=True)
    sentence_limit = gr.Number(label="Max Sentences", value=10, precision=0, visible=True)

    by_paragraphs = gr.Checkbox(label="Chunk Text by Paragraphs", value=False, visible=True)
    paragraph_limit = gr.Number(label="Max Paragraphs", value=5, precision=0, visible=True)

    by_tokens = gr.Checkbox(label="Chunk Text by Tokens", value=False, visible=True)
    token_limit = gr.Number(label="Max Tokens", value=1000, precision=0, visible=True)

    # Semantic chunking components: created here, but not handed back to the caller.
    gr.Checkbox(label="Semantic Chunking by Sentence similarity", value=False, visible=True)
    gr.Number(label="Max Chunk Size", value=2000, visible=True)
    gr.Number(label="Max Chunk Overlap Size", value=100, visible=True)

    returned_inputs = [by_words, word_limit, by_sentences, sentence_limit]
    returned_inputs += [by_paragraphs, paragraph_limit, by_tokens, token_limit]
    return returned_inputs
#
# End of miscellaneous unsorted functions
#######################################################################################################################
#
# Start of Video/Audio Transcription and Summarization Functions
def create_introduction_tab():
with (gr.TabItem("Introduction")):
gr.Markdown("# tldw: Your LLM-powered Research Multi-tool")
with gr.Row():
with gr.Column():
gr.Markdown("""### What can it do?
- Transcribe and summarize videos from URLs/Local files
- Transcribe and Summarize Audio files/Podcasts (URL/local file)
- Summarize articles from URLs/Local notes
- Ingest and summarize books(epub/PDF)
- Ingest and summarize research papers (PDFs - WIP)
- Search and display ingested content + summaries
- Create and manage custom prompts
- Chat with an LLM of your choice to generate content using the selected item + Prompts
- Keyword support for content search and display
- Export keywords/items to markdown/CSV(csv is wip)
- Import existing notes from Obsidian to the database (Markdown/txt files or a zip containing a collection of files)
- View and manage chat history
- Writing Tools: Grammar & Style check, Tone Analyzer & Editor, more planned...
- RAG (Retrieval-Augmented Generation) support for content generation(think about asking questions about your entire library of items)
- More features planned...
- All powered by your choice of LLM.
- Currently supports: Local-LLM(llamafile-server), OpenAI, Anthropic, Cohere, Groq, DeepSeek, OpenRouter, Llama.cpp, Kobold, Ooba, Tabbyapi, VLLM and more to come...
- All data is stored locally in a SQLite database for easy access and management.
- No trackers (Gradio has some analytics but it's disabled here...)
- No ads, no tracking, no BS. Just you and your content.
- Open-source and free to use. Contributions welcome!
- If you have any thoughts or feedback, please let me know on github or via email.
""")
gr.Markdown("""Follow this project at [tl/dw: Too Long, Didn't Watch - Your Personal Research Multi-Tool - GitHub](https://github.com/rmusser01/tldw)""")
with gr.Column():
gr.Markdown("""### How to use:
##### Quick Start: Just click on the appropriate tab for what you're trying to do and fill in the required fields. Click "Process