oceansweep committed
Commit: 45e1f81
Parent: 354deab

Upload 28 files

App_Function_Libraries/Books/Book_Ingestion_Lib.py ADDED
@@ -0,0 +1,170 @@
+ # Book_Ingestion_Lib.py
+ #########################################
+ # Library to hold functions for ingesting book files.
+ #
+ ####################
+ # Function List
+ #
+ # 1. ingest_text_file(file_path, title=None, author=None, keywords=None):
+ # 2.
+ #
+ #
+ ####################
+ #
+ # Import necessary libraries
+ import os
+ import re
+ from datetime import datetime
+ import logging
+
+ import ebooklib
+ from bs4 import BeautifulSoup
+ from ebooklib import epub
+ #
+ # Import Local
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
+ #
+ #######################################################################################################################
+ # Function Definitions
+ #
+
+
+ def read_epub(file_path):
+     """Read and extract text from an EPUB file."""
+     book = epub.read_epub(file_path)
+     chapters = []
+     for item in book.get_items():
+         if item.get_type() == ebooklib.ITEM_DOCUMENT:
+             chapters.append(item.get_content())
+
+     text = ""
+     for html_content in chapters:
+         soup = BeautifulSoup(html_content, 'html.parser')
+         text += soup.get_text() + "\n\n"
+     return text
+
+
+ # Extract Title/Author metadata from the header of a converted-EPUB text file
+ def extract_epub_metadata(content):
+     title_match = re.search(r'Title:\s*(.*?)\n', content)
+     author_match = re.search(r'Author:\s*(.*?)\n', content)
+
+     title = title_match.group(1) if title_match else None
+     author = author_match.group(1) if author_match else None
+
+     return title, author
+
+
+ # Ingest a text file into the database with Title/Author/Keywords
+ def ingest_text_file(file_path, title=None, author=None, keywords=None):
+     try:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             content = file.read()
+
+         # Check if it's a converted epub and extract metadata if so
+         if 'epub_converted' in (keywords or ''):
+             extracted_title, extracted_author = extract_epub_metadata(content)
+             title = title or extracted_title
+             author = author or extracted_author
+
+         # If title is still not provided, use the filename without extension
+         if not title:
+             title = os.path.splitext(os.path.basename(file_path))[0]
+
+         # If author is still not provided, set it to 'Unknown'
+         if not author:
+             author = 'Unknown'
+
+         # If keywords are not provided, use a default keyword
+         if not keywords:
+             keywords = 'text_file,epub_converted'
+         else:
+             keywords = f'text_file,epub_converted,{keywords}'
+
+         # Add the text file to the database
+         add_media_with_keywords(
+             url=file_path,
+             title=title,
+             media_type='document',
+             content=content,
+             keywords=keywords,
+             prompt='No prompt for text files',
+             summary='No summary for text files',
+             transcription_model='None',
+             author=author,
+             ingestion_date=datetime.now().strftime('%Y-%m-%d')
+         )
+
+         return f"Text file '{title}' by {author} ingested successfully."
+     except Exception as e:
+         logging.error(f"Error ingesting text file: {str(e)}")
+         return f"Error ingesting text file: {str(e)}"
+
+
+ def ingest_folder(folder_path, keywords=None):
+     results = []
+     for filename in os.listdir(folder_path):
+         if filename.lower().endswith('.txt'):
+             file_path = os.path.join(folder_path, filename)
+             result = ingest_text_file(file_path, keywords=keywords)
+             results.append(result)
+     return results
+
+
+ def epub_to_markdown(epub_path):
+     book = epub.read_epub(epub_path)
+     markdown_content = "# Table of Contents\n\n"
+     chapters = []
+
+     # Extract and format the table of contents
+     toc = book.toc
+     for item in toc:
+         if isinstance(item, tuple):
+             section, children = item
+             level = 1
+             markdown_content += format_toc_item(section, level)
+             for child in children:
+                 markdown_content += format_toc_item(child, level + 1)
+         else:
+             markdown_content += format_toc_item(item, 1)
+
+     markdown_content += "\n---\n\n"
+
+     # Process each chapter
+     for item in book.get_items():
+         if item.get_type() == ebooklib.ITEM_DOCUMENT:
+             chapter_content = item.get_content().decode('utf-8')
+             soup = BeautifulSoup(chapter_content, 'html.parser')
+
+             # Extract chapter title
+             title = soup.find(['h1', 'h2', 'h3'])
+             if title:
+                 chapter_title = title.get_text()
+                 markdown_content += f"# {chapter_title}\n\n"
+
+             # Process chapter content
+             for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
+                 if elem.name.startswith('h'):
+                     level = int(elem.name[1])
+                     markdown_content += f"{'#' * level} {elem.get_text()}\n\n"
+                 elif elem.name == 'p':
+                     markdown_content += f"{elem.get_text()}\n\n"
+                 elif elem.name in ['ul', 'ol']:
+                     for li in elem.find_all('li'):
+                         markdown_content += f"- {li.get_text()}\n"
+                     markdown_content += "\n"
+
+             markdown_content += "---\n\n"
+
+     return markdown_content
+
+
+ def format_toc_item(item, level):
+     return f"{' ' * (level - 1)}- [{item.title}](#{slugify(item.title)})\n"
+
+
+ def slugify(text):
+     return re.sub(r'[\W_]+', '-', text.lower())
+
+ #
+ # End of Function Definitions
+ #######################################################################################################################
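
For reference, a minimal usage sketch of the new ingestion helpers. The paths and the Title:/Author: header below are hypothetical; epub_to_markdown() only returns a Markdown string, so writing it to disk before ingestion is the caller's job:

    from App_Function_Libraries.Books.Book_Ingestion_Lib import epub_to_markdown, ingest_text_file

    # Convert an EPUB to Markdown and save it as a text file (hypothetical paths).
    markdown = epub_to_markdown('/tmp/example.epub')
    with open('/tmp/example.txt', 'w', encoding='utf-8') as f:
        f.write(f"Title: Example Book\nAuthor: Jane Doe\n\n{markdown}")

    # The 'epub_converted' keyword makes ingest_text_file() run extract_epub_metadata()
    # against the Title:/Author: header lines written above.
    print(ingest_text_file('/tmp/example.txt', keywords='epub_converted'))
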
App_Function_Libraries/Books/__init__.py ADDED
File without changes
App_Function_Libraries/DB/DB_Manager.py CHANGED
@@ -5,8 +5,8 @@
 import configparser
 import os
 import logging
- from typing import Tuple, List, Union, Dict
 import time
+ from typing import Tuple, List, Union, Dict
 #
 # 3rd-Party Libraries
 from elasticsearch import Elasticsearch
@@ -64,7 +64,11 @@ from App_Function_Libraries.DB.SQLite_DB import (
     get_paginated_files as sqlite_get_paginated_files, get_media_title as sqlite_get_media_title, \
     get_all_content_from_database as sqlite_get_all_content_from_database,
     get_next_media_id as sqlite_get_next_media_id, \
-     batch_insert_chunks as sqlite_batch_insert_chunks, Database,
+     batch_insert_chunks as sqlite_batch_insert_chunks, Database, save_workflow_chat_to_db as sqlite_save_workflow_chat_to_db, \
+     get_workflow_chat as sqlite_get_workflow_chat, update_media_content_with_version as sqlite_update_media_content_with_version, \
+     check_existing_media as sqlite_check_existing_media, get_all_document_versions as sqlite_get_all_document_versions, \
+     fetch_paginated_data as sqlite_fetch_paginated_data, get_latest_transcription as sqlite_get_latest_transcription, \
+
 )
 #
 # Local Imports
@@ -320,6 +324,19 @@ def add_media_to_database(*args, **kwargs):
         # Implement Elasticsearch version
         raise NotImplementedError("Elasticsearch version of add_media_to_database not yet implemented")
 
+ def check_existing_media(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_check_existing_media(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of check_existing_media not yet implemented")
+
+ def update_media_content_with_version(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_update_media_content_with_version(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of update_media_content not yet implemented")
 
 def import_obsidian_note_to_db(*args, **kwargs):
     if db_type == 'sqlite':
@@ -501,6 +518,24 @@ def mark_as_trash(media_id: int) -> None:
     else:
         raise ValueError(f"Unsupported database type: {db_type}")
 
+
+ def get_latest_transcription(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_get_latest_transcription(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of get_latest_transcription not yet implemented")
+
+ def fetch_paginated_data(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_fetch_paginated_data(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of fetch_paginated_data not yet implemented")
+     else:
+         raise ValueError(f"Unsupported database type: {db_type}")
+
+
 def get_media_content(media_id: int) -> str:
     if db_type == 'sqlite':
         return sqlite_get_media_content(media_id)
@@ -541,6 +576,29 @@ def get_specific_summary(summary_id: int) -> Dict:
     else:
         raise ValueError(f"Unsupported database type: {db_type}")
 
+ def fetch_item_details_single(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_fetch_item_details(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of fetch_item_details not yet implemented")
+     else:
+         raise ValueError(f"Unsupported database type: {db_type}")
+
+ def get_all_document_versions(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_get_all_document_versions(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of get_all_document_versions not yet implemented")
+     else:
+         raise ValueError(f"Unsupported database type: {db_type}")
+ #
+ #
+ ############################################################################################################
+ #
+ # Prompt Functions:
+
 def get_media_prompts(media_id: int) -> List[Dict]:
     if db_type == 'sqlite':
         return sqlite_get_media_prompts(media_id)
@@ -790,9 +848,35 @@ def get_document_version(*args, **kwargs):
 # End of Document Versioning Functions
 ############################################################################################################
 
- def close_connection():
+
+ ############################################################################################################
+ #
+ # Workflow Functions
+
+ def get_workflow_chat(*args, **kwargs):
+     if db_type == 'sqlite':
+         return sqlite_get_workflow_chat(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of get_workflow_chat not yet implemented")
+
+
+ def save_workflow_chat_to_db(*args, **kwargs):
     if db_type == 'sqlite':
-         db.get_connection().close()
+         # FIXME
+         return sqlite_save_workflow_chat_to_db(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of save_workflow_chat_to_db not yet implemented")
+
+ #
+ # End of Workflow Functions
+ ############################################################################################################
+
+ # Dead code FIXME
+ # def close_connection():
+ #     if db_type == 'sqlite':
+ #         db.get_connection().close()
 
 #
 # End of file
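
Every wrapper added to DB_Manager here follows the same dispatch shape: call the sqlite_-prefixed backend function when db_type is 'sqlite', raise NotImplementedError for 'elasticsearch', and in some cases raise ValueError for anything else. A minimal sketch of the pattern, with do_something/sqlite_do_something as hypothetical names:

    def do_something(*args, **kwargs):
        if db_type == 'sqlite':
            return sqlite_do_something(*args, **kwargs)
        elif db_type == 'elasticsearch':
            # Implement Elasticsearch version
            raise NotImplementedError("Elasticsearch version of do_something not yet implemented")
        else:
            raise ValueError(f"Unsupported database type: {db_type}")

Worth noting: wrappers such as check_existing_media and get_workflow_chat omit the final else branch, so an unrecognized db_type falls through and silently returns None for those.
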
App_Function_Libraries/DB/SQLite_DB.py CHANGED
@@ -46,6 +46,7 @@ import configparser
 #
 # Import necessary libraries
 import csv
+ import hashlib
 import html
 import logging
 import os
@@ -53,9 +54,13 @@ import queue
 import re
 import shutil
 import sqlite3
+ import threading
 import traceback
+ from contextlib import contextmanager
 from datetime import datetime, timedelta
 from typing import List, Tuple, Dict, Any, Optional
+ from urllib.parse import quote
+
 # Local Libraries
 from App_Function_Libraries.Utils.Utils import get_project_relative_path, get_database_path, \
     get_database_dir
@@ -219,39 +224,62 @@ class DatabaseError(Exception):
 class InputError(Exception):
     pass
 
+
 class Database:
     def __init__(self, db_name='media_summary.db'):
         self.db_path = get_database_path(db_name)
-         self.timeout = 60.0  # 60 seconds timeout
+         self.timeout = 10.0
+         self._local = threading.local()
 
+     @contextmanager
     def get_connection(self):
-         return sqlite3.connect(self.db_path, timeout=self.timeout)
-
-     def execute_query(self, query: str, params: Tuple = ()) -> None:
+         if not hasattr(self._local, 'connection') or self._local.connection is None:
+             self._local.connection = sqlite3.connect(self.db_path, timeout=self.timeout)
+             self._local.connection.isolation_level = None  # This enables autocommit mode
+         yield self._local.connection
+
+     def close_connection(self):
+         if hasattr(self._local, 'connection') and self._local.connection:
+             self._local.connection.close()
+             self._local.connection = None
+
+     @contextmanager
+     def transaction(self):
         with self.get_connection() as conn:
             try:
-                 cursor = conn.cursor()
-                 cursor.execute(query, params)
-                 conn.commit()
-             except sqlite3.Error as e:
-                 logging.error(f"Database error: {e}, Query: {query}")
-                 raise DatabaseError(f"Database error: {e}, Query: {query}")
+                 conn.execute("BEGIN")
+                 yield conn
+                 conn.execute("COMMIT")
+             except Exception:
+                 conn.execute("ROLLBACK")
+                 raise
+
+     def execute_query(self, query: str, params: Tuple = ()) -> Any:
+         with self.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute(query, params)
+             if query.strip().upper().startswith("SELECT"):
+                 return cursor.fetchall()
+             else:
+                 return cursor.rowcount
 
     def execute_many(self, query: str, params_list: List[Tuple]) -> None:
         with self.get_connection() as conn:
-             try:
-                 cursor = conn.cursor()
-                 cursor.executemany(query, params_list)
-                 conn.commit()
-             except sqlite3.Error as e:
-                 logging.error(f"Database error: {e}, Query: {query}")
-                 raise DatabaseError(f"Database error: {e}, Query: {query}")
+             cursor = conn.cursor()
+             cursor.executemany(query, params_list)
+
+     def table_exists(self, table_name: str) -> bool:
+         query = 'SELECT name FROM sqlite_master WHERE type="table" AND name=?'
+         result = self.execute_query(query, (table_name,))
+         return bool(result)
 
 db = Database()
 
- def instantiate_sqlite_db():
-     global sqlite_db
-     sqlite_db = Database()
+ # Usage example:
+ if db.table_exists('DocumentVersions'):
+     logging.info("DocumentVersions table exists")
+ else:
+     logging.error("DocumentVersions table does not exist")
 
 
 # Function to create tables with the new media schema
@@ -399,7 +427,6 @@ def create_tables(db) -> None:
     'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_media_id ON UnvectorizedMediaChunks(media_id)',
     'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_is_processed ON UnvectorizedMediaChunks(is_processed)',
     'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_chunk_type ON UnvectorizedMediaChunks(chunk_type)',
-     # CREATE UNIQUE INDEX statements
     'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_url ON Media(url)',
     'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_keyword ON MediaKeywords(media_id, keyword_id)',
     'CREATE INDEX IF NOT EXISTS idx_document_versions_media_id ON DocumentVersions(media_id)',
@@ -426,6 +453,14 @@ def create_tables(db) -> None:
 
 create_tables(db)
 
+ #
+ # End of DB Setup Functions
+ #######################################################################################################################
+
+
+ #######################################################################################################################
+ #
+ # Media-related Functions
 
 def check_media_exists(title: str, url: str) -> Optional[int]:
     try:
@@ -560,11 +595,20 @@ def add_keyword(keyword: str) -> int:
     with db.get_connection() as conn:
         cursor = conn.cursor()
         try:
+             # Insert into Keywords table
             cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
+
+             # Get the keyword_id (whether it was just inserted or already existed)
             cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
             keyword_id = cursor.fetchone()[0]
-             cursor.execute('INSERT OR IGNORE INTO keyword_fts (rowid, keyword) VALUES (?, ?)', (keyword_id, keyword))
-             logging.info(f"Keyword '{keyword}' added to keyword_fts with ID: {keyword_id}")
+
+             # Check if the keyword exists in keyword_fts
+             cursor.execute('SELECT rowid FROM keyword_fts WHERE rowid = ?', (keyword_id,))
+             if not cursor.fetchone():
+                 # If it doesn't exist in keyword_fts, insert it
+                 cursor.execute('INSERT OR IGNORE INTO keyword_fts (rowid, keyword) VALUES (?, ?)', (keyword_id, keyword))
+
+             logging.info(f"Keyword '{keyword}' added or updated with ID: {keyword_id}")
             conn.commit()
             return keyword_id
         except sqlite3.IntegrityError as e:
@@ -575,6 +619,7 @@ def add_keyword(keyword: str) -> int:
         raise DatabaseError(f"Error adding keyword: {e}")
 
 
+
 # Function to delete a keyword
 def delete_keyword(keyword: str) -> str:
     keyword = keyword.strip().lower()
@@ -881,10 +926,12 @@ def browse_items(search_query, search_type):
 
 
 # Function to fetch item details
+
 def fetch_item_details(media_id: int):
     try:
         with db.get_connection() as conn:
             cursor = conn.cursor()
+             # Fetch the latest prompt and summary from MediaModifications
             cursor.execute("""
                 SELECT prompt, summary
                 FROM MediaModifications
@@ -893,18 +940,19 @@ def fetch_item_details(media_id: int):
                 LIMIT 1
             """, (media_id,))
             prompt_summary_result = cursor.fetchone()
+
+             # Fetch the latest transcription
             cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
             content_result = cursor.fetchone()
 
-             prompt = prompt_summary_result[0] if prompt_summary_result else ""
-             summary = prompt_summary_result[1] if prompt_summary_result else ""
-             content = content_result[0] if content_result else ""
+             prompt = prompt_summary_result[0] if prompt_summary_result else "No prompt available."
+             summary = prompt_summary_result[1] if prompt_summary_result else "No summary available."
+             content = content_result[0] if content_result else "No content available."
 
-             return content, prompt, summary
+             return prompt, summary, content
     except sqlite3.Error as e:
         logging.error(f"Error fetching item details: {e}")
-         # Return empty strings if there's an error
-         return "", "", ""
+         return "Error fetching prompt.", "Error fetching summary.", "Error fetching media."
 
 #
 # End of Media-related Functions
@@ -916,7 +964,6 @@ def fetch_item_details(media_id: int):
 # Media-related Functions
 
 
-
 # Function to add a version of a prompt and summary
 def add_media_version(conn, media_id: int, prompt: str, summary: str) -> None:
     try:
@@ -1113,12 +1160,22 @@ def is_valid_date(date_string: str) -> bool:
     except ValueError:
         return False
 
- def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'):
+
+
+
+ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video', overwrite=False):
     db = Database()
     try:
         with db.get_connection() as conn:
             cursor = conn.cursor()
 
+             # Generate URL if not provided
+             if not url:
+                 title = info_dict.get('title', 'Untitled')
+                 url_hash = hashlib.md5(f"{title}{media_type}".encode()).hexdigest()
+                 url = f"https://No-URL-Submitted.com/{media_type}/{quote(title)}-{url_hash}"
+
+
             # Extract content from segments
             if isinstance(segments, list):
                 content = ' '.join([segment.get('Text', '') for segment in segments if 'Text' in segment])
@@ -1140,13 +1197,14 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
             existing_media = cursor.fetchone()
 
             if existing_media:
-                 media_id = existing_media[0]
-                 cursor.execute('''
-                     UPDATE Media
-                     SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?, chunking_status = ?
-                     WHERE id = ?
-                 ''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
-                       info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
+                 if overwrite:
+                     media_id = existing_media[0]
+                     cursor.execute('''
+                         UPDATE Media
+                         SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?, chunking_status = ?
+                         WHERE id = ?
+                     ''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
+                           info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
             else:
                 cursor.execute('''
                     INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model, chunking_status)
@@ -1186,7 +1244,8 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
             # Schedule chunking
             schedule_chunking(media_id, content, info_dict.get('title', 'Untitled'))
 
-             return f"Media '{info_dict.get('title', 'Untitled')}' added/updated successfully with keywords: {', '.join(keyword_list)}. Chunking scheduled."
+             action = "updated" if existing_media and overwrite else "added"
+             return f"Media '{info_dict.get('title', 'Untitled')}' {action} successfully with URL: {url} and keywords: {', '.join(keyword_list)}. Chunking scheduled."
 
     except DatabaseError as e:
         logging.error(f"Database error: {e}")
@@ -1196,6 +1255,66 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
         raise DatabaseError(f"Unexpected error: {e}")
 
 
+ def check_existing_media(url):
+     db = Database()
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
+             result = cursor.fetchone()
+             return {'id': result[0]} if result else None
+     except Exception as e:
+         logging.error(f"Error checking existing media: {e}")
+         return None
+
+
+ # Modified update_media_content function to create a new version
+ def update_media_content_with_version(media_id, info_dict, content_input, prompt_input, summary_input, whisper_model):
+     db = Database()
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+
+             # Create new document version
+             cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
+             current_version = cursor.fetchone()[0] or 0
+             new_version = current_version + 1
+
+             # Insert new version
+             cursor.execute('''
+                 INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
+                 VALUES (?, ?, ?, ?, ?)
+             ''', (media_id, new_version, prompt_input, summary_input, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
+
+             # Update the main content in the Media table
+             cursor.execute('''
+                 UPDATE Media
+                 SET content = ?, transcription_model = ?, title = ?, author = ?, ingestion_date = ?, chunking_status = ?
+                 WHERE id = ?
+             ''', (content_input, whisper_model, info_dict.get('title', 'Untitled'),
+                   info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
+
+             # Update or insert into MediaModifications
+             cursor.execute('''
+                 INSERT OR REPLACE INTO MediaModifications (media_id, prompt, summary, modification_date)
+                 VALUES (?, ?, ?, ?)
+             ''', (media_id, prompt_input, summary_input, datetime.now().strftime('%Y-%m-%d')))
+
+             # Update full-text search index
+             cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
+                            (media_id, info_dict.get('title', 'Untitled'), content_input))
+
+             conn.commit()
+
+             # Schedule chunking
+             schedule_chunking(media_id, content_input, info_dict.get('title', 'Untitled'))
+
+             return f"Content updated successfully for media ID: {media_id}. New version: {new_version}"
+     except Exception as e:
+         logging.error(f"Error updating media content: {e}")
+         return f"Error updating content: {str(e)}"
+
+
 # FIXME: This function is not complete and needs to be implemented
 def schedule_chunking(media_id: int, content: str, media_name: str):
     try:
@@ -1622,13 +1741,14 @@ def fetch_item_details_single(media_id: int):
             cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
             content_result = cursor.fetchone()
 
-             prompt = prompt_summary_result[0] if prompt_summary_result else ""
-             summary = prompt_summary_result[1] if prompt_summary_result else ""
-             content = content_result[0] if content_result else ""
+             prompt = prompt_summary_result[0] if prompt_summary_result else "No prompt available."
+             summary = prompt_summary_result[1] if prompt_summary_result else "No summary available."
+             content = content_result[0] if content_result else "No content available."
 
             return prompt, summary, content
     except sqlite3.Error as e:
-         raise Exception(f"Error fetching item details: {e}")
+         logging.error(f"Error fetching item details: {e}")
+         return "Error fetching prompt.", "Error fetching summary.", "Error fetching content."
 
 
 
@@ -2045,6 +2165,22 @@ def get_transcripts(media_id):
         logging.error(f"Error in get_transcripts: {str(e)}")
         return []
 
+ def get_latest_transcription(media_id: int):
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute("""
+                 SELECT transcription
+                 FROM Transcripts
+                 WHERE media_id = ?
+                 ORDER BY created_at DESC
+                 LIMIT 1
+             """, (media_id,))
+             result = cursor.fetchone()
+             return result[0] if result else "No transcription available."
+     except sqlite3.Error as e:
+         logging.error(f"Error fetching latest transcription: {e}")
+         return "Error fetching transcription."
 
 #
 # End of Functions to Compare Transcripts
@@ -2436,32 +2572,57 @@ def get_paginated_files(page: int = 1, results_per_page: int = 50) -> Tuple[List
 #
 # Functions to manage document versions
 
+
 def create_document_version(media_id: int, content: str) -> int:
+     logging.info(f"Attempting to create document version for media_id: {media_id}")
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
+
+             # Start a transaction
+             cursor.execute("BEGIN EXCLUSIVE TRANSACTION")
 
-             # Get the latest version number
-             cursor.execute('''
-                 SELECT MAX(version_number)
-                 FROM DocumentVersions
-                 WHERE media_id = ?
-             ''', (media_id,))
+             try:
+                 # Verify media_id exists and get the latest version in one query
+                 cursor.execute('''
+                     SELECT m.id, COALESCE(MAX(dv.version_number), 0)
+                     FROM Media m
+                     LEFT JOIN DocumentVersions dv ON m.id = dv.media_id
+                     WHERE m.id = ?
+                     GROUP BY m.id
+                 ''', (media_id,))
+                 result = cursor.fetchone()
 
-             latest_version = cursor.fetchone()[0] or 0
-             new_version = latest_version + 1
+                 if not result:
+                     raise ValueError(f"No Media entry found for id: {media_id}")
 
-             # Insert new version
-             cursor.execute('''
-                 INSERT INTO DocumentVersions (media_id, version_number, content)
-                 VALUES (?, ?, ?)
-             ''', (media_id, new_version, content))
+                 _, latest_version = result
+                 new_version = latest_version + 1
 
-             conn.commit()
-             return new_version
+                 logging.debug(f"Inserting new version {new_version} for media_id: {media_id}")
+
+                 # Insert new version
+                 cursor.execute('''
+                     INSERT INTO DocumentVersions (media_id, version_number, content)
+                     VALUES (?, ?, ?)
+                 ''', (media_id, new_version, content))
+
+                 # Commit the transaction
+                 conn.commit()
+                 logging.info(f"Successfully created document version {new_version} for media_id: {media_id}")
+                 return new_version
+             except Exception as e:
+                 # If any error occurs, roll back the transaction
+                 conn.rollback()
+                 raise e
    except sqlite3.Error as e:
-         logging.error(f"Error creating document version: {e}")
-         raise DatabaseError(f"Error creating document version: {e}")
+         logging.error(f"Database error creating document version: {e}")
+         logging.error(f"Error details - media_id: {media_id}, content length: {len(content)}")
+         raise DatabaseError(f"Failed to create document version: {e}")
+     except Exception as e:
+         logging.error(f"Unexpected error creating document version: {e}")
+         logging.error(f"Error details - media_id: {media_id}, content length: {len(content)}")
+         raise
 
 
 def get_document_version(media_id: int, version_number: int = None) -> Dict[str, Any]:
@@ -2501,6 +2662,91 @@ def get_document_version(media_id: int, version_number: int = None) -> Dict[str,
         logging.error(error_message)
         return {'error': error_message}
 
+ def get_all_document_versions(media_id: int) -> List[Dict[str, Any]]:
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute('''
+                 SELECT id, version_number, content, created_at
+                 FROM DocumentVersions
+                 WHERE media_id = ?
+                 ORDER BY version_number DESC
+             ''', (media_id,))
+             results = cursor.fetchall()
+
+             if results:
+                 return [
+                     {
+                         'id': row[0],
+                         'version_number': row[1],
+                         'content': row[2],
+                         'created_at': row[3]
+                     }
+                     for row in results
+                 ]
+             else:
+                 return []
+     except sqlite3.Error as e:
+         error_message = f"Error retrieving all document versions: {e}"
+         logging.error(error_message)
+         return [{'error': error_message}]
+
+ def delete_document_version(media_id: int, version_number: int) -> Dict[str, Any]:
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute('''
+                 DELETE FROM DocumentVersions
+                 WHERE media_id = ? AND version_number = ?
+             ''', (media_id, version_number))
+             conn.commit()
+
+             if cursor.rowcount > 0:
+                 return {'success': f"Document version {version_number} for media_id {media_id} deleted successfully"}
+             else:
+                 return {'error': f"No document version found for media_id {media_id} and version_number {version_number}"}
+     except sqlite3.Error as e:
+         error_message = f"Error deleting document version: {e}"
+         logging.error(error_message)
+         return {'error': error_message}
+
+ def rollback_to_version(media_id: int, version_number: int) -> Dict[str, Any]:
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+
+             # Get the content of the version to rollback to
+             cursor.execute('''
+                 SELECT content
+                 FROM DocumentVersions
+                 WHERE media_id = ? AND version_number = ?
+             ''', (media_id, version_number))
+             result = cursor.fetchone()
+
+             if not result:
+                 return {'error': f"No document version found for media_id {media_id} and version_number {version_number}"}
+
+             rollback_content = result[0]
+
+             # Create a new version with the content of the version to rollback to
+             cursor.execute('''
+                 INSERT INTO DocumentVersions (media_id, version_number, content)
+                 VALUES (?, (SELECT COALESCE(MAX(version_number), 0) + 1 FROM DocumentVersions WHERE media_id = ?), ?)
+             ''', (media_id, media_id, rollback_content))
+
+             new_version_number = cursor.lastrowid
+
+             conn.commit()
+
+             return {
+                 'success': f"Rolled back to version {version_number} for media_id {media_id}",
+                 'new_version_number': new_version_number
+             }
+     except sqlite3.Error as e:
+         error_message = f"Error rolling back to document version: {e}"
+         logging.error(error_message)
+         return {'error': error_message}
+
 #
 # End of Functions to manage document versions
 #######################################################################################################################
@@ -2653,3 +2899,102 @@ def update_media_table(db):
 #
 # End of Functions to manage media chunks
 #######################################################################################################################
+
+
+ #######################################################################################################################
+ #
+ # Workflow Functions
+
+ def save_workflow_chat_to_db(chat_history, workflow_name, conversation_id=None):
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+
+             if conversation_id is None:
+                 # Create a new conversation
+                 conversation_name = f"{workflow_name}_Workflow_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+                 cursor.execute('''
+                     INSERT INTO ChatConversations (media_id, media_name, conversation_name, created_at, updated_at)
+                     VALUES (NULL, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+                 ''', (workflow_name, conversation_name))
+                 conversation_id = cursor.lastrowid
+             else:
+                 # Update existing conversation
+                 cursor.execute('''
+                     UPDATE ChatConversations
+                     SET updated_at = CURRENT_TIMESTAMP
+                     WHERE id = ?
+                 ''', (conversation_id,))
+
+             # Save messages
+             for user_msg, ai_msg in chat_history:
+                 if user_msg:
+                     cursor.execute('''
+                         INSERT INTO ChatMessages (conversation_id, sender, message, timestamp)
+                         VALUES (?, 'user', ?, CURRENT_TIMESTAMP)
+                     ''', (conversation_id, user_msg))
+                 if ai_msg:
+                     cursor.execute('''
+                         INSERT INTO ChatMessages (conversation_id, sender, message, timestamp)
+                         VALUES (?, 'ai', ?, CURRENT_TIMESTAMP)
+                     ''', (conversation_id, ai_msg))
+
+             conn.commit()
+
+             return conversation_id, f"Chat saved successfully! Conversation ID: {conversation_id}"
+     except Exception as e:
+         logging.error(f"Error saving workflow chat to database: {str(e)}")
+         return None, f"Error saving chat to database: {str(e)}"
+
+
+ def get_workflow_chat(conversation_id):
+     """
+     Retrieve a workflow chat from the database.
+
+     Args:
+         conversation_id: ID of the conversation to retrieve
+
+     Returns:
+         tuple: (chat_history, workflow_name, status_message)
+     """
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+
+             # Get conversation details
+             cursor.execute('''
+                 SELECT media_name, conversation_name FROM ChatConversations
+                 WHERE id = ?
+             ''', (conversation_id,))
+             result = cursor.fetchone()
+             if not result:
+                 return None, None, "Conversation not found"
+
+             workflow_name, conversation_name = result
+
+             # Get chat messages
+             cursor.execute('''
+                 SELECT sender, message FROM ChatMessages
+                 WHERE conversation_id = ?
+                 ORDER BY timestamp
+             ''', (conversation_id,))
+             messages = cursor.fetchall()
+
+             chat_history = []
+             for sender, message in messages:
+                 if sender == 'user':
+                     chat_history.append((message, None))
+                 else:
+                     if chat_history and chat_history[-1][1] is None:
+                         chat_history[-1] = (chat_history[-1][0], message)
+                     else:
+                         chat_history.append((None, message))
+
+             return chat_history, workflow_name, f"Chat retrieved successfully"
+     except Exception as e:
+         logging.error(f"Error retrieving workflow chat from database: {str(e)}")
+         return None, None, f"Error retrieving chat from database: {str(e)}"
+
+ #
+ # End of Workflow Functions
+ #######################################################################################################################
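
A short sketch of how the reworked Database class is meant to be driven, assuming the thread-local connection, autocommit mode, and transaction() context manager behave as defined above (the SQL here is illustrative):

    from App_Function_Libraries.DB.SQLite_DB import Database

    db = Database('media_summary.db')

    # Autocommit path: execute_query() returns rows for SELECTs, a rowcount otherwise.
    rows = db.execute_query("SELECT id, title FROM Media LIMIT ?", (5,))

    # Explicit transaction: BEGIN ... COMMIT, with ROLLBACK on any exception.
    with db.transaction() as conn:
        conn.execute("UPDATE Media SET chunking_status = ? WHERE id = ?", ('pending', 1))

    db.close_connection()  # release this thread's cached connection
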
App_Function_Libraries/DB/Test_SQLite_DB.py ADDED
@@ -0,0 +1,202 @@
+ # Test_SQLite_DB.py
+ # Description: Test file for SQLite_DB.py
+ #
+ # Usage: python -m unittest test_sqlite_db.py
+ #
+ # Imports
+ import unittest
+ import sqlite3
+ import threading
+ import time
+ from unittest.mock import patch
+ #
+ # Local Imports
+ from App_Function_Libraries.DB.SQLite_DB import Database, add_media_with_keywords, add_media_version, DatabaseError
+ #
+ #######################################################################################################################
+ #
+ # Functions:
+
+ class TestDatabase(unittest.TestCase):
+     def setUp(self):
+         self.db = Database(':memory:')  # Use in-memory database for testing
+
+     def test_connection_management(self):
+         with self.db.get_connection() as conn:
+             self.assertIsInstance(conn, sqlite3.Connection)
+             self.assertEqual(len(self.db.pool), 1)
+
+     def test_execute_query(self):
+         self.db.execute_query("CREATE TABLE test (id INTEGER PRIMARY KEY, name TEXT)")
+         self.db.execute_query("INSERT INTO test (name) VALUES (?)", ("test_name",))
+         with self.db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT name FROM test")
+             result = cursor.fetchone()
+             self.assertEqual(result[0], "test_name")
+
+     def test_execute_many(self):
+         self.db.execute_query("CREATE TABLE test (id INTEGER PRIMARY KEY, name TEXT)")
+         data = [("name1",), ("name2",), ("name3",)]
+         self.db.execute_many("INSERT INTO test (name) VALUES (?)", data)
+         with self.db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT COUNT(*) FROM test")
+             count = cursor.fetchone()[0]
+             self.assertEqual(count, 3)
+
+     def test_connection_retry(self):
+         def lock_database():
+             with self.db.get_connection() as conn:
+                 cursor = conn.cursor()
+                 cursor.execute("BEGIN EXCLUSIVE TRANSACTION")
+                 time.sleep(2)  # Hold the lock for 2 seconds
+
+         thread = threading.Thread(target=lock_database)
+         thread.start()
+         time.sleep(0.1)  # Give the thread time to acquire the lock
+
+         with self.assertRaises(DatabaseError):
+             self.db.execute_query("SELECT 1")  # This should retry and eventually fail
+
+         thread.join()
+
+ class TestAddMediaWithKeywords(unittest.TestCase):
+     def setUp(self):
+         self.db = Database(':memory:')
+         self.db.execute_query("""
+             CREATE TABLE Media (
+                 id INTEGER PRIMARY KEY,
+                 url TEXT,
+                 title TEXT NOT NULL,
+                 type TEXT NOT NULL,
+                 content TEXT,
+                 author TEXT,
+                 ingestion_date TEXT,
+                 transcription_model TEXT
+             )
+         """)
+         self.db.execute_query("CREATE TABLE Keywords (id INTEGER PRIMARY KEY, keyword TEXT NOT NULL UNIQUE)")
+         self.db.execute_query("""
+             CREATE TABLE MediaKeywords (
+                 id INTEGER PRIMARY KEY,
+                 media_id INTEGER NOT NULL,
+                 keyword_id INTEGER NOT NULL,
+                 FOREIGN KEY (media_id) REFERENCES Media(id),
+                 FOREIGN KEY (keyword_id) REFERENCES Keywords(id)
+             )
+         """)
+         self.db.execute_query("""
+             CREATE TABLE MediaModifications (
+                 id INTEGER PRIMARY KEY,
+                 media_id INTEGER NOT NULL,
+                 prompt TEXT,
+                 summary TEXT,
+                 modification_date TEXT,
+                 FOREIGN KEY (media_id) REFERENCES Media(id)
+             )
+         """)
+         self.db.execute_query("""
+             CREATE TABLE MediaVersion (
+                 id INTEGER PRIMARY KEY,
+                 media_id INTEGER NOT NULL,
+                 version INTEGER NOT NULL,
+                 prompt TEXT,
+                 summary TEXT,
+                 created_at TEXT NOT NULL,
+                 FOREIGN KEY (media_id) REFERENCES Media(id)
+             )
+         """)
+         self.db.execute_query("CREATE VIRTUAL TABLE media_fts USING fts5(title, content)")
+
+     @patch('App_Function_Libraries.DB.SQLite_DB.db', new_callable=lambda: Database(':memory:'))
+     def test_add_new_media(self, mock_db):
+         mock_db.get_connection = self.db.get_connection
+         result = add_media_with_keywords(
+             url="http://example.com",
+             title="Test Title",
+             media_type="article",
+             content="Test content",
+             keywords="test,keyword",
+             prompt="Test prompt",
+             summary="Test summary",
+             transcription_model="Test model",
+             author="Test Author",
+             ingestion_date="2023-01-01"
+         )
+         self.assertIn("added/updated successfully", result)
+
+         with self.db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT COUNT(*) FROM Media")
+             self.assertEqual(cursor.fetchone()[0], 1)
+
+             cursor.execute("SELECT COUNT(*) FROM Keywords")
+             self.assertEqual(cursor.fetchone()[0], 2)
+
+             cursor.execute("SELECT COUNT(*) FROM MediaKeywords")
+             self.assertEqual(cursor.fetchone()[0], 2)
+
+             cursor.execute("SELECT COUNT(*) FROM MediaModifications")
+             self.assertEqual(cursor.fetchone()[0], 1)
+
+             cursor.execute("SELECT COUNT(*) FROM MediaVersion")
+             self.assertEqual(cursor.fetchone()[0], 1)
+
+     @patch('App_Function_Libraries.DB.SQLite_DB.db', new_callable=lambda: Database(':memory:'))
+     def test_update_existing_media(self, mock_db):
+         mock_db.get_connection = self.db.get_connection
+         add_media_with_keywords(
+             url="http://example.com",
+             title="Test Title",
+             media_type="article",
+             content="Test content",
+             keywords="test,keyword",
+             prompt="Test prompt",
+             summary="Test summary",
+             transcription_model="Test model",
+             author="Test Author",
+             ingestion_date="2023-01-01"
+         )
+
+         result = add_media_with_keywords(
+             url="http://example.com",
+             title="Updated Title",
+             media_type="article",
+             content="Updated content",
+             keywords="test,new",
+             prompt="Updated prompt",
+             summary="Updated summary",
+             transcription_model="Updated model",
+             author="Updated Author",
+             ingestion_date="2023-01-02"
+         )
+
+         self.assertIn("added/updated successfully", result)
+
+         with self.db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute("SELECT COUNT(*) FROM Media")
+             self.assertEqual(cursor.fetchone()[0], 1)
+
+             cursor.execute("SELECT title FROM Media")
+             self.assertEqual(cursor.fetchone()[0], "Updated Title")
+
+             cursor.execute("SELECT COUNT(*) FROM Keywords")
+             self.assertEqual(cursor.fetchone()[0], 3)
+
+             cursor.execute("SELECT COUNT(*) FROM MediaKeywords")
+             self.assertEqual(cursor.fetchone()[0], 3)
+
+             cursor.execute("SELECT COUNT(*) FROM MediaModifications")
+             self.assertEqual(cursor.fetchone()[0], 2)
+
+             cursor.execute("SELECT COUNT(*) FROM MediaVersion")
+             self.assertEqual(cursor.fetchone()[0], 2)
+
+ if __name__ == '__main__':
+     unittest.main()
+
+ #
+ # End of File
+ #######################################################################################################################
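
The usage comment above says python -m unittest test_sqlite_db.py, but given where the file actually lives, running it by module path from the repository root is presumably the working invocation:

    python -m unittest App_Function_Libraries.DB.Test_SQLite_DB -v

One caveat: test_connection_management asserts on self.db.pool, an attribute the reworked Database class no longer defines, so that assertion would need updating to match the thread-local connection design.
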
App_Function_Libraries/Gradio_Related.py CHANGED
@@ -1,375 +1,383 @@
- # Gradio_Related.py
- #########################################
- # Gradio UI Functions Library
- # I fucking hate Gradio.
- # Yea, fuck Gradio. https://github.com/gradio-app/gradio/pull/8263 & https://github.com/gradio-app/gradio/issues/7968
- #
- #########################################
- #
- # Built-In Imports
- import logging
- import os
- #
- # Import 3rd-Party Libraries
- import gradio as gr
- #
- # Local Imports
- from App_Function_Libraries.DB.DB_Manager import get_db_config
- from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
- from App_Function_Libraries.Gradio_UI.Character_Interaction_tab import create_character_card_interaction_tab, \
-     create_multiple_character_chat_tab, create_narrator_controlled_conversation_tab
- from App_Function_Libraries.Gradio_UI.Chat_ui import create_chat_management_tab, \
-     create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface
- from App_Function_Libraries.Gradio_UI.Config_tab import create_config_editor_tab
- from App_Function_Libraries.Gradio_UI.Explain_summarize_tab import create_summarize_explain_tab
- from App_Function_Libraries.Gradio_UI.Export_Functionality import create_export_tab
- from App_Function_Libraries.Gradio_UI.Backup_Functionality import create_backup_tab, create_view_backups_tab, \
-     create_restore_backup_tab
- from App_Function_Libraries.Gradio_UI.Import_Functionality import create_import_single_prompt_tab, \
-     create_import_obsidian_vault_tab, create_import_item_tab, create_import_book_tab, create_import_multiple_prompts_tab
- from App_Function_Libraries.Gradio_UI.Introduction_tab import create_introduction_tab
- from App_Function_Libraries.Gradio_UI.Keywords import create_view_keywords_tab, create_add_keyword_tab, \
-     create_delete_keyword_tab, create_export_keywords_tab
- #from App_Function_Libraries.Gradio_UI.Live_Recording import create_live_recording_tab
- from App_Function_Libraries.Gradio_UI.Llamafile_tab import create_chat_with_llamafile_tab
- from App_Function_Libraries.Gradio_UI.Media_edit import create_prompt_clone_tab, create_prompt_edit_tab, \
-     create_media_edit_and_clone_tab, create_media_edit_tab
- from App_Function_Libraries.Gradio_UI.Media_wiki_tab import create_mediawiki_import_tab, create_mediawiki_config_tab
- from App_Function_Libraries.Gradio_UI.PDF_ingestion_tab import create_pdf_ingestion_tab, create_pdf_ingestion_test_tab
- from App_Function_Libraries.Gradio_UI.Podcast_tab import create_podcast_tab
- from App_Function_Libraries.Gradio_UI.RAG_QA_Chat_tab import create_rag_qa_chat_tab
- from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
- from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_view_tab, create_prompt_search_tab, \
-     create_search_summaries_tab, create_viewing_tab, create_search_tab
- from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_rag_tab
- from App_Function_Libraries.Gradio_UI.Embeddings_tab import create_embeddings_tab, create_view_embeddings_tab, \
-     create_purge_embeddings_tab
- from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
-     create_delete_trash_tab, create_search_and_mark_trash_tab
- from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
-     create_utilities_yt_video_tab
- from App_Function_Libraries.Gradio_UI.Video_transcription_tab import create_video_transcription_tab
- from App_Function_Libraries.Gradio_UI.View_tab import create_manage_items_tab
- from App_Function_Libraries.Gradio_UI.Website_scraping_tab import create_website_scraping_tab
- from App_Function_Libraries.Gradio_UI.Chat_Workflows import chat_workflows_tab
- #
- # Gradio UI Imports
- from App_Function_Libraries.Gradio_UI.Evaluations_Benchmarks_tab import create_geval_tab, create_infinite_bench_tab
-
- #
- #######################################################################################################################
- # Function Definitions
- #
-
-
- # Disable Gradio Analytics
- os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
-
-
- custom_prompt_input = None
- server_mode = False
- share_public = False
- custom_prompt_summarize_bulleted_notes = ("""
-     <s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
- **Bulleted Note Creation Guidelines**
-
- **Headings**:
- - Based on referenced topics, not categories like quotes or terms
- - Surrounded by **bold** formatting
- - Not listed as bullet points
- - No space between headings and list items underneath
-
- **Emphasis**:
- - **Important terms** set in bold font
- - **Text ending in a colon**: also bolded
-
- **Review**:
- - Ensure adherence to specified format
- - Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
- """)
- #
- # End of globals
- #######################################################################################################################
- #
- # Start of Video/Audio Transcription and Summarization Functions
- #
- # Functions:
- # FIXME
- #
- #
- ################################################################################################################
- # Functions for Re-Summarization
- #
- # Functions:
- # FIXME
- # End of Re-Summarization Functions
- #
- ############################################################################################################################################################################################################################
- #
- # Explain/Summarize This Tab
- #
- # Functions:
- # FIXME
- #
- #
- ############################################################################################################################################################################################################################
- #
- # Transcript Comparison Tab
- #
- # Functions:
- # FIXME
- #
- #
- ###########################################################################################################################################################################################################################
- #
- # Search Tab
- #
- # Functions:
- # FIXME
- #
- # End of Search Tab Functions
- #
- ##############################################################################################################################################################################################################################
- #
- # Llamafile Tab
- #
- # Functions:
- # FIXME
- #
- # End of Llamafile Tab Functions
- ##############################################################################################################################################################################################################################
141
- #
142
- # Chat Interface Tab Functions
143
- #
144
- # Functions:
145
- # FIXME
146
- #
147
- #
148
- # End of Chat Interface Tab Functions
149
- ################################################################################################################################################################################################################################
150
- #
151
- # Media Edit Tab Functions
152
- # Functions:
153
- # Fixme
154
- # create_media_edit_tab():
155
- ##### Trash Tab
156
- # FIXME
157
- # Functions:
158
- #
159
- # End of Media Edit Tab Functions
160
- ################################################################################################################
161
- #
162
- # Import Items Tab Functions
163
- #
164
- # Functions:
165
- #FIXME
166
- # End of Import Items Tab Functions
167
- ################################################################################################################
168
- #
169
- # Export Items Tab Functions
170
- #
171
- # Functions:
172
- # FIXME
173
- #
174
- #
175
- # End of Export Items Tab Functions
176
- ################################################################################################################
177
- #
178
- # Keyword Management Tab Functions
179
- #
180
- # Functions:
181
- # create_view_keywords_tab():
182
- # FIXME
183
- #
184
- # End of Keyword Management Tab Functions
185
- ################################################################################################################
186
- #
187
- # Document Editing Tab Functions
188
- #
189
- # Functions:
190
- # #FIXME
191
- #
192
- #
193
- ################################################################################################################
194
- #
195
- # Utilities Tab Functions
196
- # Functions:
197
- # create_utilities_yt_video_tab():
198
- # #FIXME
199
-
200
- #
201
- # End of Utilities Tab Functions
202
- ################################################################################################################
203
-
204
- # FIXME - Prompt sample box
205
- #
206
- # # Sample data
207
- # prompts_category_1 = [
208
- # "What are the key points discussed in the video?",
209
- # "Summarize the main arguments made by the speaker.",
210
- # "Describe the conclusions of the study presented."
211
- # ]
212
- #
213
- # prompts_category_2 = [
214
- # "How does the proposed solution address the problem?",
215
- # "What are the implications of the findings?",
216
- # "Can you explain the theory behind the observed phenomenon?"
217
- # ]
218
- #
219
- # all_prompts2 = prompts_category_1 + prompts_category_2
220
-
221
-
222
- def launch_ui(share_public=None, server_mode=False):
223
- share=share_public
224
- css = """
225
- .result-box {
226
- margin-bottom: 20px;
227
- border: 1px solid #ddd;
228
- padding: 10px;
229
- }
230
- .result-box.error {
231
- border-color: #ff0000;
232
- background-color: #ffeeee;
233
- }
234
- .transcription, .summary {
235
- max-height: 300px;
236
- overflow-y: auto;
237
- border: 1px solid #eee;
238
- padding: 10px;
239
- margin-top: 10px;
240
- }
241
- """
242
-
243
- with gr.Blocks(theme='bethecloud/storj_theme',css=css) as iface:
244
- db_config = get_db_config()
245
- db_type = db_config['type']
246
- gr.Markdown(f"# tl/dw: Your LLM-powered Research Multi-tool")
247
- gr.Markdown(f"(Using {db_type.capitalize()} Database)")
248
- with gr.Tabs():
249
- with gr.TabItem("Transcription / Summarization / Ingestion"):
250
- with gr.Tabs():
251
- create_video_transcription_tab()
252
- create_audio_processing_tab()
253
- create_podcast_tab()
254
- create_import_book_tab()
255
- create_website_scraping_tab()
256
- create_pdf_ingestion_tab()
257
- create_pdf_ingestion_test_tab()
258
- create_resummary_tab()
259
- create_summarize_explain_tab()
260
- #create_live_recording_tab()
261
-
262
- with gr.TabItem("Text Search "):
263
- create_search_tab()
264
- create_search_summaries_tab()
265
-
266
- with gr.TabItem("RAG Search"):
267
- create_rag_tab()
268
- create_rag_qa_chat_tab()
269
-
270
- with gr.TabItem("Chat with an LLM"):
271
- create_chat_interface()
272
- create_chat_interface_stacked()
273
- create_chat_interface_multi_api()
274
- create_chat_interface_four()
275
- create_chat_with_llamafile_tab()
276
- create_chat_management_tab()
277
- chat_workflows_tab()
278
- create_multiple_character_chat_tab()
279
- create_narrator_controlled_conversation_tab()
280
- create_character_card_interaction_tab()
281
-
282
-
283
-
284
- with gr.TabItem("View DB Items"):
285
- create_viewing_tab()
286
- create_prompt_view_tab()
287
-
288
- with gr.TabItem("Prompts"):
289
- create_prompt_view_tab()
290
- create_prompt_search_tab()
291
- create_prompt_edit_tab()
292
- create_prompt_clone_tab()
293
-
294
-
295
- with gr.TabItem("Manage / Edit Existing Items"):
296
- create_media_edit_tab()
297
- create_manage_items_tab()
298
- create_media_edit_and_clone_tab()
299
- # FIXME
300
- #create_compare_transcripts_tab()
301
-
302
- with gr.TabItem("Embeddings Management"):
303
- create_embeddings_tab()
304
- create_view_embeddings_tab()
305
- create_purge_embeddings_tab()
306
-
307
- with gr.TabItem("Writing Tools"):
308
- with gr.Tabs():
309
- from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
310
- create_document_feedback_tab()
311
- from App_Function_Libraries.Gradio_UI.Writing_tab import create_grammar_style_check_tab
312
- create_grammar_style_check_tab()
313
- from App_Function_Libraries.Gradio_UI.Writing_tab import create_tone_adjustment_tab
314
- create_tone_adjustment_tab()
315
- from App_Function_Libraries.Gradio_UI.Writing_tab import create_creative_writing_tab
316
- create_creative_writing_tab()
317
- from App_Function_Libraries.Gradio_UI.Writing_tab import create_mikupad_tab
318
- create_mikupad_tab()
319
-
320
-
321
- with gr.TabItem("Keywords"):
322
- create_view_keywords_tab()
323
- create_add_keyword_tab()
324
- create_delete_keyword_tab()
325
- create_export_keywords_tab()
326
-
327
- with gr.TabItem("Import"):
328
- create_import_item_tab()
329
- create_import_obsidian_vault_tab()
330
- create_import_single_prompt_tab()
331
- create_import_multiple_prompts_tab()
332
- create_mediawiki_import_tab()
333
- create_mediawiki_config_tab()
334
-
335
- with gr.TabItem("Export"):
336
- create_export_tab()
337
-
338
- with gr.TabItem("Backup Management"):
339
- create_backup_tab()
340
- create_view_backups_tab()
341
- create_restore_backup_tab()
342
-
343
- with gr.TabItem("Utilities"):
344
- create_utilities_yt_video_tab()
345
- create_utilities_yt_audio_tab()
346
- create_utilities_yt_timestamp_tab()
347
-
348
- with gr.TabItem("Trashcan"):
349
- create_search_and_mark_trash_tab()
350
- create_view_trash_tab()
351
- create_delete_trash_tab()
352
- create_empty_trash_tab()
353
-
354
- with gr.TabItem("Evaluations"):
355
- create_geval_tab()
356
- create_infinite_bench_tab()
357
-
358
- with gr.TabItem("Introduction/Help"):
359
- create_introduction_tab()
360
-
361
- with gr.TabItem("Config Editor"):
362
- create_config_editor_tab()
363
-
364
- # Launch the interface
365
- server_port_variable = 7860
366
- os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
367
- if share==True:
368
- iface.launch(share=True)
369
- elif server_mode and not share_public:
370
- iface.launch(share=False, server_name="0.0.0.0", server_port=server_port_variable, )
371
- else:
372
- try:
373
- iface.launch(share=False, server_name="0.0.0.0", server_port=server_port_variable, )
374
- except Exception as e:
375
- logging.error(f"Error launching interface: {str(e)}")
 
 
 
 
 
 
 
 
 
1
+ # Gradio_Related.py
2
+ #########################################
3
+ # Gradio UI Functions Library
4
+ # I fucking hate Gradio.
5
+ # Yea, fuck Gradio. https://github.com/gradio-app/gradio/pull/8263 & https://github.com/gradio-app/gradio/issues/7968
6
+ #
7
+ #########################################
8
+ #
9
+ # Built-In Imports
10
+ import logging
11
+ import os
12
+ #
13
+ # Import 3rd-Party Libraries
14
+ import gradio as gr
15
+ #
16
+ # Local Imports
17
+ from App_Function_Libraries.DB.DB_Manager import get_db_config
18
+ from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
19
+ from App_Function_Libraries.Gradio_UI.Book_Ingestion_tab import create_import_book_tab
20
+ from App_Function_Libraries.Gradio_UI.Character_Interaction_tab import create_character_card_interaction_tab, \
21
+ create_multiple_character_chat_tab, create_narrator_controlled_conversation_tab
22
+ from App_Function_Libraries.Gradio_UI.Chat_ui import create_chat_management_tab, \
23
+ create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface
24
+ from App_Function_Libraries.Gradio_UI.Config_tab import create_config_editor_tab
25
+ from App_Function_Libraries.Gradio_UI.Explain_summarize_tab import create_summarize_explain_tab
26
+ from App_Function_Libraries.Gradio_UI.Export_Functionality import create_export_tab
27
+ from App_Function_Libraries.Gradio_UI.Backup_Functionality import create_backup_tab, create_view_backups_tab, \
28
+ create_restore_backup_tab
29
+ from App_Function_Libraries.Gradio_UI.Import_Functionality import create_import_single_prompt_tab, \
30
+ create_import_obsidian_vault_tab, create_import_item_tab, create_import_multiple_prompts_tab
31
+ from App_Function_Libraries.Gradio_UI.Introduction_tab import create_introduction_tab
32
+ from App_Function_Libraries.Gradio_UI.Keywords import create_view_keywords_tab, create_add_keyword_tab, \
33
+ create_delete_keyword_tab, create_export_keywords_tab
34
+ from App_Function_Libraries.Gradio_UI.Live_Recording import create_live_recording_tab
35
+ from App_Function_Libraries.Gradio_UI.Llamafile_tab import create_chat_with_llamafile_tab
36
+ from App_Function_Libraries.Gradio_UI.MMLU_Pro_tab import create_mmlu_pro_tab
37
+ from App_Function_Libraries.Gradio_UI.Media_edit import create_prompt_clone_tab, create_prompt_edit_tab, \
38
+ create_media_edit_and_clone_tab, create_media_edit_tab
39
+ from App_Function_Libraries.Gradio_UI.Media_wiki_tab import create_mediawiki_import_tab, create_mediawiki_config_tab
40
+ from App_Function_Libraries.Gradio_UI.PDF_ingestion_tab import create_pdf_ingestion_tab, create_pdf_ingestion_test_tab
41
+ from App_Function_Libraries.Gradio_UI.Plaintext_tab_import import create_plain_text_import_tab
42
+ from App_Function_Libraries.Gradio_UI.Podcast_tab import create_podcast_tab
43
+ from App_Function_Libraries.Gradio_UI.RAG_QA_Chat_tab import create_rag_qa_chat_tab
44
+ from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
45
+ from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_search_tab, \
46
+ create_search_summaries_tab, create_search_tab
47
+ from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_rag_tab
48
+ from App_Function_Libraries.Gradio_UI.Embeddings_tab import create_embeddings_tab, create_view_embeddings_tab, \
49
+ create_purge_embeddings_tab
50
+ from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
51
+ create_delete_trash_tab, create_search_and_mark_trash_tab
52
+ from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
53
+ create_utilities_yt_video_tab
54
+ from App_Function_Libraries.Gradio_UI.Video_transcription_tab import create_video_transcription_tab
55
+ from App_Function_Libraries.Gradio_UI.View_tab import create_manage_items_tab
56
+ from App_Function_Libraries.Gradio_UI.Website_scraping_tab import create_website_scraping_tab
57
+ from App_Function_Libraries.Gradio_UI.Chat_Workflows import chat_workflows_tab
58
+ from App_Function_Libraries.Gradio_UI.View_DB_Items_tab import create_prompt_view_tab, create_viewing_tab, \
59
+ create_view_all_with_versions_tab
60
+ #
61
+ # Gradio UI Imports
62
+ from App_Function_Libraries.Gradio_UI.Evaluations_Benchmarks_tab import create_geval_tab, create_infinite_bench_tab
63
+
64
+ #
65
+ #######################################################################################################################
66
+ # Function Definitions
67
+ #
68
+
69
+
70
+ # Disable Gradio Analytics
71
+ os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
72
+
73
+
74
+ custom_prompt_input = None
75
+ server_mode = False
76
+ share_public = False
77
+ custom_prompt_summarize_bulleted_notes = ("""
78
+ <s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhere to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks, create comprehensive bulleted notes.[/INST]
79
+ **Bulleted Note Creation Guidelines**
80
+
81
+ **Headings**:
82
+ - Based on referenced topics, not categories like quotes or terms
83
+ - Surrounded by **bold** formatting
84
+ - Not listed as bullet points
85
+ - No space between headings and list items underneath
86
+
87
+ **Emphasis**:
88
+ - **Important terms** set in bold font
89
+ - **Text ending in a colon**: also bolded
90
+
91
+ **Review**:
92
+ - Ensure adherence to specified format
93
+ - Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
94
+ """)
95
+ #
96
+ # End of globals
97
+ #######################################################################################################################
98
+ #
99
+ # Start of Video/Audio Transcription and Summarization Functions
100
+ #
101
+ # Functions:
102
+ # FIXME
103
+ #
104
+ #
105
+ ################################################################################################################
106
+ # Functions for Re-Summarization
107
+ #
108
+ # Functions:
109
+ # FIXME
110
+ # End of Re-Summarization Functions
111
+ #
112
+ ############################################################################################################################################################################################################################
113
+ #
114
+ # Explain/Summarize This Tab
115
+ #
116
+ # Functions:
117
+ # FIXME
118
+ #
119
+ #
120
+ ############################################################################################################################################################################################################################
121
+ #
122
+ # Transcript Comparison Tab
123
+ #
124
+ # Functions:
125
+ # FIXME
126
+ #
127
+ #
128
+ ###########################################################################################################################################################################################################################
129
+ #
130
+ # Search Tab
131
+ #
132
+ # Functions:
133
+ # FIXME
134
+ #
135
+ # End of Search Tab Functions
136
+ #
137
+ ##############################################################################################################################################################################################################################
138
+ #
139
+ # Llamafile Tab
140
+ #
141
+ # Functions:
142
+ # FIXME
143
+ #
144
+ # End of Llamafile Tab Functions
145
+ ##############################################################################################################################################################################################################################
146
+ #
147
+ # Chat Interface Tab Functions
148
+ #
149
+ # Functions:
150
+ # FIXME
151
+ #
152
+ #
153
+ # End of Chat Interface Tab Functions
154
+ ################################################################################################################################################################################################################################
155
+ #
156
+ # Media Edit Tab Functions
157
+ # Functions:
158
+ # Fixme
159
+ # create_media_edit_tab():
160
+ ##### Trash Tab
161
+ # FIXME
162
+ # Functions:
163
+ #
164
+ # End of Media Edit Tab Functions
165
+ ################################################################################################################
166
+ #
167
+ # Import Items Tab Functions
168
+ #
169
+ # Functions:
170
+ #FIXME
171
+ # End of Import Items Tab Functions
172
+ ################################################################################################################
173
+ #
174
+ # Export Items Tab Functions
175
+ #
176
+ # Functions:
177
+ # FIXME
178
+ #
179
+ #
180
+ # End of Export Items Tab Functions
181
+ ################################################################################################################
182
+ #
183
+ # Keyword Management Tab Functions
184
+ #
185
+ # Functions:
186
+ # create_view_keywords_tab():
187
+ # FIXME
188
+ #
189
+ # End of Keyword Management Tab Functions
190
+ ################################################################################################################
191
+ #
192
+ # Document Editing Tab Functions
193
+ #
194
+ # Functions:
195
+ # #FIXME
196
+ #
197
+ #
198
+ ################################################################################################################
199
+ #
200
+ # Utilities Tab Functions
201
+ # Functions:
202
+ # create_utilities_yt_video_tab():
203
+ # #FIXME
204
+
205
+ #
206
+ # End of Utilities Tab Functions
207
+ ################################################################################################################
208
+
209
+ # FIXME - Prompt sample box
210
+ #
211
+ # # Sample data
212
+ # prompts_category_1 = [
213
+ # "What are the key points discussed in the video?",
214
+ # "Summarize the main arguments made by the speaker.",
215
+ # "Describe the conclusions of the study presented."
216
+ # ]
217
+ #
218
+ # prompts_category_2 = [
219
+ # "How does the proposed solution address the problem?",
220
+ # "What are the implications of the findings?",
221
+ # "Can you explain the theory behind the observed phenomenon?"
222
+ # ]
223
+ #
224
+ # all_prompts2 = prompts_category_1 + prompts_category_2
225
+
226
+
227
+ def launch_ui(share_public=None, server_mode=False):
228
+ share = share_public
229
+ css = """
230
+ .result-box {
231
+ margin-bottom: 20px;
232
+ border: 1px solid #ddd;
233
+ padding: 10px;
234
+ }
235
+ .result-box.error {
236
+ border-color: #ff0000;
237
+ background-color: #ffeeee;
238
+ }
239
+ .transcription, .summary {
240
+ max-height: 300px;
241
+ overflow-y: auto;
242
+ border: 1px solid #eee;
243
+ padding: 10px;
244
+ margin-top: 10px;
245
+ }
246
+ """
247
+
248
+ with gr.Blocks(theme='bethecloud/storj_theme', css=css) as iface:
249
+ db_config = get_db_config()
250
+ db_type = db_config['type']
251
+ gr.Markdown(f"# tl/dw: Your LLM-powered Research Multi-tool")
252
+ gr.Markdown(f"(Using {db_type.capitalize()} Database)")
253
+ with gr.Tabs():
254
+ with gr.TabItem("Transcription / Summarization / Ingestion"):
255
+ with gr.Tabs():
256
+ create_video_transcription_tab()
257
+ create_audio_processing_tab()
258
+ create_podcast_tab()
259
+ create_import_book_tab()
260
+ create_plain_text_import_tab()
261
+ create_website_scraping_tab()
262
+ create_pdf_ingestion_tab()
263
+ create_pdf_ingestion_test_tab()
264
+ create_resummary_tab()
265
+ create_summarize_explain_tab()
266
+ create_live_recording_tab()
267
+
268
+ with gr.TabItem("Text Search "):
269
+ create_search_tab()
270
+ create_search_summaries_tab()
271
+
272
+ with gr.TabItem("RAG Search"):
273
+ create_rag_tab()
274
+ create_rag_qa_chat_tab()
275
+
276
+ with gr.TabItem("Chat with an LLM"):
277
+ create_chat_interface()
278
+ create_chat_interface_stacked()
279
+ create_chat_interface_multi_api()
280
+ create_chat_interface_four()
281
+ create_chat_with_llamafile_tab()
282
+ create_chat_management_tab()
283
+ chat_workflows_tab()
284
+ create_multiple_character_chat_tab()
285
+ create_narrator_controlled_conversation_tab()
286
+ create_character_card_interaction_tab()
287
+
288
+ with gr.TabItem("View DB Items"):
289
+ # This one works
290
+ create_view_all_with_versions_tab()
291
+ # This one is WIP
292
+ create_viewing_tab()
293
+ create_prompt_view_tab()
294
+
295
+ with gr.TabItem("Prompts"):
296
+ create_prompt_view_tab()
297
+ create_prompt_search_tab()
298
+ create_prompt_edit_tab()
299
+ create_prompt_clone_tab()
300
+
301
+
302
+ with gr.TabItem("Manage / Edit Existing Items"):
303
+ create_media_edit_tab()
304
+ create_manage_items_tab()
305
+ create_media_edit_and_clone_tab()
306
+ # FIXME
307
+ #create_compare_transcripts_tab()
308
+
309
+ with gr.TabItem("Embeddings Management"):
310
+ create_embeddings_tab()
311
+ create_view_embeddings_tab()
312
+ create_purge_embeddings_tab()
313
+
314
+ with gr.TabItem("Writing Tools"):
315
+ with gr.Tabs():
316
+ from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
317
+ create_document_feedback_tab()
318
+ from App_Function_Libraries.Gradio_UI.Writing_tab import create_grammar_style_check_tab
319
+ create_grammar_style_check_tab()
320
+ from App_Function_Libraries.Gradio_UI.Writing_tab import create_tone_adjustment_tab
321
+ create_tone_adjustment_tab()
322
+ from App_Function_Libraries.Gradio_UI.Writing_tab import create_creative_writing_tab
323
+ create_creative_writing_tab()
324
+ from App_Function_Libraries.Gradio_UI.Writing_tab import create_mikupad_tab
325
+ create_mikupad_tab()
326
+
327
+
328
+ with gr.TabItem("Keywords"):
329
+ create_view_keywords_tab()
330
+ create_add_keyword_tab()
331
+ create_delete_keyword_tab()
332
+ create_export_keywords_tab()
333
+
334
+ with gr.TabItem("Import"):
335
+ create_import_item_tab()
336
+ create_import_obsidian_vault_tab()
337
+ create_import_single_prompt_tab()
338
+ create_import_multiple_prompts_tab()
339
+ create_mediawiki_import_tab()
340
+ create_mediawiki_config_tab()
341
+
342
+ with gr.TabItem("Export"):
343
+ create_export_tab()
344
+
345
+ with gr.TabItem("Backup Management"):
346
+ create_backup_tab()
347
+ create_view_backups_tab()
348
+ create_restore_backup_tab()
349
+
350
+ with gr.TabItem("Utilities"):
351
+ create_utilities_yt_video_tab()
352
+ create_utilities_yt_audio_tab()
353
+ create_utilities_yt_timestamp_tab()
354
+
355
+ with gr.TabItem("Trashcan"):
356
+ create_search_and_mark_trash_tab()
357
+ create_view_trash_tab()
358
+ create_delete_trash_tab()
359
+ create_empty_trash_tab()
360
+
361
+ with gr.TabItem("Evaluations"):
362
+ create_geval_tab()
363
+ create_infinite_bench_tab()
364
+ create_mmlu_pro_tab()
365
+
366
+ with gr.TabItem("Introduction/Help"):
367
+ create_introduction_tab()
368
+
369
+ with gr.TabItem("Config Editor"):
370
+ create_config_editor_tab()
371
+
372
+ # Launch the interface
373
+ server_port_variable = 7860
374
+ os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
375
+ if share:
376
+ iface.launch(share=True)
377
+ elif server_mode and not share_public:
378
+ iface.launch(share=False, server_name="0.0.0.0", server_port=server_port_variable)
379
+ else:
380
+ try:
381
+ iface.launch(share=False, server_name="0.0.0.0", server_port=server_port_variable)
382
+ except Exception as e:
383
+ logging.error(f"Error launching interface: {str(e)}")
App_Function_Libraries/MediaWiki/Media_Wiki.py CHANGED
@@ -23,7 +23,8 @@ from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_conten
23
  # Functions:
24
  # Load configuration
25
  def load_mediawiki_import_config():
26
- with open(os.path.join('Config_Files', 'mediawiki_import_config.yaml'), 'r') as f:
 
27
  return yaml.safe_load(f)
28
 
29
  config = load_mediawiki_import_config()
 
23
  # Functions:
24
  # Load configuration
25
  def load_mediawiki_import_config():
26
+ config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'Config_Files', 'mediawiki_import_config.yaml')
27
+ with open(config_path, 'r') as f:
28
  return yaml.safe_load(f)
29
 
30
  config = load_mediawiki_import_config()
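A sketch of how the corrected path resolution behaves (the /repo prefix is illustrative): the two '..' segments walk from App_Function_Libraries/MediaWiki/ up to the repository root, so the YAML now loads regardless of the process's working directory.

import os
# Assume __file__ == "/repo/App_Function_Libraries/MediaWiki/Media_Wiki.py"
base = "/repo/App_Function_Libraries/MediaWiki"
print(os.path.normpath(os.path.join(base, '..', '..', 'Config_Files', 'mediawiki_import_config.yaml')))
# -> /repo/Config_Files/mediawiki_import_config.yaml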
App_Function_Libraries/Plaintext/Plaintext_Files.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Plaintext_Files.py
2
+ # Description: This file contains functions for reading and writing plaintext files.
3
+ #
4
+ # Import necessary libraries
5
+ import os
6
+ import re
7
+ from datetime import datetime
8
+ import logging
9
+ import tempfile
10
+ import zipfile
11
+ #
12
+ # Non-Local Imports
13
+ #
14
+ # Local Imports
15
+ #
16
+ #######################################################################################################################
17
+ #
18
+ # Function Definitions
App_Function_Libraries/RAG/CRAG_Pipeline.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # First gen - initial draft of the CRAG pipeline
2
+
3
+ # Install the necessary libraries
4
+ # !pip install transformers
5
+ # !pip install sentence-transformers
6
+ # !pip install torch
7
+ # !pip install requests
8
+ # !pip install bs4
9
+
10
+ import requests
11
+ from bs4 import BeautifulSoup
12
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
13
+ from sentence_transformers import SentenceTransformer, util
14
+ import torch
15
+
16
+ # Step 1: Load Models for Summarization and Similarity
17
+ model_name = "facebook/bart-large-cnn" # Summarization model
18
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
19
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
20
+
21
+ # Summarization pipeline
22
+ summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
23
+
24
+ # Sentence similarity model
25
+ similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
26
+
27
+
28
+ # Step 2: Define Retrieval Evaluator
29
+ def evaluate_retrieval(query, retrieved_docs):
30
+ """
31
+ Evaluate the relevance of retrieved documents using cosine similarity
32
+ with sentence embeddings.
33
+ """
34
+ query_embedding = similarity_model.encode(query, convert_to_tensor=True)
35
+ doc_embeddings = similarity_model.encode(retrieved_docs, convert_to_tensor=True)
36
+
37
+ # Calculate cosine similarity between the query and each document
38
+ similarities = [util.pytorch_cos_sim(query_embedding, doc_embedding).item() for doc_embedding in doc_embeddings]
39
+
40
+ # Set a threshold for relevance (adjustable)
41
+ relevance_threshold = 0.5
42
+ relevance_scores = ['Correct' if sim > relevance_threshold else 'Incorrect' for sim in similarities]
43
+
44
+ return relevance_scores
45
+
46
+
47
+ # Step 3: Knowledge Refinement (Decompose-then-Recompose)
48
+ def decompose_then_recompose(retrieved_docs):
49
+ """
50
+ Refine the retrieved documents by summarizing their key information.
51
+ """
52
+ refined_knowledge = []
53
+ for doc in retrieved_docs:
54
+ summary = summarizer(doc, max_length=50, min_length=20, do_sample=False)[0]['summary_text']
55
+ refined_knowledge.append(summary)
56
+ return refined_knowledge
57
+
58
+
59
+ # Step 4: Web Search for External Knowledge
60
+ def web_search(query):
61
+ """
62
+ Perform a web search to retrieve additional external knowledge if the
63
+ retrieved documents are not relevant.
64
+ """
65
+ search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
66
+ headers = {'User-Agent': 'Mozilla/5.0'}
67
+ response = requests.get(search_url, headers=headers)
68
+ soup = BeautifulSoup(response.text, 'html.parser')
69
+
70
+ # Extract URLs from search results (simplified)
71
+ links = []
72
+ for item in soup.find_all('a'):
73
+ link = item.get('href')
74
+ if link and "http" in link:
75
+ links.append(link)
76
+ return links[:5] # Return the first 5 URLs
77
+
78
+
79
+ # Step 5: Generate Final Output
80
+ def generate_final_output(query, refined_knowledge):
81
+ """
82
+ Generate the final output summary using the refined knowledge.
83
+ """
84
+ combined_knowledge = " ".join(refined_knowledge)
85
+ final_summary = summarizer(combined_knowledge, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
86
+ return final_summary
87
+
88
+
89
+ # Step 6: CRAG Workflow Integration
90
+ def crag_workflow(query, retrieved_docs):
91
+ """
92
+ Full CRAG workflow integrating evaluation, knowledge refinement,
93
+ and web search to generate a robust output summary.
94
+ """
95
+ # Step 1: Evaluate retrieval
96
+ relevance_scores = evaluate_retrieval(query, retrieved_docs)
97
+
98
+ if 'Correct' in relevance_scores:
99
+ # Step 2: Decompose-then-Recompose for correct documents
100
+ refined_knowledge = decompose_then_recompose(
101
+ [doc for doc, score in zip(retrieved_docs, relevance_scores) if score == 'Correct'])
102
+ else:
103
+ # Step 3: Web search if retrieval is incorrect
104
+ web_results = web_search(query)
105
+ refined_knowledge = decompose_then_recompose(web_results)
106
+
107
+ # Step 4: Generate final output
108
+ final_summary = generate_final_output(query, refined_knowledge)
109
+
110
+ return final_summary
111
+
112
+
113
+ # Example Usage
114
+ if __name__ == "__main__":
115
+ # Example query and retrieved documents
116
+ query = "What are the latest advancements in renewable energy?"
117
+ retrieved_docs = [
118
+ "Renewable energy is becoming increasingly important in today's world...",
119
+ "Solar energy has seen significant advancements in the past decade...",
120
+ "Wind energy technology is rapidly evolving, with new innovations expected soon..."
121
+ ]
122
+
123
+ # Perform the CRAG workflow
124
+ final_summary = crag_workflow(query, retrieved_docs)
125
+ print("Final Summary:", final_summary)
App_Function_Libraries/RAG/ChromaDB_Library.py CHANGED
@@ -1,290 +1,244 @@
1
- import configparser
2
- import logging
3
- import sqlite3
4
- from typing import List, Dict, Any
5
-
6
- import chromadb
7
- import requests
8
- from chromadb import Settings
9
-
10
- from App_Function_Libraries.Chunk_Lib import improved_chunking_process
11
- from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media
12
- from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
13
-
14
- #######################################################################################################################
15
- #
16
- # Functions for ChromaDB
17
-
18
- # Get ChromaDB settings
19
- # Load configuration
20
- config = configparser.ConfigParser()
21
- config.read('config.txt')
22
- chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
23
- chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))
24
-
25
- import os
26
- os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
27
-
28
- # Get embedding settings
29
- embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
30
- embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
31
- embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
32
- embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
33
-
34
- # Get chunking options
35
- chunk_options = {
36
- 'method': config.get('Chunking', 'method', fallback='words'),
37
- 'max_size': config.getint('Chunking', 'max_size', fallback=400),
38
- 'overlap': config.getint('Chunking', 'overlap', fallback=200),
39
- 'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
40
- 'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
41
- 'language': config.get('Chunking', 'language', fallback='english')
42
- }
43
-
44
-
45
- def auto_update_chroma_embeddings(media_id: int, content: str):
46
- """
47
- Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.
48
-
49
- :param media_id: The ID of the newly ingested media item
50
- :param content: The content of the newly ingested media item
51
- """
52
- collection_name = f"media_{media_id}"
53
-
54
- # Initialize or get the ChromaDB collection
55
- collection = chroma_client.get_or_create_collection(name=collection_name)
56
-
57
- # Check if embeddings already exist for this media_id
58
- existing_embeddings = collection.get(ids=[f"{media_id}_chunk_{i}" for i in range(len(content))])
59
-
60
- if existing_embeddings and len(existing_embeddings) > 0:
61
- logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
62
- else:
63
- # Process and store content if embeddings do not already exist
64
- process_and_store_content(content, collection_name, media_id)
65
- logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")
66
-
67
-
68
- # Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
69
- def process_and_store_content(content: str, collection_name: str, media_id: int):
70
- # Process the content into chunks
71
- chunks = improved_chunking_process(content, chunk_options)
72
- texts = [chunk['text'] for chunk in chunks]
73
-
74
- # Generate embeddings for each chunk
75
- embeddings = [create_embedding(text) for text in texts]
76
-
77
- # Create unique IDs for each chunk using the media_id and chunk index
78
- ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]
79
-
80
- # Store the texts, embeddings, and IDs in ChromaDB
81
- store_in_chroma(collection_name, texts, embeddings, ids)
82
-
83
- # Store the chunk metadata in SQLite
84
- for i, chunk in enumerate(chunks):
85
- add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i])
86
-
87
- # Update the FTS table
88
- update_fts_for_media(media_id)
89
-
90
- # Function to store documents and their embeddings in ChromaDB
91
- def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
92
- collection = chroma_client.get_or_create_collection(name=collection_name)
93
- collection.add(
94
- documents=texts,
95
- embeddings=embeddings,
96
- ids=ids
97
- )
98
-
99
- # Function to perform vector search using ChromaDB
100
- def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
101
- query_embedding = create_embedding(query)
102
- collection = chroma_client.get_collection(name=collection_name)
103
- results = collection.query(
104
- query_embeddings=[query_embedding],
105
- n_results=k
106
- )
107
- return results['documents'][0]
108
-
109
-
110
- def create_embedding(text: str) -> List[float]:
111
- global embedding_provider, embedding_model, embedding_api_url, embedding_api_key
112
-
113
- if embedding_provider == 'openai':
114
- return get_openai_embeddings(text, embedding_model)
115
- elif embedding_provider == 'local':
116
- response = requests.post(
117
- embedding_api_url,
118
- json={"text": text, "model": embedding_model},
119
- headers={"Authorization": f"Bearer {embedding_api_key}"}
120
- )
121
- return response.json()['embedding']
122
- elif embedding_provider == 'huggingface':
123
- from transformers import AutoTokenizer, AutoModel
124
- import torch
125
-
126
- tokenizer = AutoTokenizer.from_pretrained(embedding_model)
127
- model = AutoModel.from_pretrained(embedding_model)
128
-
129
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
130
- with torch.no_grad():
131
- outputs = model(**inputs)
132
-
133
- # Use the mean of the last hidden state as the sentence embedding
134
- embeddings = outputs.last_hidden_state.mean(dim=1)
135
- return embeddings[0].tolist() # Convert to list for consistency
136
- else:
137
- raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
138
-
139
-
140
- def create_all_embeddings(api_choice: str, model_or_url: str) -> str:
141
- try:
142
- all_content = get_all_content_from_database()
143
-
144
- if not all_content:
145
- return "No content found in the database."
146
-
147
- texts_to_embed = []
148
- embeddings_to_store = []
149
- ids_to_store = []
150
- collection_name = "all_content_embeddings"
151
-
152
- # Initialize or get the ChromaDB collection
153
- collection = chroma_client.get_or_create_collection(name=collection_name)
154
-
155
- for content_item in all_content:
156
- media_id = content_item['id']
157
- text = content_item['content']
158
-
159
- # Check if the embedding already exists in ChromaDB
160
- embedding_exists = collection.get(ids=[f"doc_{media_id}"])
161
-
162
- if embedding_exists:
163
- logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
164
- continue # Skip if embedding already exists
165
-
166
- # Create the embedding
167
- if api_choice == "openai":
168
- embedding = create_openai_embedding(text, model_or_url)
169
- else: # Llama.cpp
170
- embedding = create_llamacpp_embedding(text, model_or_url)
171
-
172
- # Collect the text, embedding, and ID for batch storage
173
- texts_to_embed.append(text)
174
- embeddings_to_store.append(embedding)
175
- ids_to_store.append(f"doc_{media_id}")
176
-
177
- # Store all new embeddings in ChromaDB
178
- if texts_to_embed and embeddings_to_store:
179
- store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)
180
-
181
- return "Embeddings created and stored successfully for all new content."
182
- except Exception as e:
183
- logging.error(f"Error during embedding creation: {str(e)}")
184
- return f"Error: {str(e)}"
185
-
186
-
187
- def create_openai_embedding(text: str, model: str) -> List[float]:
188
- openai_api_key = config['API']['openai_api_key']
189
- embedding = get_openai_embeddings(text, model)
190
- return embedding
191
-
192
-
193
- def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
194
- response = requests.post(
195
- api_url,
196
- json={"input": text}
197
- )
198
- if response.status_code == 200:
199
- return response.json()['embedding']
200
- else:
201
- raise Exception(f"Error from Llama.cpp API: {response.text}")
202
-
203
-
204
- def get_all_content_from_database() -> List[Dict[str, Any]]:
205
- """
206
- Retrieve all media content from the database that requires embedding.
207
-
208
- Returns:
209
- List[Dict[str, Any]]: A list of dictionaries, each containing the media ID, content, title, and other relevant fields.
210
- """
211
- try:
212
- from App_Function_Libraries.DB.DB_Manager import db
213
- with db.get_connection() as conn:
214
- cursor = conn.cursor()
215
- cursor.execute("""
216
- SELECT id, content, title, author, type
217
- FROM Media
218
- WHERE is_trash = 0 -- Exclude items marked as trash
219
- """)
220
- media_items = cursor.fetchall()
221
-
222
- # Convert the results into a list of dictionaries
223
- all_content = [
224
- {
225
- 'id': item[0],
226
- 'content': item[1],
227
- 'title': item[2],
228
- 'author': item[3],
229
- 'type': item[4]
230
- }
231
- for item in media_items
232
- ]
233
-
234
- return all_content
235
-
236
- except sqlite3.Error as e:
237
- logging.error(f"Error retrieving all content from database: {e}")
238
- from App_Function_Libraries.DB.SQLite_DB import DatabaseError
239
- raise DatabaseError(f"Error retrieving all content from database: {e}")
240
-
241
-
242
- def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]):
243
- collection = chroma_client.get_or_create_collection(name=collection_name)
244
- collection.add(
245
- documents=texts,
246
- embeddings=embeddings,
247
- ids=ids,
248
- metadatas=[{'source': source} for source in sources]
249
- )
250
-
251
-
252
- def check_embedding_status(selected_item):
253
- if not selected_item:
254
- return "Please select an item", ""
255
- item_id = selected_item.split('(')[0].strip()
256
- collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
257
- result = collection.get(ids=[f"doc_{item_id}"])
258
- if result['ids']:
259
- embedding = result['embeddings'][0]
260
- embedding_preview = str(embedding[:50]) # Convert first 50 elements to string
261
- return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
262
- else:
263
- return f"No embedding found for item: {item_id}", ""
264
-
265
-
266
- def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
267
- if not selected_item:
268
- return "Please select an item"
269
- item_id = selected_item.split('(')[0].strip()
270
- items = get_all_content_from_database()
271
- item = next((item for item in items if item['title'] == item_id), None)
272
- if not item:
273
- return f"Item not found: {item_id}"
274
-
275
- try:
276
- if api_choice == "OpenAI":
277
- embedding = create_embedding(item['content'])
278
- else: # Llama.cpp
279
- embedding = create_embedding(item['content'])
280
-
281
- collection_name = "all_content_embeddings"
282
- store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"])
283
- return f"New embedding created and stored for item: {item_id}"
284
- except Exception as e:
285
- return f"Error creating embedding: {str(e)}"
286
-
287
-
288
- #
289
- # End of Functions for ChromaDB
290
  #######################################################################################################################
 
1
+ # ChromaDB_Library.py
2
+ # Description: Functions for managing embeddings in ChromaDB
3
+ #
4
+ # Imports:
5
+ import logging
6
+ from typing import List, Dict, Any
7
+ # 3rd-Party Imports:
8
+ import chromadb
9
+ from chromadb import Settings
10
+ from itertools import islice
11
+ #
12
+ # Local Imports:
13
+ from App_Function_Libraries.Chunk_Lib import chunk_for_embedding, chunk_options
14
+ from App_Function_Libraries.DB.SQLite_DB import process_chunks
15
+ from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
16
+ # FIXME - related to Chunking
17
+ from App_Function_Libraries.RAG.Embeddings_Create import create_embedding
18
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize
19
+ from App_Function_Libraries.Utils.Utils import get_database_path, ensure_directory_exists, \
20
+ load_comprehensive_config
21
+ #
22
+ #######################################################################################################################
23
+ #
24
+ # Config Settings for ChromaDB Functions
25
+ #
26
+ # FIXME - Refactor so that all globals are set in summarize.py
27
+ # Set up logging
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger(__name__)
30
+ #
31
+ # Load config
32
+ config = load_comprehensive_config()
33
+ #
34
+ # ChromaDB settings
35
+ chroma_db_path = config.get('Database', 'chroma_db_path', fallback=get_database_path('chroma_db'))
36
+ ensure_directory_exists(chroma_db_path)
37
+ chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))
38
+ #
39
+ # Embedding settings
40
+ embedding_provider = config.get('Embeddings', 'embedding_provider', fallback='openai')
41
+ embedding_model = config.get('Embeddings', 'embedding_model', fallback='text-embedding-3-small')
42
+ embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
43
+ embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
44
+ #
45
+ # End of Config Settings
46
+ #######################################################################################################################
47
+ #
48
+ # Functions:
49
+
50
+ def batched(iterable, n):
51
+ "Batch data into lists of length n. The last batch may be shorter."
52
+ it = iter(iterable)
53
+ while True:
54
+ batch = list(islice(it, n))
55
+ if not batch:
56
+ return
57
+ yield batch
58
+
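# Usage sketch (illustrative, not from the commit): batched() caps how many
# texts go into a single embedding request, e.g.
#   list(batched([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]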
59
+
60
+ # FIXME - Fix summarization of entire document/storign in chunk issue
61
+ # FIXME - update all uses to reflect 'api_name' parameter
62
+ def process_and_store_content(database, content: str, collection_name: str, media_id: int, file_name: str,
63
+ create_embeddings: bool = False, create_summary: bool = False, api_name: str = None,
64
+ chunk_options: Dict = None, embedding_provider: str = None,
65
+ embedding_model: str = None, embedding_api_url: str = None):
66
+ try:
67
+ logger.info(f"Processing content for media_id {media_id} in collection {collection_name}")
68
+
69
+ full_summary = None
70
+ if create_summary and api_name:
71
+ full_summary = summarize(content, None, api_name, None, None, None)
72
+
73
+ chunks = chunk_for_embedding(content, file_name, full_summary, chunk_options)
74
+
75
+ # Process chunks synchronously
76
+ process_chunks(database, chunks, media_id)
77
+
78
+ if create_embeddings:
79
+ texts = [chunk['text'] for chunk in chunks]
80
+ embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
81
+ ids = [f"{media_id}_chunk_{i}" for i in range(1, len(chunks) + 1)]
82
+ metadatas = [{
83
+ "media_id": str(media_id),
84
+ "chunk_index": i,
85
+ "total_chunks": len(chunks),
86
+ "start_index": int(chunk['metadata']['start_index']),
87
+ "end_index": int(chunk['metadata']['end_index']),
88
+ "file_name": str(file_name),
89
+ "relative_position": float(chunk['metadata']['relative_position'])
90
+ } for i, chunk in enumerate(chunks, 1)]
91
+
92
+ store_in_chroma(collection_name, texts, embeddings, ids, metadatas)
93
+
94
+ # Update full-text search index
95
+ database.execute_query(
96
+ "INSERT OR REPLACE INTO media_fts (rowid, title, content) SELECT id, title, content FROM Media WHERE id = ?",
97
+ (media_id,)
98
+ )
99
+
100
+ logger.info(f"Finished processing and storing content for media_id {media_id}")
101
+
102
+ except Exception as e:
103
+ logger.error(f"Error in process_and_store_content for media_id {media_id}: {str(e)}")
104
+ raise
105
+
106
+ # Usage example:
107
+ # process_and_store_content(db, content, "my_collection", 1, "example.txt", create_embeddings=True, create_summary=True, api_name="gpt-3.5-turbo")
108
+
109
+
110
+ def check_embedding_status(selected_item, item_mapping):
111
+ if not selected_item:
112
+ return "Please select an item", ""
113
+
114
+ try:
115
+ item_id = item_mapping.get(selected_item)
116
+ if item_id is None:
117
+ return f"Invalid item selected: {selected_item}", ""
118
+
119
+ item_title = selected_item.rsplit(' (', 1)[0]
120
+ collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
121
+
122
+ result = collection.get(ids=[f"doc_{item_id}"], include=["embeddings", "metadatas"])
123
+ logging.info(f"ChromaDB result for item '{item_title}' (ID: {item_id}): {result}")
124
+
125
+ if not result['ids']:
126
+ return f"No embedding found for item '{item_title}' (ID: {item_id})", ""
127
+
128
+ if not result['embeddings'] or not result['embeddings'][0]:
129
+ return f"Embedding data missing for item '{item_title}' (ID: {item_id})", ""
130
+
131
+ embedding = result['embeddings'][0]
132
+ metadata = result['metadatas'][0] if result['metadatas'] else {}
133
+ embedding_preview = str(embedding[:50])
134
+ status = f"Embedding exists for item '{item_title}' (ID: {item_id})"
135
+ return status, f"First 50 elements of embedding:\n{embedding_preview}\n\nMetadata: {metadata}"
136
+
137
+ except Exception as e:
138
+ logging.error(f"Error in check_embedding_status: {str(e)}")
139
+ return f"Error processing item: {selected_item}. Details: {str(e)}", ""
140
+
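# Illustrative call (not from the commit); the "Title (...)" label format is
# inferred from the rsplit(' (', 1) above:
#   status, details = check_embedding_status("My Video (42)", {"My Video (42)": 42})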
141
+ def reset_chroma_collection(collection_name: str):
142
+ try:
143
+ chroma_client.delete_collection(collection_name)
144
+ chroma_client.create_collection(collection_name)
145
+ logging.info(f"Reset ChromaDB collection: {collection_name}")
146
+ except Exception as e:
147
+ logging.error(f"Error resetting ChromaDB collection: {str(e)}")
148
+
149
+
150
+ def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], metadatas: List[Dict[str, Any]]):
151
+ try:
152
+ collection = chroma_client.get_or_create_collection(name=collection_name)
153
+
154
+ # Log the inputs for debugging
155
+ logging.debug(f"Storing in ChromaDB - Collection: {collection_name}")
156
+ logging.debug(f"Texts (first 100 chars): {texts[0][:100]}...")
157
+ logging.debug(f"Embeddings (first 5 values): {embeddings[0][:5]}")
158
+ logging.debug(f"IDs: {ids}")
159
+ logging.debug(f"Metadatas: {metadatas}")
160
+
161
+ # Use upsert instead of add/update
162
+ collection.upsert(
163
+ documents=texts,
164
+ embeddings=embeddings,
165
+ ids=ids,
166
+ metadatas=metadatas
167
+ )
168
+
169
+ # Verify storage
170
+ for doc_id in ids:
171
+ result = collection.get(ids=[doc_id], include=["embeddings"])
172
+ if not result['embeddings'] or result['embeddings'][0] is None:
173
+ logging.error(f"Failed to store embedding for {doc_id}")
174
+ else:
175
+ logging.info(f"Embedding stored successfully for {doc_id}")
176
+
177
+ except Exception as e:
178
+ logging.error(f"Error storing embeddings in ChromaDB: {str(e)}")
179
+ raise
180
+
181
+
182
+ # Function to perform vector search using ChromaDB + Keywords from the media_db
183
+ def vector_search(collection_name: str, query: str, k: int = 10) -> List[Dict[str, Any]]:
184
+ try:
185
+ query_embedding = create_embedding(query, embedding_provider, embedding_model, embedding_api_url)
186
+ collection = chroma_client.get_collection(name=collection_name)
187
+ results = collection.query(
188
+ query_embeddings=[query_embedding],
189
+ n_results=k,
190
+ include=["documents", "metadatas"]
191
+ )
192
+ return [{"content": doc, "metadata": meta} for doc, meta in zip(results['documents'][0], results['metadatas'][0])]
193
+ except Exception as e:
194
+ logging.error(f"Error in vector_search: {str(e)}")
195
+ raise
196
+
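# Usage sketch (illustrative, not from the commit):
#   hits = vector_search("all_content_embeddings", "solar power trends", k=5)
#   for hit in hits:
#       print(hit["metadata"].get("media_id"), hit["content"][:80])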
197
+ def schedule_embedding(media_id: int, content: str, media_name: str, summary: str):
198
+ try:
199
+ chunks = chunk_for_embedding(content, media_name, summary, chunk_options)
200
+ texts = [chunk['text'] for chunk in chunks]
201
+ embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
202
+ ids = [f"{media_id}_chunk_{i}" for i in range(len(chunks))]
203
+ metadatas = [{
204
+ "media_id": str(media_id),
205
+ "chunk_index": i,
206
+ "total_chunks": len(chunks),
207
+ "start_index": chunk['metadata']['start_index'],
208
+ "end_index": chunk['metadata']['end_index'],
209
+ "file_name": media_name,
210
+ "relative_position": chunk['metadata']['relative_position']
211
+ } for i, chunk in enumerate(chunks, 1)]
212
+
213
+ store_in_chroma("all_content_embeddings", texts, embeddings, ids, metadatas)
214
+
215
+ except Exception as e:
216
+ logging.error(f"Error scheduling embedding for media_id {media_id}: {str(e)}")
217
+
218
+
219
+ # Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
220
+ # def process_and_store_content(content: str, collection_name: str, media_id: int):
221
+ # # Process the content into chunks
222
+ # chunks = improved_chunking_process(content, chunk_options)
223
+ # texts = [chunk['text'] for chunk in chunks]
224
+ #
225
+ # # Generate embeddings for each chunk
226
+ # embeddings = [create_embedding(text) for text in texts]
227
+ #
228
+ # # Create unique IDs for each chunk using the media_id and chunk index
229
+ # ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]
230
+ #
231
+ # # Store the texts, embeddings, and IDs in ChromaDB
232
+ # store_in_chroma(collection_name, texts, embeddings, ids)
233
+ #
234
+ # # Store the chunk metadata in SQLite
235
+ # for i, chunk in enumerate(chunks):
236
+ # add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i])
237
+ #
238
+ # # Update the FTS table
239
+ # update_fts_for_media(media_id)
240
+
241
+
242
+ #
243
+ # End of Functions for ChromaDB
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  #######################################################################################################################
App_Function_Libraries/RAG/Embeddings_Create.py CHANGED
@@ -1,168 +1,224 @@
- # Embeddings_Create.py
- # Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
- #
- # Imports:
- import logging
- from typing import List, Dict, Any
-
- import numpy as np
- #
- # 3rd-Party Imports:
- import requests
- from transformers import AutoTokenizer, AutoModel
- import torch
- #
- # Local Imports:
- from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
- from App_Function_Libraries.Summarization_General_Lib import summarize
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
- from App_Function_Libraries.Chunk_Lib import chunk_options, improved_chunking_process#, determine_chunk_position
- #
- #
- #######################################################################################################################
- #
- # Functions:
-
- # FIXME - Add all globals to summarize.py
- loaded_config = load_comprehensive_config()
- embedding_provider = 'openai'
-
- embedding_model = 'text-embedding-3-large'
- embedding_api_url = 'https://127.0.0.1'
- embedding_api_key = 'busted_api_key_lol'
-
- # Embedding Chunking Settings
- chunk_size = '500'
- overlap = '200'
-
-
- # FIXME - Add logging
-
- # FIXME - refactor/setup to use config file & perform chunking
- def create_embedding(text: str, provider: str, model: str, api_url: str = None, api_key: str = None) -> List[float]:
- try:
- if provider == 'openai':
- embedding = get_openai_embeddings(text, model)
- elif provider == 'local':
- embedding = create_local_embedding(text, model, api_url, api_key)
- elif provider == 'huggingface':
- embedding = create_huggingface_embedding(text, model)
- elif provider == 'llamacpp':
- embedding = create_llamacpp_embedding(text, api_url)
- else:
- raise ValueError(f"Unsupported embedding provider: {provider}")
-
- if isinstance(embedding, np.ndarray):
- embedding = embedding.tolist()
- elif isinstance(embedding, torch.Tensor):
- embedding = embedding.detach().cpu().numpy().tolist()
-
- return embedding
-
- except Exception as e:
- logging.error(f"Error creating embedding: {str(e)}")
- raise
-
-
- def create_huggingface_embedding(text: str, model: str) -> List[float]:
- tokenizer = AutoTokenizer.from_pretrained(model)
- model = AutoModel.from_pretrained(model)
-
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
- with torch.no_grad():
- outputs = model(**inputs)
-
- embeddings = outputs.last_hidden_state.mean(dim=1)
- return embeddings[0].tolist()
-
-
- # FIXME
- def create_stella_embeddings(text: str) -> List[float]:
- if embedding_provider == 'local':
- # Load the model and tokenizer
- tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
- model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")
-
- # Tokenize and encode the text
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-
- # Generate embeddings
- with torch.no_grad():
- outputs = model(**inputs)
-
- # Use the mean of the last hidden state as the sentence embedding
- embeddings = outputs.last_hidden_state.mean(dim=1)
-
- return embeddings[0].tolist() # Convert to list for consistency
- elif embedding_provider == 'openai':
- return get_openai_embeddings(text, embedding_model)
- else:
- raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
-
-
- def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
- response = requests.post(
- api_url,
- json={"input": text}
- )
- response.raise_for_status()
- return response.json()['embedding']
-
-
- def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
- response = requests.post(
- api_url,
- json={"text": text, "model": model},
- headers={"Authorization": f"Bearer {api_key}"}
- )
- response.raise_for_status()
- return response.json().get('embedding', None)
-
-
- def chunk_for_embedding(text: str, file_name: str, api_name, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
- options = chunk_options.copy()
- if custom_chunk_options:
- options.update(custom_chunk_options)
-
-
- # FIXME
- if api_name is not None:
- # Generate summary of the full document
- full_summary = summarize(text, None, api_name, None, None, None)
- else:
- full_summary = "Full document summary not available."
-
- chunks = improved_chunking_process(text, options)
- total_chunks = len(chunks)
-
- chunked_text_with_headers = []
- for i, chunk in enumerate(chunks, 1):
- chunk_text = chunk['text']
- chunk_position = 1#DIRTY HACK #determine_chunk_position(chunk['metadata']['relative_position'])
-
- chunk_header = f"""
- Original Document: {file_name}
- Full Document Summary: {full_summary}
- Chunk: {i} of {total_chunks}
- Position: {chunk_position}
-
- --- Chunk Content ---
- """
-
- full_chunk_text = chunk_header + chunk_text
- chunk['text'] = full_chunk_text
- chunk['metadata']['file_name'] = file_name
- chunked_text_with_headers.append(chunk)
-
- return chunked_text_with_headers
-
-
- def create_openai_embedding(text: str, model: str) -> List[float]:
- embedding = get_openai_embeddings(text, model)
- return embedding
-
-
-
- #
- # End of File.
- #######################################################################################################################
+ # Embeddings_Create.py
+ # Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
+ #
+ # Imports:
+ import logging
+ import time
+ from functools import wraps
+ from threading import Lock, Timer
+ from typing import List
+ #
+ # 3rd-Party Imports:
+ import requests
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ #
+ # Local Imports:
+ from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+ #
+ #######################################################################################################################
+ #
+ # Functions:
+
+ # FIXME - Add all globals to summarize.py
+ loaded_config = load_comprehensive_config()
+ embedding_provider = loaded_config['Embeddings']['embedding_provider']
+ embedding_model = loaded_config['Embeddings']['embedding_model']
+ embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
+ embedding_api_key = loaded_config['Embeddings']['embedding_api_key']
+
+ # Embedding Chunking Settings
+ chunk_size = loaded_config['Embeddings']['chunk_size']
+ overlap = loaded_config['Embeddings']['overlap']
+
+
+ # FIXME - Add logging
+
+
+ class HuggingFaceEmbedder:
+ def __init__(self, model_name, timeout_seconds=120): # Default timeout of 2 minutes
+ self.model_name = model_name
+ self.tokenizer = None
+ self.model = None
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.timeout_seconds = timeout_seconds
+ self.last_used_time = 0
+ self.unload_timer = None
+
+ def load_model(self):
+ if self.model is None:
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+ self.model = AutoModel.from_pretrained(self.model_name)
+ self.model.to(self.device)
+ self.last_used_time = time.time()
+ self.reset_timer()
+
+ def unload_model(self):
+ if self.model is not None:
+ del self.model
+ del self.tokenizer
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ self.model = None
+ self.tokenizer = None
+ if self.unload_timer:
+ self.unload_timer.cancel()
+
+ def reset_timer(self):
+ if self.unload_timer:
+ self.unload_timer.cancel()
+ self.unload_timer = Timer(self.timeout_seconds, self.unload_model)
+ self.unload_timer.start()
+
+ def create_embeddings(self, texts):
+ self.load_model()
+ inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ with torch.no_grad():
+ outputs = self.model(**inputs)
+ embeddings = outputs.last_hidden_state.mean(dim=1)
+ return embeddings.cpu().numpy()
+
+ # Global variable to hold the embedder
+ huggingface_embedder = None
+
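+ # Illustrative usage of HuggingFaceEmbedder (sketch; the model id below is an example, not a project default):
+ # embedder = HuggingFaceEmbedder("sentence-transformers/all-MiniLM-L6-v2", timeout_seconds=60)
+ # vectors = embedder.create_embeddings(["first text", "second text"])  # numpy array, one row per input
+ # # The Timer armed in reset_timer() unloads the model (and clears the CUDA cache) after 60 idle seconds.
+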
+ class RateLimiter:
+ def __init__(self, max_calls, period):
+ self.max_calls = max_calls
+ self.period = period
+ self.calls = []
+ self.lock = Lock()
+
+ def __call__(self, func):
+ def wrapper(*args, **kwargs):
+ with self.lock:
+ now = time.time()
+ self.calls = [call for call in self.calls if call > now - self.period]
+ if len(self.calls) >= self.max_calls:
+ sleep_time = self.calls[0] - (now - self.period)
+ time.sleep(sleep_time)
+ self.calls.append(time.time())
+ return func(*args, **kwargs)
+ return wrapper
+
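+ # Sliding-window limiter: timestamps older than `period` are dropped, and when the window is
+ # full the call sleeps until the oldest timestamp ages out. Illustrative use (values are examples):
+ # @RateLimiter(max_calls=10, period=60)
+ # def call_embedding_api(...): ...
+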
+
+ def exponential_backoff(max_retries=5, base_delay=1):
+ def decorator(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ for attempt in range(max_retries):
+ try:
+ return func(*args, **kwargs)
+ except Exception as e:
+ if attempt == max_retries - 1:
+ raise
+ delay = base_delay * (2 ** attempt)
+ logging.warning(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds. Error: {str(e)}")
+ time.sleep(delay)
+ return wrapper
+ return decorator
+
+
+ # FIXME - refactor/setup to use config file & perform chunking
+ @exponential_backoff()
+ @RateLimiter(max_calls=50, period=60) # Adjust these values based on API limits
+ def create_embeddings_batch(texts: List[str], provider: str, model: str, api_url: str, timeout_seconds: int = 300) -> \
+ List[List[float]]:
+ global huggingface_embedder
+
+ if provider.lower() == 'huggingface':
+ if huggingface_embedder is None or huggingface_embedder.model_name != model:
+ if huggingface_embedder is not None:
+ huggingface_embedder.unload_model()
+ huggingface_embedder = HuggingFaceEmbedder(model, timeout_seconds)
+
+ embeddings = huggingface_embedder.create_embeddings(texts).tolist()
+ return embeddings
+
+ elif provider.lower() == 'openai':
+ logging.debug(f"Creating embeddings for {len(texts)} texts using OpenAI API")
+ return [create_openai_embedding(text, model) for text in texts]
+
+ elif provider.lower() == 'local':
+ response = requests.post(
+ api_url,
+ json={"texts": texts, "model": model},
+ headers={"Authorization": f"Bearer {embedding_api_key}"}
+ )
+ if response.status_code == 200:
+ return response.json()['embeddings']
+ else:
+ raise Exception(f"Error from local API: {response.text}")
+ else:
+ raise ValueError(f"Unsupported embedding provider: {provider}")
+
+ def create_embedding(text: str, provider: str, model: str, api_url: str) -> List[float]:
+ return create_embeddings_batch([text], provider, model, api_url)[0]
+
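+ # Illustrative calls (sketch; provider names match the branches above, model ids are examples):
+ # single = create_embedding("hello world", "openai", "text-embedding-3-small", api_url=None)
+ # batch = create_embeddings_batch(["a", "b"], "huggingface", "sentence-transformers/all-MiniLM-L6-v2", api_url=None)
+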
+ # FIXME
+ def create_stella_embeddings(text: str) -> List[float]:
+ if embedding_provider == 'local':
+ # Load the model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
+ model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")
+
+ # Tokenize and encode the text
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+
+ # Generate embeddings
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # Use the mean of the last hidden state as the sentence embedding
+ embeddings = outputs.last_hidden_state.mean(dim=1)
+
+ return embeddings[0].tolist() # Convert to list for consistency
+ elif embedding_provider == 'openai':
+ return get_openai_embeddings(text, embedding_model)
+ else:
+ raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
+
+
+ def create_openai_embedding(text: str, model: str) -> List[float]:
+ embedding = get_openai_embeddings(text, model)
+ return embedding
+
+
+
+ # Dead
+ # def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
+ # response = requests.post(
+ # api_url,
+ # json={"text": text, "model": model},
+ # headers={"Authorization": f"Bearer {api_key}"}
+ # )
+ # response.raise_for_status()
+ # return response.json().get('embedding', None)
+
+ # Dead
+ # def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
+ # response = requests.post(
+ # api_url,
+ # json={"input": text}
+ # )
+ # response.raise_for_status()
+ # return response.json()['embedding']
+
+ # Dead
+ # def create_huggingface_embedding(text: str, model: str) -> List[float]:
+ # tokenizer = AutoTokenizer.from_pretrained(model)
+ # model = AutoModel.from_pretrained(model)
+ #
+ # inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+ # with torch.no_grad():
+ # outputs = model(**inputs)
+ #
+ # embeddings = outputs.last_hidden_state.mean(dim=1)
+ # return embeddings[0].tolist()
+
+ #
+ # End of File.
+ #######################################################################################################################
App_Function_Libraries/RAG/RAG_Libary_2.py CHANGED
@@ -1,210 +1,340 @@
- # RAG_Library_2.py
- # Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline.
- #
- # Import necessary modules and functions
- import configparser
- import logging
- import os
- from typing import Dict, Any, List, Optional
- # Local Imports
- #from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
- from App_Function_Libraries.Article_Extractor_Lib import scrape_article
- from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
- fetch_keywords_for_media
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
- #
- # 3rd-Party Imports
- import openai
- #
- ########################################################################################################################
- #
- # Functions:
-
- # Initialize OpenAI client (adjust this based on your API key management)
- openai.api_key = "your-openai-api-key"
-
- # Get the directory of the current script
- current_dir = os.path.dirname(os.path.abspath(__file__))
- # Construct the path to the config file
- config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
- # Read the config file
- config = configparser.ConfigParser()
- # Read the configuration file
- config.read('config.txt')
-
-
-
-
- # RAG Search with keyword filtering
- def enhanced_rag_pipeline(query: str, api_choice: str, keywords: str = None) -> Dict[str, Any]:
- try:
- # Load embedding provider from config, or fallback to 'openai'
- embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
-
- # Log the provider used
- logging.debug(f"Using embedding provider: {embedding_provider}")
-
- # Process keywords if provided
- keyword_list = [k.strip().lower() for k in keywords.split(',')] if keywords else []
- logging.debug(f"enhanced_rag_pipeline - Keywords: {keyword_list}")
-
- # Fetch relevant media IDs based on keywords if keywords are provided
- relevant_media_ids = fetch_relevant_media_ids(keyword_list) if keyword_list else None
- logging.debug(f"enhanced_rag_pipeline - relevant media IDs: {relevant_media_ids}")
-
- # Perform vector search
- vector_results = perform_vector_search(query, relevant_media_ids)
- logging.debug(f"enhanced_rag_pipeline - Vector search results: {vector_results}")
-
- # Perform full-text search
- fts_results = perform_full_text_search(query, relevant_media_ids)
- logging.debug(f"enhanced_rag_pipeline - Full-text search results: {fts_results}")
-
- # Combine results
- all_results = vector_results + fts_results
- # FIXME
- if not all_results:
- logging.info(f"No results found. Query: {query}, Keywords: {keywords}")
- return {
- "answer": "I couldn't find any relevant information based on your query and keywords.",
- "context": ""
- }
-
- # FIXME - Apply Re-Ranking of results here
- apply_re_ranking = False
- if apply_re_ranking:
- # Implement re-ranking logic here
- pass
- # Extract content from results
- context = "\n".join([result['content'] for result in all_results[:10]]) # Limit to top 10 results
- logging.debug(f"Context length: {len(context)}")
- logging.debug(f"Context: {context[:200]}")
- # Generate answer using the selected API
- answer = generate_answer(api_choice, context, query)
-
- return {
- "answer": answer,
- "context": context
- }
- except Exception as e:
- logging.error(f"Error in enhanced_rag_pipeline: {str(e)}")
- return {
- "answer": "An error occurred while processing your request.",
- "context": ""
- }
-
-
-
- def generate_answer(api_choice: str, context: str, query: str) -> str:
- logging.debug("Entering generate_answer function")
- config = load_comprehensive_config()
- logging.debug(f"Config sections: {config.sections()}")
- prompt = f"Context: {context}\n\nQuestion: {query}"
- if api_choice == "OpenAI":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai
- return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
- elif api_choice == "Anthropic":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic
- return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
- elif api_choice == "Cohere":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere
- return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
- elif api_choice == "Groq":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq
- return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
- elif api_choice == "OpenRouter":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter
- return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
- elif api_choice == "HuggingFace":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface
- return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
- elif api_choice == "DeepSeek":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek
- return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
- elif api_choice == "Mistral":
- from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral
- return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
- elif api_choice == "Local-LLM":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm
- return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
- elif api_choice == "Llama.cpp":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama
- return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
- elif api_choice == "Kobold":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold
- return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
- elif api_choice == "Ooba":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga
- return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
- elif api_choice == "TabbyAPI":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi
- return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
- elif api_choice == "vLLM":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm
- return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
- elif api_choice == "ollama":
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama
- return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
- else:
- raise ValueError(f"Unsupported API choice: {api_choice}")
-
-
- def perform_full_text_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
- fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
- filtered_fts_results = [
- {
- "content": result['content'],
- "metadata": {"media_id": result['id']}
- }
- for result in fts_results
- if relevant_media_ids is None or result['id'] in relevant_media_ids
- ]
- return filtered_fts_results
-
-
- def fetch_relevant_media_ids(keywords: List[str]) -> List[int]:
- relevant_ids = set()
- try:
- for keyword in keywords:
- media_ids = fetch_keywords_for_media(keyword)
- relevant_ids.update(media_ids)
- except Exception as e:
- logging.error(f"Error fetching relevant media IDs: {str(e)}")
- return list(relevant_ids)
-
-
-
-
- # Example usage:
- # 1. Initialize the system:
- # create_tables(db) # Ensure FTS tables are set up
- #
- # 2. Create ChromaDB
- # chroma_client = ChromaDBClient()
- #
- # 3. Create Embeddings
- # Store embeddings in ChromaDB
- # preprocess_all_content() or create_embeddings()
- #
- # 4. Perform RAG search across all content:
- # result = rag_search("What are the key points about climate change?")
- # print(result['answer'])
- #
- # (Extra)5. Perform RAG on a specific URL:
- # result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
- # print(result['answer'])
- #
- ########################################################################################################################
-
-
- ############################################################################################################
- #
- # ElasticSearch Retriever
-
- # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
- #
- # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query
-
- #
- # End of RAG_Library_2.py
- ############################################################################################################
+ # RAG_Library_2.py
+ # Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline.
+ #
+ # Import necessary modules and functions
+ import configparser
+ import logging
+ import os
+ from typing import Dict, Any, List, Optional
+ # Local Imports
+ from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
+ from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
+ from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
+ fetch_keywords_for_media
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+ #
+ # 3rd-Party Imports
+ import openai
+ #
+ ########################################################################################################################
+ #
+ # Functions:
+
+ # Initialize OpenAI client (adjust this based on your API key management)
+ openai.api_key = "your-openai-api-key"
+
+ # Get the directory of the current script
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ # Construct the path to the config file
+ config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
+ # Read the config file
+ config = configparser.ConfigParser()
+ # Read the configuration file
+ config.read('config.txt')
+
+ # Main RAG pipeline function
+ def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
+ try:
+ # Extract content
+ try:
+ article_data = scrape_article(url)
+ content = article_data['content']
+ title = article_data['title']
+ except Exception as e:
+ logging.error(f"Error scraping article: {str(e)}")
+ return {"error": "Failed to scrape article", "details": str(e)}
+
+ # Store the article in the database and get the media_id
+ try:
+ media_id = add_media_to_database(url, title, 'article', content)
+ except Exception as e:
+ logging.error(f"Error adding article to database: {str(e)}")
+ return {"error": "Failed to store article in database", "details": str(e)}
+
+ # Process and store content
+ collection_name = f"article_{media_id}"
+ try:
+ # FIXME
+ # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
+ # create_embeddings: bool = False, create_summary: bool = False,
+ # api_name: str = None):
+ process_and_store_content(content, collection_name, media_id, title)
+ except Exception as e:
+ logging.error(f"Error processing and storing content: {str(e)}")
+ return {"error": "Failed to process and store content", "details": str(e)}
+
+ # Perform searches
+ try:
+ vector_results = vector_search(collection_name, query, k=5)
+ fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+ except Exception as e:
+ logging.error(f"Error performing searches: {str(e)}")
+ return {"error": "Failed to perform searches", "details": str(e)}
+
+ # Combine results with error handling for missing 'content' key
+ all_results = []
+ for result in vector_results + fts_results:
+ if isinstance(result, dict) and 'content' in result:
+ all_results.append(result['content'])
+ else:
+ logging.warning(f"Unexpected result format: {result}")
+ all_results.append(str(result))
+
+ context = "\n".join(all_results)
+
+ # Generate answer using the selected API
+ try:
+ answer = generate_answer(api_choice, context, query)
+ except Exception as e:
+ logging.error(f"Error generating answer: {str(e)}")
+ return {"error": "Failed to generate answer", "details": str(e)}
+
+ return {
+ "answer": answer,
+ "context": context
+ }
+
+ except Exception as e:
+ logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
+ return {"error": "An unexpected error occurred", "details": str(e)}
+
+
+
+ # RAG Search with keyword filtering
+ def enhanced_rag_pipeline(query: str, api_choice: str, keywords: str = None) -> Dict[str, Any]:
+ try:
+ # Load embedding provider from config, or fallback to 'openai'
+ embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
+
+ # Log the provider used
+ logging.debug(f"Using embedding provider: {embedding_provider}")
+
+ # Process keywords if provided
+ keyword_list = [k.strip().lower() for k in keywords.split(',')] if keywords else []
+ logging.debug(f"enhanced_rag_pipeline - Keywords: {keyword_list}")
+
+ # Fetch relevant media IDs based on keywords if keywords are provided
+ relevant_media_ids = fetch_relevant_media_ids(keyword_list) if keyword_list else None
+ logging.debug(f"enhanced_rag_pipeline - relevant media IDs: {relevant_media_ids}")
+
+ # Perform vector search
+ vector_results = perform_vector_search(query, relevant_media_ids)
+ logging.debug(f"enhanced_rag_pipeline - Vector search results: {vector_results}")
+
+ # Perform full-text search
+ fts_results = perform_full_text_search(query, relevant_media_ids)
+ logging.debug(f"enhanced_rag_pipeline - Full-text search results: {fts_results}")
+
+ # Combine results
+ all_results = vector_results + fts_results
+ # FIXME
+ if not all_results:
+ logging.info(f"No results found. Query: {query}, Keywords: {keywords}")
+ return {
+ "answer": "I couldn't find any relevant information based on your query and keywords.",
+ "context": ""
+ }
+
+ # FIXME - Apply Re-Ranking of results here
+ apply_re_ranking = False
+ if apply_re_ranking:
+ # Implement re-ranking logic here
+ pass
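+ # Illustrative re-ranking sketch (hypothetical, not implemented in this commit): score each
+ # result by query-term overlap and sort descending before truncating to the top 10 below, e.g.:
+ # scores = [(len(set(query.lower().split()) & set(r['content'].lower().split())), r) for r in all_results]
+ # all_results = [r for _, r in sorted(scores, key=lambda s: s[0], reverse=True)]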
+ # Extract content from results
+ context = "\n".join([result['content'] for result in all_results[:10]]) # Limit to top 10 results
+ logging.debug(f"Context length: {len(context)}")
+ logging.debug(f"Context: {context[:200]}")
+ # Generate answer using the selected API
+ answer = generate_answer(api_choice, context, query)
+
+ return {
+ "answer": answer,
+ "context": context
+ }
+ except Exception as e:
+ logging.error(f"Error in enhanced_rag_pipeline: {str(e)}")
+ return {
+ "answer": "An error occurred while processing your request.",
+ "context": ""
+ }
+
+
+ def generate_answer(api_choice: str, context: str, query: str) -> str:
+ logging.debug("Entering generate_answer function")
+ config = load_comprehensive_config()
+ logging.debug(f"Config sections: {config.sections()}")
+ prompt = f"Context: {context}\n\nQuestion: {query}"
+ if api_choice == "OpenAI":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai
+ return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
+ elif api_choice == "Anthropic":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_anthropic
+ return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
+ elif api_choice == "Cohere":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_cohere
+ return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
+ elif api_choice == "Groq":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_groq
+ return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
+ elif api_choice == "OpenRouter":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openrouter
+ return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
+ elif api_choice == "HuggingFace":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_huggingface
+ return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
+ elif api_choice == "DeepSeek":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_deepseek
+ return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
+ elif api_choice == "Mistral":
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_mistral
+ return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
+ elif api_choice == "Local-LLM":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_local_llm
+ return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
+ elif api_choice == "Llama.cpp":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama
+ return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
+ elif api_choice == "Kobold":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_kobold
+ return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
+ elif api_choice == "Ooba":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_oobabooga
+ return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
+ elif api_choice == "TabbyAPI":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_tabbyapi
+ return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
+ elif api_choice == "vLLM":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_vllm
+ return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
+ elif api_choice == "ollama":
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_ollama
+ return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
+ else:
+ raise ValueError(f"Unsupported API choice: {api_choice}")
+
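+ # Design note (sketch, hypothetical names): the if/elif chain above could be collapsed into a
+ # dispatch table, e.g. handlers = {"OpenAI": (summarize_with_openai, 'openai_api_key'), ...}
+ # func, key_name = handlers[api_choice]; return func(config['API'][key_name], prompt, "")
+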
+ # Function to preprocess and store all existing content in the database
+ def preprocess_all_content():
+ unprocessed_media = get_unprocessed_media()
+ for row in unprocessed_media:
+ media_id = row[0]
+ content = row[1]
+ media_type = row[2]
+ collection_name = f"{media_type}_{media_id}"
+ # FIXME
+ # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
+ # create_embeddings: bool = False, create_summary: bool = False,
+ # api_name: str = None):
+ process_and_store_content(content, collection_name, media_id, "")
+
+
+ def perform_vector_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
+ all_collections = chroma_client.list_collections()
+ vector_results = []
+ for collection in all_collections:
+ collection_results = vector_search(collection.name, query, k=5)
+ filtered_results = [
+ result for result in collection_results
+ if relevant_media_ids is None or result['metadata'].get('media_id') in relevant_media_ids
+ ]
+ vector_results.extend(filtered_results)
+ return vector_results
+
+
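+ # Note: perform_vector_search fans the query out to every ChromaDB collection; an illustrative
+ # single-collection call (the collection name is an example) would be:
+ # vector_search("article_42", query, k=5)
+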
+ def perform_full_text_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
+ fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+ filtered_fts_results = [
+ {
+ "content": result['content'],
+ "metadata": {"media_id": result['id']}
+ }
+ for result in fts_results
+ if relevant_media_ids is None or result['id'] in relevant_media_ids
+ ]
+ return filtered_fts_results
+
+
+ def fetch_relevant_media_ids(keywords: List[str]) -> List[int]:
+ relevant_ids = set()
+ try:
+ for keyword in keywords:
+ media_ids = fetch_keywords_for_media(keyword)
+ relevant_ids.update(media_ids)
+ except Exception as e:
+ logging.error(f"Error fetching relevant media IDs: {str(e)}")
+ return list(relevant_ids)
+
+
+ def filter_results_by_keywords(results: List[Dict[str, Any]], keywords: List[str]) -> List[Dict[str, Any]]:
+ if not keywords:
+ return results
+
+ filtered_results = []
+ for result in results:
+ try:
+ metadata = result.get('metadata', {})
+ if metadata is None:
+ logging.warning(f"No metadata found for result: {result}")
+ continue
+ if not isinstance(metadata, dict):
+ logging.warning(f"Unexpected metadata type: {type(metadata)}. Expected dict.")
+ continue
+
+ media_id = metadata.get('media_id')
+ if media_id is None:
+ logging.warning(f"No media_id found in metadata: {metadata}")
+ continue
+
+ media_keywords = fetch_keywords_for_media(media_id)
+ if any(keyword.lower() in [mk.lower() for mk in media_keywords] for keyword in keywords):
+ filtered_results.append(result)
+ except Exception as e:
+ logging.error(f"Error processing result: {result}. Error: {str(e)}")
+
+ return filtered_results
+
+ # FIXME: to be implemented
+ def extract_media_id_from_result(result: str) -> Optional[int]:
+ # Implement this function based on how you store the media_id in your results
+ # For example, if it's stored at the beginning of each result:
+ try:
+ return int(result.split('_')[0])
+ except (IndexError, ValueError):
+ logging.error(f"Failed to extract media_id from result: {result}")
+ return None
+
+
+
+ # Example usage:
+ # 1. Initialize the system:
+ # create_tables(db) # Ensure FTS tables are set up
+ #
+ # 2. Create ChromaDB
+ # chroma_client = ChromaDBClient()
+ #
+ # 3. Create Embeddings
+ # Store embeddings in ChromaDB
+ # preprocess_all_content() or create_embeddings()
+ #
+ # 4. Perform RAG search across all content:
+ # result = rag_search("What are the key points about climate change?")
+ # print(result['answer'])
+ #
+ # (Extra)5. Perform RAG on a specific URL:
+ # result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
+ # print(result['answer'])
+ #
+ ########################################################################################################################
+
+
+ ############################################################################################################
+ #
+ # ElasticSearch Retriever
+
+ # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
+ #
+ # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query
+
+ #
+ # End of RAG_Library_2.py
+ ############################################################################################################
App_Function_Libraries/Summarization/Summarization_General_Lib.py CHANGED
@@ -710,11 +710,9 @@ def summarize_with_openrouter(api_key, input_data, custom_prompt_arg, temp=None,
 
  def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None):
  loaded_config_data = load_and_log_configs()
- global huggingface_api_key
  logging.debug("HuggingFace: Summarization process starting...")
  try:
  logging.debug("HuggingFace: Loading and validating configurations")
- loaded_config_data = load_and_log_configs()
  if loaded_config_data is None:
  logging.error("Failed to load configuration data")
  huggingface_api_key = None
@@ -726,6 +724,7 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None
  else:
  # If no parameter is provided, use the key from the config
  huggingface_api_key = loaded_config_data['api_keys'].get('huggingface')
  if huggingface_api_key:
  logging.info("HuggingFace: Using API key from config file")
  else:
@@ -738,7 +737,6 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None
  # FIXME
  # For example: raise ValueError("No valid Anthropic API key available")
 
-
  logging.debug(f"HuggingFace: Using API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")
 
  if isinstance(input_data, str) and os.path.isfile(input_data):
@@ -775,21 +773,24 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None
  if temp is None:
  temp = 0.1
  temp = float(temp)
- huggingface_prompt = f"{text}\n\n\n\n{custom_prompt_arg}"
  logging.debug(f"huggingface: Prompt being sent is {huggingface_prompt}")
  data = {
- "inputs": text,
- "parameters": {"max_length": 512, "min_length": 100} # You can adjust max_length and min_length as needed
  }
 
  logging.debug("huggingface: Submitting request...")
  response = requests.post(API_URL, headers=headers, json=data)
 
  if response.status_code == 200:
- summary = response.json()[0]['generated_text'].strip()
  logging.debug("huggingface: Summarization successful")
- print("Summarization successful.")
- return summary
  else:
  logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
  return f"Failed to process summary, status code {response.status_code}: {response.text}"
 
 
  def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None):
  loaded_config_data = load_and_log_configs()
  logging.debug("HuggingFace: Summarization process starting...")
  try:
  logging.debug("HuggingFace: Loading and validating configurations")
  if loaded_config_data is None:
  logging.error("Failed to load configuration data")
  huggingface_api_key = None
  else:
  # If no parameter is provided, use the key from the config
  huggingface_api_key = loaded_config_data['api_keys'].get('huggingface')
+ logging.debug(f"HuggingFace: API key from config: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")
  if huggingface_api_key:
  logging.info("HuggingFace: Using API key from config file")
  else:
  # FIXME
  # For example: raise ValueError("No valid Anthropic API key available")
 
  logging.debug(f"HuggingFace: Using API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")
 
  if isinstance(input_data, str) and os.path.isfile(input_data):
  if temp is None:
  temp = 0.1
  temp = float(temp)
+ huggingface_prompt = f"{custom_prompt_arg}\n\n\n{text}"
  logging.debug(f"huggingface: Prompt being sent is {huggingface_prompt}")
  data = {
+ "inputs": huggingface_prompt,
+ "max_tokens": 4096,
+ "stream": False,
+ "temperature": temp
  }
 
  logging.debug("huggingface: Submitting request...")
  response = requests.post(API_URL, headers=headers, json=data)
 
  if response.status_code == 200:
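+ # The parsing below assumes the HF Inference API text-generation response shape: [{"generated_text": ...}]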
+ print(response.json())
+ chat_response = response.json()[0]['generated_text'].strip()
  logging.debug("huggingface: Summarization successful")
+ print("Chat request successful.")
+ return chat_response
  else:
  logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
  return f"Failed to process summary, status code {response.status_code}: {response.text}"
App_Function_Libraries/Utils/Utils.py CHANGED
@@ -18,6 +18,7 @@
  #
  #
  ####################
  # Import necessary libraries
  import configparser
  import hashlib
@@ -29,14 +30,15 @@ import time
  from datetime import timedelta
  from typing import Union, AnyStr
  from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
-
  import requests
  import unicodedata
  from tqdm import tqdm
-
  #######################################################################################################################
- # Function Definitions
  #
 
  def extract_text_from_segments(segments):
  logging.debug(f"Segments received: {segments}")
@@ -63,10 +65,6 @@ def extract_text_from_segments(segments):
  logging.error(f"Unable to extract text from segments: {segments}")
  return "Error: Unable to extract transcription"
 
- def import_data(file):
- # Implement this function to import data from a file
- pass
-
  #
  #
  #######################
@@ -124,23 +122,35 @@ def load_comprehensive_config():
 
 
  def get_project_root():
- """Get the project root directory."""
- return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
  def get_database_dir():
- return get_project_relative_path('Databases')
 
  def get_database_path(db_name: Union[str, os.PathLike[AnyStr]]) -> str:
  """Get the full path for a database file."""
- return os.path.join(get_database_dir(), str(db_name))
 
  def get_project_relative_path(relative_path: Union[str, os.PathLike[AnyStr]]) -> str:
  """Convert a relative path to a path relative to the project root."""
- return os.path.join(get_project_root(), str(relative_path))
 
  def get_chromadb_path():
- project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- return os.path.join(project_root, 'Databases', 'chroma_db')
 
  def ensure_directory_exists(path):
  """Ensure that a directory exists, creating it if necessary."""
@@ -676,3 +686,18 @@ def get_db_config():
  #
  # End of DB Config Loading
  #######################################################################################################################
  #
  #
  ####################
+ #
  # Import necessary libraries
  import configparser
  import hashlib
  from datetime import timedelta
  from typing import Union, AnyStr
  from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
+ #
+ # Non-Local Imports
  import requests
  import unicodedata
  from tqdm import tqdm
+ #
  #######################################################################################################################
  #
+ # Function Definitions
 
  def extract_text_from_segments(segments):
  logging.debug(f"Segments received: {segments}")
  logging.error(f"Unable to extract text from segments: {segments}")
  return "Error: Unable to extract transcription"
 
  #
  #
  #######################
 
 
  def get_project_root():
+ # Get the directory of the current file (Utils.py)
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ # Go up two levels to reach the project root
+ # Assuming the structure is: project_root/App_Function_Libraries/Utils/Utils.py
+ project_root = os.path.dirname(os.path.dirname(current_dir))
+ return project_root
 
  def get_database_dir():
+ """Get the database directory (/tldw/Databases/)."""
+ db_dir = os.path.join(get_project_root(), 'Databases')
+ logging.debug(f"Database directory: {db_dir}")
+ return db_dir
 
  def get_database_path(db_name: Union[str, os.PathLike[AnyStr]]) -> str:
  """Get the full path for a database file."""
+ path = os.path.join(get_database_dir(), str(db_name))
+ logging.debug(f"Database path for {db_name}: {path}")
+ return path
 
  def get_project_relative_path(relative_path: Union[str, os.PathLike[AnyStr]]) -> str:
  """Convert a relative path to a path relative to the project root."""
+ path = os.path.join(get_project_root(), str(relative_path))
+ logging.debug(f"Project relative path for {relative_path}: {path}")
+ return path
 
  def get_chromadb_path():
+ path = os.path.join(get_project_root(), 'Databases', 'chroma_db')
+ logging.debug(f"ChromaDB path: {path}")
+ return path
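+ # Illustrative results (sketch; actual paths depend on where the repo is checked out):
+ # get_database_path("media_summary.db")  ->  <project_root>/Databases/media_summary.db
+ # get_chromadb_path()                    ->  <project_root>/Databases/chroma_db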
 
  def ensure_directory_exists(path):
  """Ensure that a directory exists, creating it if necessary."""
 
  #
  # End of DB Config Loading
  #######################################################################################################################
+
+ def format_text_with_line_breaks(text):
+ # Split the text into sentences and add line breaks
+ sentences = text.replace('. ', '.<br>').replace('? ', '?<br>').replace('! ', '!<br>')
+ return sentences
+
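+ # Example: format_text_with_line_breaks("One. Two? Three!") returns "One.<br>Two?<br>Three!"
+ # Note: the naive replace also splits on abbreviations such as "e.g. ", which is acceptable for simple HTML display.
+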
+ #######################################################################################################################
+ #
+ # File Handling Functions
+
+
+
+ #
+ # End of File Handling Functions
+ #######################################################################################################################
App_Function_Libraries/Utils/__init__.py CHANGED
@@ -1,5 +0,0 @@
- from .Utils import is_valid_url, load_and_log_configs, extract_text_from_segments, load_comprehensive_config, format_metadata_as_text, convert_to_seconds, save_to_file,\
- save_segments_to_json, download_file, create_download_directory, safe_read_file, generate_unique_filename, generate_unique_identifier, is_valid_url, verify_checksum,\
- normalize_title, clean_youtube_url, sanitize_filename, format_transcription, format_file_path, get_db_config
-
- downloaded_files = []