Commit: 45e1f81
Parent(s): 354deab
oceansweep committed: Upload 28 files
Files changed:
- App_Function_Libraries/Books/Book_Ingestion_Lib.py +170 -0
- App_Function_Libraries/Books/__init__.py +0 -0
- App_Function_Libraries/DB/DB_Manager.py +88 -4
- App_Function_Libraries/DB/SQLite_DB.py +405 -60
- App_Function_Libraries/DB/Test_SQLite_DB.py +202 -0
- App_Function_Libraries/Gradio_Related.py +383 -375
- App_Function_Libraries/MediaWiki/Media_Wiki.py +2 -1
- App_Function_Libraries/Plaintext/Plaintext_Files.py +18 -0
- App_Function_Libraries/RAG/CRAG_Pipeline.py +125 -0
- App_Function_Libraries/RAG/ChromaDB_Library.py +243 -289
- App_Function_Libraries/RAG/Embeddings_Create.py +224 -168
- App_Function_Libraries/RAG/RAG_Libary_2.py +340 -210
- App_Function_Libraries/Summarization/Summarization_General_Lib.py +10 -9
- App_Function_Libraries/Utils/Utils.py +39 -14
- App_Function_Libraries/Utils/__init__.py +0 -5
App_Function_Libraries/Books/Book_Ingestion_Lib.py
ADDED
@@ -0,0 +1,170 @@
# Book_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting book files.#
#
####################
# Function List
#
# 1. ingest_text_file(file_path, title=None, author=None, keywords=None):
# 2.
#
#
####################
#
# Import necessary libraries
import os
import re
from datetime import datetime
import logging

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
#
# Import Local
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#######################################################################################################################
# Function Definitions
#


def read_epub(file_path):
    """Read and extract text from an EPUB file."""
    book = epub.read_epub(file_path)
    chapters = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())

    text = ""
    for html_content in chapters:
        soup = BeautifulSoup(html_content, 'html.parser')
        text += soup.get_text() + "\n\n"
    return text


# Ingest a text file into the database with Title/Author/Keywords
def extract_epub_metadata(content):
    title_match = re.search(r'Title:\s*(.*?)\n', content)
    author_match = re.search(r'Author:\s*(.*?)\n', content)

    title = title_match.group(1) if title_match else None
    author = author_match.group(1) if author_match else None

    return title, author


def ingest_text_file(file_path, title=None, author=None, keywords=None):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Check if it's a converted epub and extract metadata if so
        if 'epub_converted' in (keywords or ''):
            extracted_title, extracted_author = extract_epub_metadata(content)
            title = title or extracted_title
            author = author or extracted_author

        # If title is still not provided, use the filename without extension
        if not title:
            title = os.path.splitext(os.path.basename(file_path))[0]

        # If author is still not provided, set it to 'Unknown'
        if not author:
            author = 'Unknown'

        # If keywords are not provided, use a default keyword
        if not keywords:
            keywords = 'text_file,epub_converted'
        else:
            keywords = f'text_file,epub_converted,{keywords}'

        # Add the text file to the database
        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=content,
            keywords=keywords,
            prompt='No prompt for text files',
            summary='No summary for text files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"Text file '{title}' by {author} ingested successfully."
    except Exception as e:
        logging.error(f"Error ingesting text file: {str(e)}")
        return f"Error ingesting text file: {str(e)}"


def ingest_folder(folder_path, keywords=None):
    results = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            result = ingest_text_file(file_path, keywords=keywords)
            results.append(result)


def epub_to_markdown(epub_path):
    book = epub.read_epub(epub_path)
    markdown_content = "# Table of Contents\n\n"
    chapters = []

    # Extract and format the table of contents
    toc = book.toc
    for item in toc:
        if isinstance(item, tuple):
            section, children = item
            level = 1
            markdown_content += format_toc_item(section, level)
            for child in children:
                markdown_content += format_toc_item(child, level + 1)
        else:
            markdown_content += format_toc_item(item, 1)

    markdown_content += "\n---\n\n"

    # Process each chapter
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapter_content = item.get_content().decode('utf-8')
            soup = BeautifulSoup(chapter_content, 'html.parser')

            # Extract chapter title
            title = soup.find(['h1', 'h2', 'h3'])
            if title:
                chapter_title = title.get_text()
                markdown_content += f"# {chapter_title}\n\n"

            # Process chapter content
            for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
                if elem.name.startswith('h'):
                    level = int(elem.name[1])
                    markdown_content += f"{'#' * level} {elem.get_text()}\n\n"
                elif elem.name == 'p':
                    markdown_content += f"{elem.get_text()}\n\n"
                elif elem.name in ['ul', 'ol']:
                    for li in elem.find_all('li'):
                        markdown_content += f"- {li.get_text()}\n"
                    markdown_content += "\n"

            markdown_content += "---\n\n"

    return markdown_content


def format_toc_item(item, level):
    return f"{' ' * (level - 1)}- [{item.title}](#{slugify(item.title)})\n"


def slugify(text):
    return re.sub(r'[\W_]+', '-', text.lower())

#
# End of Function Definitions
#######################################################################################################################
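For orientation only (this sketch is not part of the commit), the new library could be driven roughly as follows. The file names are hypothetical, and the ingest call assumes the repository root is on PYTHONPATH and the application's SQLite database has already been initialized so that add_media_with_keywords can write to it.

# Illustrative usage sketch, not part of the commit.
from App_Function_Libraries.Books.Book_Ingestion_Lib import epub_to_markdown, ingest_text_file

epub_path = "sample_book.epub"   # hypothetical input file
txt_path = "sample_book.txt"     # hypothetical output file

# Convert the EPUB to Markdown, write it out, then ingest the plain-text copy.
markdown = epub_to_markdown(epub_path)
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(markdown)

print(ingest_text_file(txt_path, keywords="example"))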
App_Function_Libraries/Books/__init__.py
ADDED
File without changes
App_Function_Libraries/DB/DB_Manager.py
CHANGED
@@ -5,8 +5,8 @@
import configparser
import os
import logging
-from typing import Tuple, List, Union, Dict
import time
+from typing import Tuple, List, Union, Dict
#
# 3rd-Party Libraries
from elasticsearch import Elasticsearch
@@ -64,7 +64,11 @@ from App_Function_Libraries.DB.SQLite_DB import (
    get_paginated_files as sqlite_get_paginated_files, get_media_title as sqlite_get_media_title, \
    get_all_content_from_database as sqlite_get_all_content_from_database,
    get_next_media_id as sqlite_get_next_media_id, \
-    batch_insert_chunks as sqlite_batch_insert_chunks, Database,
+    batch_insert_chunks as sqlite_batch_insert_chunks, Database, save_workflow_chat_to_db as sqlite_save_workflow_chat_to_db, \
+    get_workflow_chat as sqlite_get_workflow_chat, update_media_content_with_version as sqlite_update_media_content_with_version, \
+    check_existing_media as sqlite_check_existing_media, get_all_document_versions as sqlite_get_all_document_versions, \
+    fetch_paginated_data as sqlite_fetch_paginated_data, get_latest_transcription as sqlite_get_latest_transcription, \
+
)
#
# Local Imports
@@ -320,6 +324,19 @@ def add_media_to_database(*args, **kwargs):
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_media_to_database not yet implemented")

+def check_existing_media(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_check_existing_media(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of check_existing_media not yet implemented")
+
+def update_media_content_with_version(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_update_media_content_with_version(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of update_media_content not yet implemented")

def import_obsidian_note_to_db(*args, **kwargs):
    if db_type == 'sqlite':
@@ -501,6 +518,24 @@ def mark_as_trash(media_id: int) -> None:
    else:
        raise ValueError(f"Unsupported database type: {db_type}")

+
+def get_latest_transcription(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_get_latest_transcription(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of get_latest_transcription not yet implemented")
+
+def fetch_paginated_data(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_fetch_paginated_data(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of fetch_paginated_data not yet implemented")
+    else:
+        raise ValueError(f"Unsupported database type: {db_type}")
+
+
def get_media_content(media_id: int) -> str:
    if db_type == 'sqlite':
        return sqlite_get_media_content(media_id)
@@ -541,6 +576,29 @@ def get_specific_summary(summary_id: int) -> Dict:
    else:
        raise ValueError(f"Unsupported database type: {db_type}")

+def fetch_item_details_single(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_fetch_item_details(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of fetch_item_details not yet implemented")
+    else:
+        raise ValueError(f"Unsupported database type: {db_type}")
+
+def get_all_document_versions(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_get_all_document_versions(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of get_all_document_versions not yet implemented")
+    else:
+        raise ValueError(f"Unsupported database type: {db_type}")
+#
+#
+############################################################################################################
+#
+# Prompt Functions:
+
def get_media_prompts(media_id: int) -> List[Dict]:
    if db_type == 'sqlite':
        return sqlite_get_media_prompts(media_id)
@@ -790,9 +848,35 @@ def get_document_version(*args, **kwargs):
# End of Document Versioning Functions
############################################################################################################

+
+############################################################################################################
+#
+# Workflow Functions
+
+def get_workflow_chat(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_get_workflow_chat(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of get_workflow_chat not yet implemented")
+
+
+def save_workflow_chat_to_db(*args, **kwargs):
    if db_type == 'sqlite':
+        # FIXME
+        return sqlite_save_workflow_chat_to_db(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of save_workflow_chat_to_db not yet implemented")
+
+#
+# End of Workflow Functions
+############################################################################################################
+
+# Dead code FIXME
+# def close_connection():
+#     if db_type == 'sqlite':
+#         db.get_connection().close()

#
# End of file
App_Function_Libraries/DB/SQLite_DB.py
CHANGED
@@ -46,6 +46,7 @@ import configparser
#
# Import necessary libraries
import csv
+import hashlib
import html
import logging
import os
@@ -53,9 +54,13 @@ import queue
import re
import shutil
import sqlite3
+import threading
import traceback
+from contextlib import contextmanager
from datetime import datetime, timedelta
from typing import List, Tuple, Dict, Any, Optional
+from urllib.parse import quote
+
# Local Libraries
from App_Function_Libraries.Utils.Utils import get_project_relative_path, get_database_path, \
    get_database_dir
@@ -219,39 +224,62 @@ class DatabaseError(Exception):
class InputError(Exception):
    pass

+
class Database:
    def __init__(self, db_name='media_summary.db'):
        self.db_path = get_database_path(db_name)
+        self.timeout = 10.0
+        self._local = threading.local()

+    @contextmanager
    def get_connection(self):
+        if not hasattr(self._local, 'connection') or self._local.connection is None:
+            self._local.connection = sqlite3.connect(self.db_path, timeout=self.timeout)
+            self._local.connection.isolation_level = None  # This enables autocommit mode
+        yield self._local.connection
+
+    def close_connection(self):
+        if hasattr(self._local, 'connection') and self._local.connection:
+            self._local.connection.close()
+            self._local.connection = None
+
+    @contextmanager
+    def transaction(self):
        with self.get_connection() as conn:
            try:
+                conn.execute("BEGIN")
+                yield conn
+                conn.execute("COMMIT")
+            except Exception:
+                conn.execute("ROLLBACK")
+                raise
+
+    def execute_query(self, query: str, params: Tuple = ()) -> Any:
+        with self.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute(query, params)
+            if query.strip().upper().startswith("SELECT"):
+                return cursor.fetchall()
+            else:
+                return cursor.rowcount

    def execute_many(self, query: str, params_list: List[Tuple]) -> None:
        with self.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.executemany(query, params_list)
+
+    def table_exists(self, table_name: str) -> bool:
+        query = 'SELECT name FROM sqlite_master WHERE type="table" AND name=?'
+        result = self.execute_query(query, (table_name,))
+        return bool(result)

db = Database()

+# Usage example:
+if db.table_exists('DocumentVersions'):
+    logging.info("DocumentVersions table exists")
+else:
+    logging.error("DocumentVersions table does not exist")


# Function to create tables with the new media schema
@@ -399,7 +427,6 @@ def create_tables(db) -> None:
    'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_media_id ON UnvectorizedMediaChunks(media_id)',
    'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_is_processed ON UnvectorizedMediaChunks(is_processed)',
    'CREATE INDEX IF NOT EXISTS idx_unvectorized_media_chunks_chunk_type ON UnvectorizedMediaChunks(chunk_type)',
-    # CREATE UNIQUE INDEX statements
    'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_url ON Media(url)',
    'CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_keyword ON MediaKeywords(media_id, keyword_id)',
    'CREATE INDEX IF NOT EXISTS idx_document_versions_media_id ON DocumentVersions(media_id)',
@@ -426,6 +453,14 @@ def create_tables(db) -> None:

create_tables(db)

+#
+# End of DB Setup Functions
+#######################################################################################################################
+
+
+#######################################################################################################################
+#
+# Media-related Functions

def check_media_exists(title: str, url: str) -> Optional[int]:
    try:
@@ -560,11 +595,20 @@ def add_keyword(keyword: str) -> int:
    with db.get_connection() as conn:
        cursor = conn.cursor()
        try:
+            # Insert into Keywords table
            cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
+
+            # Get the keyword_id (whether it was just inserted or already existed)
            cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
            keyword_id = cursor.fetchone()[0]
+
+            # Check if the keyword exists in keyword_fts
+            cursor.execute('SELECT rowid FROM keyword_fts WHERE rowid = ?', (keyword_id,))
+            if not cursor.fetchone():
+                # If it doesn't exist in keyword_fts, insert it
+                cursor.execute('INSERT OR IGNORE INTO keyword_fts (rowid, keyword) VALUES (?, ?)', (keyword_id, keyword))
+
+            logging.info(f"Keyword '{keyword}' added or updated with ID: {keyword_id}")
            conn.commit()
            return keyword_id
        except sqlite3.IntegrityError as e:
@@ -575,6 +619,7 @@ def add_keyword(keyword: str) -> int:
        raise DatabaseError(f"Error adding keyword: {e}")

 
+
# Function to delete a keyword
def delete_keyword(keyword: str) -> str:
    keyword = keyword.strip().lower()
@@ -881,10 +926,12 @@ def browse_items(search_query, search_type):


# Function to fetch item details
+
def fetch_item_details(media_id: int):
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
+            # Fetch the latest prompt and summary from MediaModifications
            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
@@ -893,18 +940,19 @@ def fetch_item_details(media_id: int):
                LIMIT 1
            """, (media_id,))
            prompt_summary_result = cursor.fetchone()
+
+            # Fetch the latest transcription
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            content_result = cursor.fetchone()

-            prompt = prompt_summary_result[0] if prompt_summary_result else ""
-            summary = prompt_summary_result[1] if prompt_summary_result else ""
-            content = content_result[0] if content_result else ""
+            prompt = prompt_summary_result[0] if prompt_summary_result else "No prompt available."
+            summary = prompt_summary_result[1] if prompt_summary_result else "No summary available."
+            content = content_result[0] if content_result else "No content available."

+            return prompt, summary, content
    except sqlite3.Error as e:
        logging.error(f"Error fetching item details: {e}")
-        return "", "", ""
+        return "Error fetching prompt.", "Error fetching summary.", "Error fetching media."

#
# End of Media-related Functions
@@ -916,7 +964,6 @@ def fetch_item_details(media_id: int):
# Media-related Functions

-
# Function to add a version of a prompt and summary
def add_media_version(conn, media_id: int, prompt: str, summary: str) -> None:
    try:
@@ -1113,12 +1160,22 @@ def is_valid_date(date_string: str) -> bool:
    except ValueError:
        return False

+
+
+
+def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video', overwrite=False):
    db = Database()
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()

+            # Generate URL if not provided
+            if not url:
+                title = info_dict.get('title', 'Untitled')
+                url_hash = hashlib.md5(f"{title}{media_type}".encode()).hexdigest()
+                url = f"https://No-URL-Submitted.com/{media_type}/{quote(title)}-{url_hash}"
+
+
            # Extract content from segments
            if isinstance(segments, list):
                content = ' '.join([segment.get('Text', '') for segment in segments if 'Text' in segment])
@@ -1140,13 +1197,14 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
            existing_media = cursor.fetchone()

            if existing_media:
+                if overwrite:
+                    media_id = existing_media[0]
+                    cursor.execute('''
+                    UPDATE Media
+                    SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?, chunking_status = ?
+                    WHERE id = ?
+                    ''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
+                          info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
            else:
                cursor.execute('''
                INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model, chunking_status)
@@ -1186,7 +1244,8 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
            # Schedule chunking
            schedule_chunking(media_id, content, info_dict.get('title', 'Untitled'))

+            action = "updated" if existing_media and overwrite else "added"
+            return f"Media '{info_dict.get('title', 'Untitled')}' {action} successfully with URL: {url} and keywords: {', '.join(keyword_list)}. Chunking scheduled."

    except DatabaseError as e:
        logging.error(f"Database error: {e}")
@@ -1196,6 +1255,66 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
        raise DatabaseError(f"Unexpected error: {e}")


+def check_existing_media(url):
+    db = Database()
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
+            result = cursor.fetchone()
+            return {'id': result[0]} if result else None
+    except Exception as e:
+        logging.error(f"Error checking existing media: {e}")
+        return None
+
+
+# Modified update_media_content function to create a new version
+def update_media_content_with_version(media_id, info_dict, content_input, prompt_input, summary_input, whisper_model):
+    db = Database()
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+
+            # Create new document version
+            cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
+            current_version = cursor.fetchone()[0] or 0
+            new_version = current_version + 1
+
+            # Insert new version
+            cursor.execute('''
+            INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
+            VALUES (?, ?, ?, ?, ?)
+            ''', (media_id, new_version, prompt_input, summary_input, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
+
+            # Update the main content in the Media table
+            cursor.execute('''
+            UPDATE Media
+            SET content = ?, transcription_model = ?, title = ?, author = ?, ingestion_date = ?, chunking_status = ?
+            WHERE id = ?
+            ''', (content_input, whisper_model, info_dict.get('title', 'Untitled'),
+                  info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
+
+            # Update or insert into MediaModifications
+            cursor.execute('''
+            INSERT OR REPLACE INTO MediaModifications (media_id, prompt, summary, modification_date)
+            VALUES (?, ?, ?, ?)
+            ''', (media_id, prompt_input, summary_input, datetime.now().strftime('%Y-%m-%d')))
+
+            # Update full-text search index
+            cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
+                           (media_id, info_dict.get('title', 'Untitled'), content_input))
+
+            conn.commit()
+
+            # Schedule chunking
+            schedule_chunking(media_id, content_input, info_dict.get('title', 'Untitled'))
+
+            return f"Content updated successfully for media ID: {media_id}. New version: {new_version}"
+    except Exception as e:
+        logging.error(f"Error updating media content: {e}")
+        return f"Error updating content: {str(e)}"
+
+
# FIXME: This function is not complete and needs to be implemented
def schedule_chunking(media_id: int, content: str, media_name: str):
    try:
@@ -1622,13 +1741,14 @@ def fetch_item_details_single(media_id: int):
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            content_result = cursor.fetchone()

-            prompt = prompt_summary_result[0] if prompt_summary_result else ""
-            summary = prompt_summary_result[1] if prompt_summary_result else ""
-            content = content_result[0] if content_result else ""
+            prompt = prompt_summary_result[0] if prompt_summary_result else "No prompt available."
+            summary = prompt_summary_result[1] if prompt_summary_result else "No summary available."
+            content = content_result[0] if content_result else "No content available."

            return prompt, summary, content
    except sqlite3.Error as e:
+        logging.error(f"Error fetching item details: {e}")
+        return "Error fetching prompt.", "Error fetching summary.", "Error fetching content."


@@ -2045,6 +2165,22 @@ def get_transcripts(media_id):
        logging.error(f"Error in get_transcripts: {str(e)}")
        return []

+def get_latest_transcription(media_id: int):
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT transcription
+                FROM Transcripts
+                WHERE media_id = ?
+                ORDER BY created_at DESC
+                LIMIT 1
+            """, (media_id,))
+            result = cursor.fetchone()
+            return result[0] if result else "No transcription available."
+    except sqlite3.Error as e:
+        logging.error(f"Error fetching latest transcription: {e}")
+        return "Error fetching transcription."

#
# End of Functions to Compare Transcripts
@@ -2436,32 +2572,57 @@ def get_paginated_files(page: int = 1, results_per_page: int = 50) -> Tuple[List
#
# Functions to manage document versions

+
def create_document_version(media_id: int, content: str) -> int:
+    logging.info(f"Attempting to create document version for media_id: {media_id}")
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
+
+            # Start a transaction
+            cursor.execute("BEGIN EXCLUSIVE TRANSACTION")

-            INSERT INTO DocumentVersions (media_id, version_number, content)
-            VALUES (?, ?, ?)
-            ''', (media_id, new_version, content))
+            try:
+                # Verify media_id exists and get the latest version in one query
+                cursor.execute('''
+                    SELECT m.id, COALESCE(MAX(dv.version_number), 0)
+                    FROM Media m
+                    LEFT JOIN DocumentVersions dv ON m.id = dv.media_id
+                    WHERE m.id = ?
+                    GROUP BY m.id
+                ''', (media_id,))
+                result = cursor.fetchone()
+
+                if not result:
+                    raise ValueError(f"No Media entry found for id: {media_id}")
+
+                _, latest_version = result
+                new_version = latest_version + 1
+
+                logging.debug(f"Inserting new version {new_version} for media_id: {media_id}")
+
+                # Insert new version
+                cursor.execute('''
+                    INSERT INTO DocumentVersions (media_id, version_number, content)
+                    VALUES (?, ?, ?)
+                ''', (media_id, new_version, content))
+
+                # Commit the transaction
+                conn.commit()
+                logging.info(f"Successfully created document version {new_version} for media_id: {media_id}")
+                return new_version
+            except Exception as e:
+                # If any error occurs, roll back the transaction
+                conn.rollback()
+                raise e
    except sqlite3.Error as e:
+        logging.error(f"Database error creating document version: {e}")
+        logging.error(f"Error details - media_id: {media_id}, content length: {len(content)}")
+        raise DatabaseError(f"Failed to create document version: {e}")
+    except Exception as e:
+        logging.error(f"Unexpected error creating document version: {e}")
+        logging.error(f"Error details - media_id: {media_id}, content length: {len(content)}")
+        raise


def get_document_version(media_id: int, version_number: int = None) -> Dict[str, Any]:
@@ -2501,6 +2662,91 @@ def get_document_version(media_id: int, version_number: int = None) -> Dict[str,
        logging.error(error_message)
        return {'error': error_message}

+def get_all_document_versions(media_id: int) -> List[Dict[str, Any]]:
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT id, version_number, content, created_at
+                FROM DocumentVersions
+                WHERE media_id = ?
+                ORDER BY version_number DESC
+            ''', (media_id,))
+            results = cursor.fetchall()
+
+            if results:
+                return [
+                    {
+                        'id': row[0],
+                        'version_number': row[1],
+                        'content': row[2],
+                        'created_at': row[3]
+                    }
+                    for row in results
+                ]
+            else:
+                return []
+    except sqlite3.Error as e:
+        error_message = f"Error retrieving all document versions: {e}"
+        logging.error(error_message)
+        return [{'error': error_message}]
+
+def delete_document_version(media_id: int, version_number: int) -> Dict[str, Any]:
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                DELETE FROM DocumentVersions
+                WHERE media_id = ? AND version_number = ?
+            ''', (media_id, version_number))
+            conn.commit()
+
+            if cursor.rowcount > 0:
+                return {'success': f"Document version {version_number} for media_id {media_id} deleted successfully"}
+            else:
+                return {'error': f"No document version found for media_id {media_id} and version_number {version_number}"}
+    except sqlite3.Error as e:
+        error_message = f"Error deleting document version: {e}"
+        logging.error(error_message)
+        return {'error': error_message}
+
+def rollback_to_version(media_id: int, version_number: int) -> Dict[str, Any]:
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+
+            # Get the content of the version to rollback to
+            cursor.execute('''
+                SELECT content
+                FROM DocumentVersions
+                WHERE media_id = ? AND version_number = ?
+            ''', (media_id, version_number))
+            result = cursor.fetchone()
+
+            if not result:
+                return {'error': f"No document version found for media_id {media_id} and version_number {version_number}"}
+
+            rollback_content = result[0]
+
+            # Create a new version with the content of the version to rollback to
+            cursor.execute('''
+                INSERT INTO DocumentVersions (media_id, version_number, content)
+                VALUES (?, (SELECT COALESCE(MAX(version_number), 0) + 1 FROM DocumentVersions WHERE media_id = ?), ?)
+            ''', (media_id, media_id, rollback_content))
+
+            new_version_number = cursor.lastrowid
+
+            conn.commit()
+
+            return {
+                'success': f"Rolled back to version {version_number} for media_id {media_id}",
+                'new_version_number': new_version_number
+            }
+    except sqlite3.Error as e:
+        error_message = f"Error rolling back to document version: {e}"
+        logging.error(error_message)
+        return {'error': error_message}
+
#
# End of Functions to manage document versions
#######################################################################################################################
@@ -2653,3 +2899,102 @@ def update_media_table(db):
#
# End of Functions to manage media chunks
#######################################################################################################################
+
+
+#######################################################################################################################
+#
+# Workflow Functions
+
+def save_workflow_chat_to_db(chat_history, workflow_name, conversation_id=None):
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+
+            if conversation_id is None:
+                # Create a new conversation
+                conversation_name = f"{workflow_name}_Workflow_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+                cursor.execute('''
+                    INSERT INTO ChatConversations (media_id, media_name, conversation_name, created_at, updated_at)
+                    VALUES (NULL, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+                ''', (workflow_name, conversation_name))
+                conversation_id = cursor.lastrowid
+            else:
+                # Update existing conversation
+                cursor.execute('''
+                    UPDATE ChatConversations
+                    SET updated_at = CURRENT_TIMESTAMP
+                    WHERE id = ?
+                ''', (conversation_id,))
+
+            # Save messages
+            for user_msg, ai_msg in chat_history:
+                if user_msg:
+                    cursor.execute('''
+                        INSERT INTO ChatMessages (conversation_id, sender, message, timestamp)
+                        VALUES (?, 'user', ?, CURRENT_TIMESTAMP)
+                    ''', (conversation_id, user_msg))
+                if ai_msg:
+                    cursor.execute('''
+                        INSERT INTO ChatMessages (conversation_id, sender, message, timestamp)
+                        VALUES (?, 'ai', ?, CURRENT_TIMESTAMP)
+                    ''', (conversation_id, ai_msg))
+
+            conn.commit()
+
+            return conversation_id, f"Chat saved successfully! Conversation ID: {conversation_id}"
+    except Exception as e:
+        logging.error(f"Error saving workflow chat to database: {str(e)}")
+        return None, f"Error saving chat to database: {str(e)}"
+
+
+def get_workflow_chat(conversation_id):
+    """
+    Retrieve a workflow chat from the database.
+
+    Args:
+        conversation_id: ID of the conversation to retrieve
+
+    Returns:
+        tuple: (chat_history, workflow_name, status_message)
+    """
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+
+            # Get conversation details
+            cursor.execute('''
+                SELECT media_name, conversation_name FROM ChatConversations
+                WHERE id = ?
+            ''', (conversation_id,))
+            result = cursor.fetchone()
+            if not result:
+                return None, None, "Conversation not found"
+
+            workflow_name, conversation_name = result
+
+            # Get chat messages
+            cursor.execute('''
+                SELECT sender, message FROM ChatMessages
+                WHERE conversation_id = ?
+                ORDER BY timestamp
+            ''', (conversation_id,))
+            messages = cursor.fetchall()
+
+            chat_history = []
+            for sender, message in messages:
+                if sender == 'user':
+                    chat_history.append((message, None))
+                else:
+                    if chat_history and chat_history[-1][1] is None:
+                        chat_history[-1] = (chat_history[-1][0], message)
+                    else:
+                        chat_history.append((None, message))
+
+            return chat_history, workflow_name, f"Chat retrieved successfully"
+    except Exception as e:
+        logging.error(f"Error retrieving workflow chat from database: {str(e)}")
+        return None, None, f"Error retrieving chat from database: {str(e)}"
+
+#
+# End of Workflow Functions
+#######################################################################################################################
App_Function_Libraries/DB/Test_SQLite_DB.py
ADDED
@@ -0,0 +1,202 @@
# Test_SQLite_DB.py
# Description: Test file for SQLite_DB.py
#
# Usage: python -m unittest test_sqlite_db.py
#
# Imports
import unittest
import sqlite3
import threading
import time
from unittest.mock import patch
#
# Local Imports
from App_Function_Libraries.DB.SQLite_DB import Database, add_media_with_keywords, add_media_version, DatabaseError
#
#######################################################################################################################
#
# Functions:

class TestDatabase(unittest.TestCase):
    def setUp(self):
        self.db = Database(':memory:')  # Use in-memory database for testing

    def test_connection_management(self):
        with self.db.get_connection() as conn:
            self.assertIsInstance(conn, sqlite3.Connection)
            self.assertEqual(len(self.db.pool), 1)

    def test_execute_query(self):
        self.db.execute_query("CREATE TABLE test (id INTEGER PRIMARY KEY, name TEXT)")
        self.db.execute_query("INSERT INTO test (name) VALUES (?)", ("test_name",))
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM test")
            result = cursor.fetchone()
            self.assertEqual(result[0], "test_name")

    def test_execute_many(self):
        self.db.execute_query("CREATE TABLE test (id INTEGER PRIMARY KEY, name TEXT)")
        data = [("name1",), ("name2",), ("name3",)]
        self.db.execute_many("INSERT INTO test (name) VALUES (?)", data)
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM test")
            count = cursor.fetchone()[0]
            self.assertEqual(count, 3)

    def test_connection_retry(self):
        def lock_database():
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("BEGIN EXCLUSIVE TRANSACTION")
                time.sleep(2)  # Hold the lock for 2 seconds

        thread = threading.Thread(target=lock_database)
        thread.start()
        time.sleep(0.1)  # Give the thread time to acquire the lock

        with self.assertRaises(DatabaseError):
            self.db.execute_query("SELECT 1")  # This should retry and eventually fail

        thread.join()

class TestAddMediaWithKeywords(unittest.TestCase):
    def setUp(self):
        self.db = Database(':memory:')
        self.db.execute_query("""
            CREATE TABLE Media (
                id INTEGER PRIMARY KEY,
                url TEXT,
                title TEXT NOT NULL,
                type TEXT NOT NULL,
                content TEXT,
                author TEXT,
                ingestion_date TEXT,
                transcription_model TEXT
            )
        """)
        self.db.execute_query("CREATE TABLE Keywords (id INTEGER PRIMARY KEY, keyword TEXT NOT NULL UNIQUE)")
        self.db.execute_query("""
            CREATE TABLE MediaKeywords (
                id INTEGER PRIMARY KEY,
                media_id INTEGER NOT NULL,
                keyword_id INTEGER NOT NULL,
                FOREIGN KEY (media_id) REFERENCES Media(id),
                FOREIGN KEY (keyword_id) REFERENCES Keywords(id)
            )
        """)
        self.db.execute_query("""
            CREATE TABLE MediaModifications (
                id INTEGER PRIMARY KEY,
                media_id INTEGER NOT NULL,
                prompt TEXT,
                summary TEXT,
                modification_date TEXT,
                FOREIGN KEY (media_id) REFERENCES Media(id)
            )
        """)
        self.db.execute_query("""
            CREATE TABLE MediaVersion (
                id INTEGER PRIMARY KEY,
                media_id INTEGER NOT NULL,
                version INTEGER NOT NULL,
                prompt TEXT,
                summary TEXT,
                created_at TEXT NOT NULL,
                FOREIGN KEY (media_id) REFERENCES Media(id)
            )
        """)
        self.db.execute_query("CREATE VIRTUAL TABLE media_fts USING fts5(title, content)")

    @patch('App_Function_Libraries.DB.SQLite_DB.db', new_callable=lambda: Database(':memory:'))
    def test_add_new_media(self, mock_db):
        mock_db.get_connection = self.db.get_connection
        result = add_media_with_keywords(
            url="http://example.com",
            title="Test Title",
            media_type="article",
            content="Test content",
            keywords="test,keyword",
            prompt="Test prompt",
            summary="Test summary",
            transcription_model="Test model",
            author="Test Author",
            ingestion_date="2023-01-01"
        )
        self.assertIn("added/updated successfully", result)

        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM Media")
            self.assertEqual(cursor.fetchone()[0], 1)

            cursor.execute("SELECT COUNT(*) FROM Keywords")
            self.assertEqual(cursor.fetchone()[0], 2)

            cursor.execute("SELECT COUNT(*) FROM MediaKeywords")
            self.assertEqual(cursor.fetchone()[0], 2)

            cursor.execute("SELECT COUNT(*) FROM MediaModifications")
            self.assertEqual(cursor.fetchone()[0], 1)

            cursor.execute("SELECT COUNT(*) FROM MediaVersion")
            self.assertEqual(cursor.fetchone()[0], 1)

    @patch('App_Function_Libraries.DB.SQLite_DB.db', new_callable=lambda: Database(':memory:'))
    def test_update_existing_media(self, mock_db):
        mock_db.get_connection = self.db.get_connection
        add_media_with_keywords(
            url="http://example.com",
            title="Test Title",
            media_type="article",
            content="Test content",
            keywords="test,keyword",
            prompt="Test prompt",
            summary="Test summary",
            transcription_model="Test model",
            author="Test Author",
            ingestion_date="2023-01-01"
        )

        result = add_media_with_keywords(
            url="http://example.com",
            title="Updated Title",
            media_type="article",
            content="Updated content",
            keywords="test,new",
            prompt="Updated prompt",
            summary="Updated summary",
            transcription_model="Updated model",
            author="Updated Author",
            ingestion_date="2023-01-02"
        )

        self.assertIn("added/updated successfully", result)

        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM Media")
            self.assertEqual(cursor.fetchone()[0], 1)

            cursor.execute("SELECT title FROM Media")
            self.assertEqual(cursor.fetchone()[0], "Updated Title")

            cursor.execute("SELECT COUNT(*) FROM Keywords")
            self.assertEqual(cursor.fetchone()[0], 3)

            cursor.execute("SELECT COUNT(*) FROM MediaKeywords")
            self.assertEqual(cursor.fetchone()[0], 3)

            cursor.execute("SELECT COUNT(*) FROM MediaModifications")
            self.assertEqual(cursor.fetchone()[0], 2)

            cursor.execute("SELECT COUNT(*) FROM MediaVersion")
            self.assertEqual(cursor.fetchone()[0], 2)

if __name__ == '__main__':
    unittest.main()

#
# End of File
#######################################################################################################################
App_Function_Libraries/Gradio_Related.py
CHANGED
@@ -1,375 +1,383 @@
-# Gradio_Related.py
-#########################################
-# Gradio UI Functions Library
-# I fucking hate Gradio.
-# Yea, fuck Gradio. https://github.com/gradio-app/gradio/pull/8263 & https://github.com/gradio-app/gradio/issues/7968
-#
-#########################################
-#
-# Built-In Imports
-import logging
-import os
-#
-# Import 3rd-Party Libraries
-import gradio as gr
-#
-# Local Imports
-from App_Function_Libraries.DB.DB_Manager import get_db_config
-from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
+# Gradio_Related.py
+#########################################
+# Gradio UI Functions Library
+# I fucking hate Gradio.
+# Yea, fuck Gradio. https://github.com/gradio-app/gradio/pull/8263 & https://github.com/gradio-app/gradio/issues/7968
+#
+#########################################
+#
+# Built-In Imports
+import logging
+import os
+#
+# Import 3rd-Party Libraries
+import gradio as gr
+#
+# Local Imports
+from App_Function_Libraries.DB.DB_Manager import get_db_config
+from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
+from App_Function_Libraries.Gradio_UI.Book_Ingestion_tab import create_import_book_tab
+from App_Function_Libraries.Gradio_UI.Character_Interaction_tab import create_character_card_interaction_tab, \
+    create_multiple_character_chat_tab, create_narrator_controlled_conversation_tab
+from App_Function_Libraries.Gradio_UI.Chat_ui import create_chat_management_tab, \
+    create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface
+from App_Function_Libraries.Gradio_UI.Config_tab import create_config_editor_tab
+from App_Function_Libraries.Gradio_UI.Explain_summarize_tab import create_summarize_explain_tab
+from App_Function_Libraries.Gradio_UI.Export_Functionality import create_export_tab
+from App_Function_Libraries.Gradio_UI.Backup_Functionality import create_backup_tab, create_view_backups_tab, \
+    create_restore_backup_tab
+from App_Function_Libraries.Gradio_UI.Import_Functionality import create_import_single_prompt_tab, \
+    create_import_obsidian_vault_tab, create_import_item_tab, create_import_multiple_prompts_tab
+from App_Function_Libraries.Gradio_UI.Introduction_tab import create_introduction_tab
+from App_Function_Libraries.Gradio_UI.Keywords import create_view_keywords_tab, create_add_keyword_tab, \
+    create_delete_keyword_tab, create_export_keywords_tab
+from App_Function_Libraries.Gradio_UI.Live_Recording import create_live_recording_tab
+from App_Function_Libraries.Gradio_UI.Llamafile_tab import create_chat_with_llamafile_tab
+from App_Function_Libraries.Gradio_UI.MMLU_Pro_tab import create_mmlu_pro_tab
+from App_Function_Libraries.Gradio_UI.Media_edit import create_prompt_clone_tab, create_prompt_edit_tab, \
+    create_media_edit_and_clone_tab, create_media_edit_tab
+from App_Function_Libraries.Gradio_UI.Media_wiki_tab import create_mediawiki_import_tab, create_mediawiki_config_tab
+from App_Function_Libraries.Gradio_UI.PDF_ingestion_tab import create_pdf_ingestion_tab, create_pdf_ingestion_test_tab
+from App_Function_Libraries.Gradio_UI.Plaintext_tab_import import create_plain_text_import_tab
+from App_Function_Libraries.Gradio_UI.Podcast_tab import create_podcast_tab
+from App_Function_Libraries.Gradio_UI.RAG_QA_Chat_tab import create_rag_qa_chat_tab
+from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
+from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_search_tab, \
+    create_search_summaries_tab, create_search_tab
+from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_rag_tab
+from App_Function_Libraries.Gradio_UI.Embeddings_tab import create_embeddings_tab, create_view_embeddings_tab, \
+    create_purge_embeddings_tab
+from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
+    create_delete_trash_tab, create_search_and_mark_trash_tab
+from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
+    create_utilities_yt_video_tab
+from App_Function_Libraries.Gradio_UI.Video_transcription_tab import create_video_transcription_tab
+from App_Function_Libraries.Gradio_UI.View_tab import create_manage_items_tab
+from App_Function_Libraries.Gradio_UI.Website_scraping_tab import create_website_scraping_tab
+from App_Function_Libraries.Gradio_UI.Chat_Workflows import chat_workflows_tab
+from App_Function_Libraries.Gradio_UI.View_DB_Items_tab import create_prompt_view_tab, create_viewing_tab, \
+    create_view_all_with_versions_tab, create_viewing_tab
+#
+# Gradio UI Imports
+from App_Function_Libraries.Gradio_UI.Evaluations_Benchmarks_tab import create_geval_tab, create_infinite_bench_tab
+
+#
+#######################################################################################################################
+# Function Definitions
+#
+
+
+# Disable Gradio Analytics
+os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
+
+
+custom_prompt_input = None
+server_mode = False
+share_public = False
+custom_prompt_summarize_bulleted_notes = ("""
+<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
+**Bulleted Note Creation Guidelines**
+
+**Headings**:
+- Based on referenced topics, not categories like quotes or terms
+- Surrounded by **bold** formatting
+- Not listed as bullet points
+- No space between headings and list items underneath
+
+**Emphasis**:
+- **Important terms** set in bold font
+- **Text ending in a colon**: also bolded
+
+**Review**:
+- Ensure adherence to specified format
+- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
+""")
+#
+# End of globals
+#######################################################################################################################
+#
+# Start of Video/Audio Transcription and Summarization Functions
+#
+# Functions:
+# FIXME
+#
+#
+################################################################################################################
+# Functions for Re-Summarization
+#
+# Functions:
+# FIXME
+# End of Re-Summarization Functions
+#
+############################################################################################################################################################################################################################
+#
+# Explain/Summarize This Tab
+#
+# Functions:
+# FIXME
+#
+#
+############################################################################################################################################################################################################################
+#
+# Transcript Comparison Tab
+#
+# Functions:
+# FIXME
+#
+#
+###########################################################################################################################################################################################################################
+#
+# Search Tab
+#
+# Functions:
+# FIXME
+#
+# End of Search Tab Functions
+#
+##############################################################################################################################################################################################################################
+#
+# Llamafile Tab
+#
+# Functions:
+# FIXME
+#
+# End of Llamafile Tab Functions
+##############################################################################################################################################################################################################################
+#
+# Chat Interface Tab Functions
+#
+# Functions:
+# FIXME
+#
+#
+# End of Chat Interface Tab Functions
+################################################################################################################################################################################################################################
+#
+# Media Edit Tab Functions
+# Functions:
+# Fixme
+# create_media_edit_tab():
+##### Trash Tab
+# FIXME
+# Functions:
+#
+# End of Media Edit Tab Functions
+################################################################################################################
+#
+# Import Items Tab Functions
+#
+# Functions:
+#FIXME
+# End of Import Items Tab Functions
+################################################################################################################
+#
+# Export Items Tab Functions
+#
+# Functions:
+# FIXME
+#
+#
+# End of Export Items Tab Functions
+################################################################################################################
+#
+# Keyword Management Tab Functions
+#
+# Functions:
+# create_view_keywords_tab():
+# FIXME
+#
+# End of Keyword Management Tab Functions
+################################################################################################################
+#
+# Document Editing Tab Functions
+#
+# Functions:
+# #FIXME
+#
+#
+################################################################################################################
+#
+# Utilities Tab Functions
+# Functions:
+# create_utilities_yt_video_tab():
+# #FIXME
+
+#
+# End of Utilities Tab Functions
+################################################################################################################
+
+# FIXME - Prompt sample box
+#
+# # Sample data
+# prompts_category_1 = [
+#     "What are the key points discussed in the video?",
+#     "Summarize the main arguments made by the speaker.",
+#     "Describe the conclusions of the study presented."
+# ]
+#
+# prompts_category_2 = [
+#     "How does the proposed solution address the problem?",
+#     "What are the implications of the findings?",
+#     "Can you explain the theory behind the observed phenomenon?"
+# ]
+#
+# all_prompts2 = prompts_category_1 + prompts_category_2
+
+
+def launch_ui(share_public=None, server_mode=False):
+    share=share_public
+    css = """
+    .result-box {
+        margin-bottom: 20px;
+        border: 1px solid #ddd;
+        padding: 10px;
+    }
+    .result-box.error {
+        border-color: #ff0000;
+        background-color: #ffeeee;
+    }
+    .transcription, .summary {
+        max-height: 300px;
+        overflow-y: auto;
+        border: 1px solid #eee;
+        padding: 10px;
+        margin-top: 10px;
+    }
+    """
+
+    with gr.Blocks(theme='bethecloud/storj_theme',css=css) as iface:
+        db_config = get_db_config()
+        db_type = db_config['type']
+        gr.Markdown(f"# tl/dw: Your LLM-powered Research Multi-tool")
+        gr.Markdown(f"(Using {db_type.capitalize()} Database)")
+        with gr.Tabs():
+            with gr.TabItem("Transcription / Summarization / Ingestion"):
+                with gr.Tabs():
+                    create_video_transcription_tab()
+                    create_audio_processing_tab()
+                    create_podcast_tab()
+                    create_import_book_tab()
+                    create_plain_text_import_tab()
+                    create_website_scraping_tab()
+                    create_pdf_ingestion_tab()
+                    create_pdf_ingestion_test_tab()
+                    create_resummary_tab()
+                    create_summarize_explain_tab()
+                    create_live_recording_tab()
+
+            with gr.TabItem("Text Search "):
+                create_search_tab()
+                create_search_summaries_tab()
+
+            with gr.TabItem("RAG Search"):
+                create_rag_tab()
+                create_rag_qa_chat_tab()
+
+            with gr.TabItem("Chat with an LLM"):
+                create_chat_interface()
+                create_chat_interface_stacked()
+                create_chat_interface_multi_api()
+                create_chat_interface_four()
+                create_chat_with_llamafile_tab()
+                create_chat_management_tab()
+                chat_workflows_tab()
+                create_multiple_character_chat_tab()
+                create_narrator_controlled_conversation_tab()
+                create_character_card_interaction_tab()
+
+            with gr.TabItem("View DB Items"):
+                # This one works
+                create_view_all_with_versions_tab()
+                # This one is WIP
+                create_viewing_tab()
+                create_prompt_view_tab()
+
+            with gr.TabItem("Prompts"):
+                create_prompt_view_tab()
+                create_prompt_search_tab()
+                create_prompt_edit_tab()
+                create_prompt_clone_tab()
+
+
+            with gr.TabItem("Manage / Edit Existing Items"):
+                create_media_edit_tab()
+                create_manage_items_tab()
+                create_media_edit_and_clone_tab()
+                # FIXME
+                #create_compare_transcripts_tab()
+
+            with gr.TabItem("Embeddings Management"):
+                create_embeddings_tab()
+                create_view_embeddings_tab()
+                create_purge_embeddings_tab()
+
+            with gr.TabItem("Writing Tools"):
+                with gr.Tabs():
+                    from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
+                    create_document_feedback_tab()
+                    from App_Function_Libraries.Gradio_UI.Writing_tab import create_grammar_style_check_tab
+                    create_grammar_style_check_tab()
+                    from App_Function_Libraries.Gradio_UI.Writing_tab import create_tone_adjustment_tab
+                    create_tone_adjustment_tab()
+                    from App_Function_Libraries.Gradio_UI.Writing_tab import create_creative_writing_tab
+                    create_creative_writing_tab()
+                    from App_Function_Libraries.Gradio_UI.Writing_tab import create_mikupad_tab
+                    create_mikupad_tab()
+
+
+            with gr.TabItem("Keywords"):
+                create_view_keywords_tab()
+                create_add_keyword_tab()
+                create_delete_keyword_tab()
+                create_export_keywords_tab()
+
+            with gr.TabItem("Import"):
+                create_import_item_tab()
+                create_import_obsidian_vault_tab()
+                create_import_single_prompt_tab()
+                create_import_multiple_prompts_tab()
+                create_mediawiki_import_tab()
+                create_mediawiki_config_tab()
+
+            with gr.TabItem("Export"):
+                create_export_tab()
+
+            with gr.TabItem("Backup Management"):
+                create_backup_tab()
+                create_view_backups_tab()
+                create_restore_backup_tab()
+
+            with gr.TabItem("Utilities"):
+                create_utilities_yt_video_tab()
+                create_utilities_yt_audio_tab()
+                create_utilities_yt_timestamp_tab()
+
+            with gr.TabItem("Trashcan"):
+                create_search_and_mark_trash_tab()
+                create_view_trash_tab()
+                create_delete_trash_tab()
+                create_empty_trash_tab()
+
+            with gr.TabItem("Evaluations"):
+                create_geval_tab()
+                create_infinite_bench_tab()
+                create_mmlu_pro_tab()
+
+            with gr.TabItem("Introduction/Help"):
+                create_introduction_tab()
+
+            with gr.TabItem("Config Editor"):
+                create_config_editor_tab()
+
+    # Launch the interface
+    server_port_variable = 7860
+    os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
+    if share==True:
+        iface.launch(share=True)
+    elif server_mode and not share_public:
+        iface.launch(share=False, server_name="0.0.0.0", server_port=server_port_variable, )
+    else:
+        try:
+            iface.launch(share=False, server_name="0.0.0.0", server_port=server_port_variable, )
+        except Exception as e:
+            logging.error(f"Error launching interface: {str(e)}")
App_Function_Libraries/MediaWiki/Media_Wiki.py
CHANGED
@@ -23,7 +23,8 @@ from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
 # Functions:
 # Load configuration
 def load_mediawiki_import_config():
-
+    config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'Config_Files', 'mediawiki_import_config.yaml')
+    with open(config_path, 'r') as f:
         return yaml.safe_load(f)
 
 config = load_mediawiki_import_config()
App_Function_Libraries/Plaintext/Plaintext_Files.py
ADDED
@@ -0,0 +1,18 @@
+# Plaintext_Files.py
+# Description: This file contains functions for reading and writing plaintext files.
+#
+# Import necessary libraries
+import os
+import re
+from datetime import datetime
+import logging
+import tempfile
+import zipfile
+#
+# Non-Local Imports
+#
+# Local Imports
+#
+#######################################################################################################################
+#
+# Function Definitions
App_Function_Libraries/RAG/CRAG_Pipeline.py
ADDED
@@ -0,0 +1,125 @@
+# First gen
+
+# Install the necessary libraries
+# !pip install transformers
+# !pip install sentence-transformers
+# !pip install torch
+# !pip install requests
+# !pip install bs4
+
+import requests
+from bs4 import BeautifulSoup
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+from sentence_transformers import SentenceTransformer, util
+import torch
+
+# Step 1: Load Models for Summarization and Similarity
+model_name = "facebook/bart-large-cnn"  # Summarization model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+# Summarization pipeline
+summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+
+# Sentence similarity model
+similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+
+
+# Step 2: Define Retrieval Evaluator
+def evaluate_retrieval(query, retrieved_docs):
+    """
+    Evaluate the relevance of retrieved documents using cosine similarity
+    with sentence embeddings.
+    """
+    query_embedding = similarity_model.encode(query, convert_to_tensor=True)
+    doc_embeddings = similarity_model.encode(retrieved_docs, convert_to_tensor=True)
+
+    # Calculate cosine similarity between the query and each document
+    similarities = [util.pytorch_cos_sim(query_embedding, doc_embedding).item() for doc_embedding in doc_embeddings]
+
+    # Set a threshold for relevance (adjustable)
+    relevance_threshold = 0.5
+    relevance_scores = ['Correct' if sim > relevance_threshold else 'Incorrect' for sim in similarities]
+
+    return relevance_scores
+
+
+# Step 3: Knowledge Refinement (Decompose-then-Recompose)
+def decompose_then_recompose(retrieved_docs):
+    """
+    Refine the retrieved documents by summarizing their key information.
+    """
+    refined_knowledge = []
+    for doc in retrieved_docs:
+        summary = summarizer(doc, max_length=50, min_length=20, do_sample=False)[0]['summary_text']
+        refined_knowledge.append(summary)
+    return refined_knowledge
+
+
+# Step 4: Web Search for External Knowledge
+def web_search(query):
+    """
+    Perform a web search to retrieve additional external knowledge if the
+    retrieved documents are not relevant.
+    """
+    search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
+    headers = {'User-Agent': 'Mozilla/5.0'}
+    response = requests.get(search_url, headers=headers)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract URLs from search results (simplified)
+    links = []
+    for item in soup.find_all('a'):
+        link = item.get('href')
+        if link and "http" in link:
+            links.append(link)
+    return links[:5]  # Return the first 5 URLs
+
+
+# Step 5: Generate Final Output
+def generate_final_output(query, refined_knowledge):
+    """
+    Generate the final output summary using the refined knowledge.
+    """
+    combined_knowledge = " ".join(refined_knowledge)
+    final_summary = summarizer(combined_knowledge, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
+    return final_summary
+
+
+# Step 6: CRAG Workflow Integration
+def crag_workflow(query, retrieved_docs):
+    """
+    Full CRAG workflow integrating evaluation, knowledge refinement,
+    and web search to generate a robust output summary.
+    """
+    # Step 1: Evaluate retrieval
+    relevance_scores = evaluate_retrieval(query, retrieved_docs)
+
+    if 'Correct' in relevance_scores:
+        # Step 2: Decompose-then-Recompose for correct documents
+        refined_knowledge = decompose_then_recompose(
+            [doc for doc, score in zip(retrieved_docs, relevance_scores) if score == 'Correct'])
+    else:
+        # Step 3: Web search if retrieval is incorrect
+        web_results = web_search(query)
+        refined_knowledge = decompose_then_recompose(web_results)
+
+    # Step 4: Generate final output
+    final_summary = generate_final_output(query, refined_knowledge)
+
+    return final_summary
+
+
+# Example Usage
+if __name__ == "__main__":
+    # Example query and retrieved documents
+    query = "What are the latest advancements in renewable energy?"
+    retrieved_docs = [
+        "Renewable energy is becoming increasingly important in today's world...",
+        "Solar energy has seen significant advancements in the past decade...",
+        "Wind energy technology is rapidly evolving, with new innovations expected soon..."
+    ]
+
+    # Perform the CRAG workflow
+    final_summary = crag_workflow(query, retrieved_docs)
+    print("Final Summary:", final_summary)
App_Function_Libraries/RAG/ChromaDB_Library.py
CHANGED
@@ -1,290 +1,244 @@
-    collection.add(
-        documents=texts,
-        embeddings=embeddings,
-        ids=ids,
-        metadatas=[{'source': source} for source in sources]
-    )
-
-
-def check_embedding_status(selected_item):
-    if not selected_item:
-        return "Please select an item", ""
-    item_id = selected_item.split('(')[0].strip()
-    collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
-    result = collection.get(ids=[f"doc_{item_id}"])
-    if result['ids']:
-        embedding = result['embeddings'][0]
-        embedding_preview = str(embedding[:50])  # Convert first 50 elements to string
-        return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
-    else:
-        return f"No embedding found for item: {item_id}", ""
-
-
-def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
-    if not selected_item:
-        return "Please select an item"
-    item_id = selected_item.split('(')[0].strip()
-    items = get_all_content_from_database()
-    item = next((item for item in items if item['title'] == item_id), None)
-    if not item:
-        return f"Item not found: {item_id}"
-
-    try:
-        if api_choice == "OpenAI":
-            embedding = create_embedding(item['content'])
-        else:  # Llama.cpp
-            embedding = create_embedding(item['content'])
-
-        collection_name = "all_content_embeddings"
-        store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"])
-        return f"New embedding created and stored for item: {item_id}"
-    except Exception as e:
-        return f"Error creating embedding: {str(e)}"
-
-
-#
-# End of Functions for ChromaDB
+# ChromaDB_Library.py
+# Description: Functions for managing embeddings in ChromaDB
+#
+# Imports:
+import logging
+from typing import List, Dict, Any
+# 3rd-Party Imports:
+import chromadb
+from chromadb import Settings
+from itertools import islice
+#
+# Local Imports:
+from App_Function_Libraries.Chunk_Lib import chunk_for_embedding, chunk_options
+from App_Function_Libraries.DB.SQLite_DB import process_chunks
+from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
+# FIXME - related to Chunking
+from App_Function_Libraries.RAG.Embeddings_Create import create_embedding
+from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize
+from App_Function_Libraries.Utils.Utils import get_database_path, ensure_directory_exists, \
+    load_comprehensive_config
+#
+#######################################################################################################################
+#
+# Config Settings for ChromaDB Functions
+#
+# FIXME - Refactor so that all globals are set in summarize.py
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+#
+# Load config
+config = load_comprehensive_config()
+#
+# ChromaDB settings
+chroma_db_path = config.get('Database', 'chroma_db_path', fallback=get_database_path('chroma_db'))
+ensure_directory_exists(chroma_db_path)
+chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))
+#
+# Embedding settings
+embedding_provider = config.get('Embeddings', 'embedding_provider', fallback='openai')
+embedding_model = config.get('Embeddings', 'embedding_model', fallback='text-embedding-3-small')
+embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
+embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
+#
+# End of Config Settings
+#######################################################################################################################
+#
+# Functions:
+
+def batched(iterable, n):
+    "Batch data into lists of length n. The last batch may be shorter."
+    it = iter(iterable)
+    while True:
+        batch = list(islice(it, n))
+        if not batch:
+            return
+        yield batch
+
+
+# FIXME - Fix summarization of entire document/storign in chunk issue
+# FIXME - update all uses to reflect 'api_name' parameter
+def process_and_store_content(database, content: str, collection_name: str, media_id: int, file_name: str,
+                              create_embeddings: bool = False, create_summary: bool = False, api_name: str = None,
+                              chunk_options: Dict = None, embedding_provider: str = None,
+                              embedding_model: str = None, embedding_api_url: str = None):
+    try:
+        logger.info(f"Processing content for media_id {media_id} in collection {collection_name}")
+
+        full_summary = None
+        if create_summary and api_name:
+            full_summary = summarize(content, None, api_name, None, None, None)
+
+        chunks = chunk_for_embedding(content, file_name, full_summary, chunk_options)
+
+        # Process chunks synchronously
+        process_chunks(database, chunks, media_id)
+
+        if create_embeddings:
+            texts = [chunk['text'] for chunk in chunks]
+            embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
+            ids = [f"{media_id}_chunk_{i}" for i in range(1, len(chunks) + 1)]
+            metadatas = [{
+                "media_id": str(media_id),
+                "chunk_index": i,
+                "total_chunks": len(chunks),
+                "start_index": int(chunk['metadata']['start_index']),
+                "end_index": int(chunk['metadata']['end_index']),
+                "file_name": str(file_name),
+                "relative_position": float(chunk['metadata']['relative_position'])
+            } for i, chunk in enumerate(chunks, 1)]
+
+            store_in_chroma(collection_name, texts, embeddings, ids, metadatas)
+
+        # Update full-text search index
+        database.execute_query(
+            "INSERT OR REPLACE INTO media_fts (rowid, title, content) SELECT id, title, content FROM Media WHERE id = ?",
+            (media_id,)
+        )
+
+        logger.info(f"Finished processing and storing content for media_id {media_id}")
+
+    except Exception as e:
+        logger.error(f"Error in process_and_store_content for media_id {media_id}: {str(e)}")
+        raise
+
+# Usage example:
+# process_and_store_content(db, content, "my_collection", 1, "example.txt", create_embeddings=True, create_summary=True, api_name="gpt-3.5-turbo")
+
+
+def check_embedding_status(selected_item, item_mapping):
+    if not selected_item:
+        return "Please select an item", ""
+
+    try:
+        item_id = item_mapping.get(selected_item)
+        if item_id is None:
+            return f"Invalid item selected: {selected_item}", ""
+
+        item_title = selected_item.rsplit(' (', 1)[0]
+        collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
+
+        result = collection.get(ids=[f"doc_{item_id}"], include=["embeddings", "metadatas"])
+        logging.info(f"ChromaDB result for item '{item_title}' (ID: {item_id}): {result}")
+
+        if not result['ids']:
+            return f"No embedding found for item '{item_title}' (ID: {item_id})", ""
+
+        if not result['embeddings'] or not result['embeddings'][0]:
+            return f"Embedding data missing for item '{item_title}' (ID: {item_id})", ""
+
+        embedding = result['embeddings'][0]
+        metadata = result['metadatas'][0] if result['metadatas'] else {}
+        embedding_preview = str(embedding[:50])
+        status = f"Embedding exists for item '{item_title}' (ID: {item_id})"
+        return status, f"First 50 elements of embedding:\n{embedding_preview}\n\nMetadata: {metadata}"
+
+    except Exception as e:
+        logging.error(f"Error in check_embedding_status: {str(e)}")
+        return f"Error processing item: {selected_item}. Details: {str(e)}", ""
+
+def reset_chroma_collection(collection_name: str):
+    try:
+        chroma_client.delete_collection(collection_name)
+        chroma_client.create_collection(collection_name)
+        logging.info(f"Reset ChromaDB collection: {collection_name}")
+    except Exception as e:
+        logging.error(f"Error resetting ChromaDB collection: {str(e)}")
+
+
+def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], metadatas: List[Dict[str, Any]]):
+    try:
+        collection = chroma_client.get_or_create_collection(name=collection_name)
+
+        # Log the inputs for debugging
+        logging.debug(f"Storing in ChromaDB - Collection: {collection_name}")
+        logging.debug(f"Texts (first 100 chars): {texts[0][:100]}...")
+        logging.debug(f"Embeddings (first 5 values): {embeddings[0][:5]}")
+        logging.debug(f"IDs: {ids}")
+        logging.debug(f"Metadatas: {metadatas}")
+
+        # Use upsert instead of add/update
+        collection.upsert(
+            documents=texts,
+            embeddings=embeddings,
+            ids=ids,
+            metadatas=metadatas
+        )
+
+        # Verify storage
+        for doc_id in ids:
+            result = collection.get(ids=[doc_id], include=["embeddings"])
+            if not result['embeddings'] or result['embeddings'][0] is None:
+                logging.error(f"Failed to store embedding for {doc_id}")
+            else:
+                logging.info(f"Embedding stored successfully for {doc_id}")
+
+    except Exception as e:
+        logging.error(f"Error storing embeddings in ChromaDB: {str(e)}")
+        raise
+
+
+# Function to perform vector search using ChromaDB + Keywords from the media_db
+def vector_search(collection_name: str, query: str, k: int = 10) -> List[Dict[str, Any]]:
+    try:
+        query_embedding = create_embedding(query, embedding_provider, embedding_model, embedding_api_url)
+        collection = chroma_client.get_collection(name=collection_name)
+        results = collection.query(
+            query_embeddings=[query_embedding],
+            n_results=k,
+            include=["documents", "metadatas"]
+        )
+        return [{"content": doc, "metadata": meta} for doc, meta in zip(results['documents'][0], results['metadatas'][0])]
+    except Exception as e:
+        logging.error(f"Error in vector_search: {str(e)}")
+        raise
+
+def schedule_embedding(media_id: int, content: str, media_name: str, summary: str):
+    try:
+        chunks = chunk_for_embedding(content, media_name, summary, chunk_options)
+        texts = [chunk['text'] for chunk in chunks]
+        embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
+        ids = [f"{media_id}_chunk_{i}" for i in range(len(chunks))]
+        metadatas = [{
+            "media_id": str(media_id),
+            "chunk_index": i,
+            "total_chunks": len(chunks),
+            "start_index": chunk['metadata']['start_index'],
+            "end_index": chunk['metadata']['end_index'],
+            "file_name": media_name,
+            "relative_position": chunk['metadata']['relative_position']
+        } for i, chunk in enumerate(chunks)]
+
+        store_in_chroma("all_content_embeddings", texts, embeddings, ids, metadatas)
+
+    except Exception as e:
+        logging.error(f"Error scheduling embedding for media_id {media_id}: {str(e)}")
+
+
+# Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
+# def process_and_store_content(content: str, collection_name: str, media_id: int):
+#     # Process the content into chunks
+#     chunks = improved_chunking_process(content, chunk_options)
+#     texts = [chunk['text'] for chunk in chunks]
+#
+#     # Generate embeddings for each chunk
+#     embeddings = [create_embedding(text) for text in texts]
+#
+#     # Create unique IDs for each chunk using the media_id and chunk index
+#     ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]
+#
+#     # Store the texts, embeddings, and IDs in ChromaDB
+#     store_in_chroma(collection_name, texts, embeddings, ids)
+#
+#     # Store the chunk metadata in SQLite
+#     for i, chunk in enumerate(chunks):
+#         add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i])
+#
+#     # Update the FTS table
+#     update_fts_for_media(media_id)
+
+
+#
+# End of Functions for ChromaDB
 #######################################################################################################################
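For orientation, a minimal sketch of how the new vector_search helper could be driven once a collection has been populated; the collection name, query string, and print formatting below are illustrative and not part of this commit:

# Hypothetical usage sketch for vector_search() (not part of the committed file).
from App_Function_Libraries.RAG.ChromaDB_Library import vector_search

# Assumes "all_content_embeddings" was filled earlier, e.g. via
# process_and_store_content(db, content, "all_content_embeddings", 1, "example.txt", create_embeddings=True)
hits = vector_search("all_content_embeddings", "What changed in the latest release?", k=5)
for hit in hits:
    # Each hit pairs the stored chunk text with its metadata dict (media_id, chunk_index, file_name, ...)
    print(hit["metadata"].get("file_name"), "-", hit["content"][:80])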
App_Function_Libraries/RAG/Embeddings_Create.py
CHANGED
@@ -1,168 +1,224 @@
-# Embeddings_Create.py
-# Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
-#
-# Imports:
-import logging
-from App_Function_Libraries.Utils.Utils import load_comprehensive_config
1 |
+
# Embeddings_Create.py
|
2 |
+
# Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers
|
3 |
+
#
|
4 |
+
# Imports:
|
5 |
+
import logging
|
6 |
+
import time
|
7 |
+
from functools import wraps
|
8 |
+
from threading import Lock, Timer
|
9 |
+
from typing import List
|
10 |
+
#
|
11 |
+
# 3rd-Party Imports:
|
12 |
+
import requests
|
13 |
+
from transformers import AutoTokenizer, AutoModel
|
14 |
+
import torch
|
15 |
+
#
|
16 |
+
# Local Imports:
|
17 |
+
from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
|
18 |
+
from App_Function_Libraries.Utils.Utils import load_comprehensive_config
|
19 |
+
#
|
20 |
+
#######################################################################################################################
|
21 |
+
#
|
22 |
+
# Functions:
|
23 |
+
|
24 |
+
# FIXME - Add all globals to summarize.py
|
25 |
+
loaded_config = load_comprehensive_config()
|
26 |
+
embedding_provider = loaded_config['Embeddings']['embedding_provider']
|
27 |
+
embedding_model = loaded_config['Embeddings']['embedding_model']
|
28 |
+
embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
|
29 |
+
embedding_api_key = loaded_config['Embeddings']['embedding_api_key']
|
30 |
+
|
31 |
+
# Embedding Chunking Settings
|
32 |
+
chunk_size = loaded_config['Embeddings']['chunk_size']
|
33 |
+
overlap = loaded_config['Embeddings']['overlap']
|
34 |
+
|
35 |
+
|
36 |
+
# FIXME - Add logging
|
37 |
+
|
38 |
+
|
39 |
+
class HuggingFaceEmbedder:
|
40 |
+
def __init__(self, model_name, timeout_seconds=120): # Default timeout of 2 minutes
|
41 |
+
self.model_name = model_name
|
42 |
+
self.tokenizer = None
|
43 |
+
self.model = None
|
44 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
45 |
+
self.timeout_seconds = timeout_seconds
|
46 |
+
self.last_used_time = 0
|
47 |
+
self.unload_timer = None
|
48 |
+
|
49 |
+
def load_model(self):
|
50 |
+
if self.model is None:
|
51 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
52 |
+
self.model = AutoModel.from_pretrained(self.model_name)
|
53 |
+
self.model.to(self.device)
|
54 |
+
self.last_used_time = time.time()
|
55 |
+
self.reset_timer()
|
56 |
+
|
57 |
+
def unload_model(self):
|
58 |
+
if self.model is not None:
|
59 |
+
del self.model
|
60 |
+
del self.tokenizer
|
61 |
+
if torch.cuda.is_available():
|
62 |
+
torch.cuda.empty_cache()
|
63 |
+
self.model = None
|
64 |
+
self.tokenizer = None
|
65 |
+
if self.unload_timer:
|
66 |
+
self.unload_timer.cancel()
|
67 |
+
|
68 |
+
def reset_timer(self):
|
69 |
+
if self.unload_timer:
|
70 |
+
self.unload_timer.cancel()
|
71 |
+
self.unload_timer = Timer(self.timeout_seconds, self.unload_model)
|
72 |
+
self.unload_timer.start()
|
73 |
+
|
74 |
+
def create_embeddings(self, texts):
|
75 |
+
self.load_model()
|
76 |
+
inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
77 |
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
78 |
+
with torch.no_grad():
|
79 |
+
outputs = self.model(**inputs)
|
80 |
+
embeddings = outputs.last_hidden_state.mean(dim=1)
|
81 |
+
return embeddings.cpu().numpy()
|
82 |
+
|
83 |
+
# Global variable to hold the embedder
|
84 |
+
huggingface_embedder = None
|
85 |
+
|
86 |
+
|
87 |
+
class RateLimiter:
|
88 |
+
def __init__(self, max_calls, period):
|
89 |
+
self.max_calls = max_calls
|
90 |
+
self.period = period
|
91 |
+
self.calls = []
|
92 |
+
self.lock = Lock()
|
93 |
+
|
94 |
+
def __call__(self, func):
|
95 |
+
def wrapper(*args, **kwargs):
|
96 |
+
with self.lock:
|
97 |
+
now = time.time()
|
98 |
+
self.calls = [call for call in self.calls if call > now - self.period]
|
99 |
+
if len(self.calls) >= self.max_calls:
|
100 |
+
sleep_time = self.calls[0] - (now - self.period)
|
101 |
+
time.sleep(sleep_time)
|
102 |
+
+            self.calls.append(time.time())
+            return func(*args, **kwargs)
+        return wrapper
+
+
+def exponential_backoff(max_retries=5, base_delay=1):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            for attempt in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    if attempt == max_retries - 1:
+                        raise
+                    delay = base_delay * (2 ** attempt)
+                    logging.warning(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds. Error: {str(e)}")
+                    time.sleep(delay)
+        return wrapper
+    return decorator
+
+
+# FIXME - refactor/setup to use config file & perform chunking
+@exponential_backoff()
+@RateLimiter(max_calls=50, period=60)  # Adjust these values based on API limits
+def create_embeddings_batch(texts: List[str], provider: str, model: str, api_url: str, timeout_seconds: int = 300) -> \
+        List[List[float]]:
+    global huggingface_embedder
+
+    if provider.lower() == 'huggingface':
+        if huggingface_embedder is None or huggingface_embedder.model_name != model:
+            if huggingface_embedder is not None:
+                huggingface_embedder.unload_model()
+            huggingface_embedder = HuggingFaceEmbedder(model, timeout_seconds)
+
+        embeddings = huggingface_embedder.create_embeddings(texts).tolist()
+        return embeddings
+
+    elif provider.lower() == 'openai':
+        logging.debug(f"Creating embeddings for {len(texts)} texts using OpenAI API")
+        return [create_openai_embedding(text, model) for text in texts]
+
+    elif provider.lower() == 'local':
+        response = requests.post(
+            api_url,
+            json={"texts": texts, "model": model},
+            headers={"Authorization": f"Bearer {embedding_api_key}"}
+        )
+        if response.status_code == 200:
+            return response.json()['embeddings']
+        else:
+            raise Exception(f"Error from local API: {response.text}")
+    else:
+        raise ValueError(f"Unsupported embedding provider: {provider}")
+
+def create_embedding(text: str, provider: str, model: str, api_url: str) -> List[float]:
+    return create_embeddings_batch([text], provider, model, api_url)[0]
+
+# FIXME
+def create_stella_embeddings(text: str) -> List[float]:
+    if embedding_provider == 'local':
+        # Load the model and tokenizer
+        tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
+        model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")
+
+        # Tokenize and encode the text
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+
+        # Generate embeddings
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Use the mean of the last hidden state as the sentence embedding
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+
+        return embeddings[0].tolist()  # Convert to list for consistency
+    elif embedding_provider == 'openai':
+        return get_openai_embeddings(text, embedding_model)
+    else:
+        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
+
+
+def create_openai_embedding(text: str, model: str) -> List[float]:
+    embedding = get_openai_embeddings(text, model)
+    return embedding
+
+
+
+
+# Dead
+# def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
+#     response = requests.post(
+#         api_url,
+#         json={"text": text, "model": model},
+#         headers={"Authorization": f"Bearer {api_key}"}
+#     )
+#     response.raise_for_status()
+#     return response.json().get('embedding', None)
+
+# Dead
+# def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
+#     response = requests.post(
+#         api_url,
+#         json={"input": text}
+#     )
+#     response.raise_for_status()
+#     return response.json()['embedding']
+
+# Dead
+# def create_huggingface_embedding(text: str, model: str) -> List[float]:
+#     tokenizer = AutoTokenizer.from_pretrained(model)
+#     model = AutoModel.from_pretrained(model)
+#
+#     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+#     with torch.no_grad():
+#         outputs = model(**inputs)
+#
+#     embeddings = outputs.last_hidden_state.mean(dim=1)
+#     return embeddings[0].tolist()
+
+#
+# End of File.
+#######################################################################################################################
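
As a quick reference, here is a minimal usage sketch for the batch embedding helper added above; the provider, model name, and call values are illustrative assumptions rather than part of this commit.

# Hypothetical usage sketch - provider/model values are assumptions, not part of this commit.
from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch, create_embedding

texts = ["First chunk of a document.", "Second chunk of a document."]
# The decorators above wrap this call with exponential-backoff retries and a 50-calls-per-60-seconds rate limit.
vectors = create_embeddings_batch(texts, provider="openai", model="text-embedding-3-small", api_url="")
query_vector = create_embedding("standalone query text", provider="openai", model="text-embedding-3-small", api_url="")
print(len(vectors), len(query_vector))
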
App_Function_Libraries/RAG/RAG_Libary_2.py
CHANGED
@@ -1,210 +1,340 @@
-# RAG_Library_2.py
-# Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline.
-#
-# Import necessary modules and functions
-import configparser
-import logging
-import os
-from typing import Dict, Any, List, Optional
-# Local Imports
-
-from App_Function_Libraries.Article_Extractor_Lib import scrape_article
-from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
-    fetch_keywords_for_media
-from App_Function_Libraries.Utils.Utils import load_comprehensive_config
-#
-# 3rd-Party Imports
-import openai
-#
-########################################################################################################################
-#
-# Functions:
-
-# Initialize OpenAI client (adjust this based on your API key management)
-openai.api_key = "your-openai-api-key"
-
-# Get the directory of the current script
-current_dir = os.path.dirname(os.path.abspath(__file__))
-# Construct the path to the config file
-config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
-# Read the config file
-config = configparser.ConfigParser()
-# Read the configuration file
-config.read('config.txt')
+# RAG_Library_2.py
+# Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline.
+#
+# Import necessary modules and functions
+import configparser
+import logging
+import os
+from typing import Dict, Any, List, Optional
+# Local Imports
+from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
+from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
+from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
+    fetch_keywords_for_media
+from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+#
+# 3rd-Party Imports
+import openai
+#
+########################################################################################################################
+#
+# Functions:
+
+# Initialize OpenAI client (adjust this based on your API key management)
+openai.api_key = "your-openai-api-key"
+
+# Get the directory of the current script
+current_dir = os.path.dirname(os.path.abspath(__file__))
+# Construct the path to the config file
+config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
+# Read the config file
+config = configparser.ConfigParser()
+# Read the configuration file
+config.read('config.txt')
+
+# Main RAG pipeline function
+def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
+    try:
+        # Extract content
+        try:
+            article_data = scrape_article(url)
+            content = article_data['content']
+            title = article_data['title']
+        except Exception as e:
+            logging.error(f"Error scraping article: {str(e)}")
+            return {"error": "Failed to scrape article", "details": str(e)}
+
+        # Store the article in the database and get the media_id
+        try:
+            media_id = add_media_to_database(url, title, 'article', content)
+        except Exception as e:
+            logging.error(f"Error adding article to database: {str(e)}")
+            return {"error": "Failed to store article in database", "details": str(e)}
+
+        # Process and store content
+        collection_name = f"article_{media_id}"
+        try:
+            # FIXME
+            # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
+            #                               create_embeddings: bool = False, create_summary: bool = False,
+            #                               api_name: str = None):
+            process_and_store_content(content, collection_name, media_id, title)
+        except Exception as e:
+            logging.error(f"Error processing and storing content: {str(e)}")
+            return {"error": "Failed to process and store content", "details": str(e)}
+
+        # Perform searches
+        try:
+            vector_results = vector_search(collection_name, query, k=5)
+            fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+        except Exception as e:
+            logging.error(f"Error performing searches: {str(e)}")
+            return {"error": "Failed to perform searches", "details": str(e)}
+
+        # Combine results with error handling for missing 'content' key
+        all_results = []
+        for result in vector_results + fts_results:
+            if isinstance(result, dict) and 'content' in result:
+                all_results.append(result['content'])
+            else:
+                logging.warning(f"Unexpected result format: {result}")
+                all_results.append(str(result))
+
+        context = "\n".join(all_results)
+
+        # Generate answer using the selected API
+        try:
+            answer = generate_answer(api_choice, context, query)
+        except Exception as e:
+            logging.error(f"Error generating answer: {str(e)}")
+            return {"error": "Failed to generate answer", "details": str(e)}
+
+        return {
+            "answer": answer,
+            "context": context
+        }
+
+    except Exception as e:
+        logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
+        return {"error": "An unexpected error occurred", "details": str(e)}
+
+
+
+# RAG Search with keyword filtering
+def enhanced_rag_pipeline(query: str, api_choice: str, keywords: str = None) -> Dict[str, Any]:
+    try:
+        # Load embedding provider from config, or fallback to 'openai'
+        embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
+
+        # Log the provider used
+        logging.debug(f"Using embedding provider: {embedding_provider}")
+
+        # Process keywords if provided
+        keyword_list = [k.strip().lower() for k in keywords.split(',')] if keywords else []
+        logging.debug(f"enhanced_rag_pipeline - Keywords: {keyword_list}")
+
+        # Fetch relevant media IDs based on keywords if keywords are provided
+        relevant_media_ids = fetch_relevant_media_ids(keyword_list) if keyword_list else None
+        logging.debug(f"enhanced_rag_pipeline - relevant media IDs: {relevant_media_ids}")
+
+        # Perform vector search
+        vector_results = perform_vector_search(query, relevant_media_ids)
+        logging.debug(f"enhanced_rag_pipeline - Vector search results: {vector_results}")
+
+        # Perform full-text search
+        fts_results = perform_full_text_search(query, relevant_media_ids)
+        logging.debug(f"enhanced_rag_pipeline - Full-text search results: {fts_results}")
+
+        # Combine results
+        all_results = vector_results + fts_results
+        # FIXME
+        if not all_results:
+            logging.info(f"No results found. Query: {query}, Keywords: {keywords}")
+            return {
+                "answer": "I couldn't find any relevant information based on your query and keywords.",
+                "context": ""
+            }
+
+        # FIXME - Apply Re-Ranking of results here
+        apply_re_ranking = False
+        if apply_re_ranking:
+            # Implement re-ranking logic here
+            pass
+        # Extract content from results
+        context = "\n".join([result['content'] for result in all_results[:10]])  # Limit to top 10 results
+        logging.debug(f"Context length: {len(context)}")
+        logging.debug(f"Context: {context[:200]}")
+        # Generate answer using the selected API
+        answer = generate_answer(api_choice, context, query)
+
+        return {
+            "answer": answer,
+            "context": context
+        }
+    except Exception as e:
+        logging.error(f"Error in enhanced_rag_pipeline: {str(e)}")
+        return {
+            "answer": "An error occurred while processing your request.",
+            "context": ""
+        }
+
+
+def generate_answer(api_choice: str, context: str, query: str) -> str:
+    logging.debug("Entering generate_answer function")
+    config = load_comprehensive_config()
+    logging.debug(f"Config sections: {config.sections()}")
+    prompt = f"Context: {context}\n\nQuestion: {query}"
+    if api_choice == "OpenAI":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai
+        return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
+    elif api_choice == "Anthropic":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_anthropic
+        return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
+    elif api_choice == "Cohere":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_cohere
+        return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
+    elif api_choice == "Groq":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_groq
+        return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
+    elif api_choice == "OpenRouter":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openrouter
+        return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
+    elif api_choice == "HuggingFace":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_huggingface
+        return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
+    elif api_choice == "DeepSeek":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_deepseek
+        return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
+    elif api_choice == "Mistral":
+        from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_mistral
+        return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
+    elif api_choice == "Local-LLM":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_local_llm
+        return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
+    elif api_choice == "Llama.cpp":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama
+        return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
+    elif api_choice == "Kobold":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_kobold
+        return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
+    elif api_choice == "Ooba":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_oobabooga
+        return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
+    elif api_choice == "TabbyAPI":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_tabbyapi
+        return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
+    elif api_choice == "vLLM":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_vllm
+        return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
+    elif api_choice == "ollama":
+        from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_ollama
+        return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
+    else:
+        raise ValueError(f"Unsupported API choice: {api_choice}")
+
+# Function to preprocess and store all existing content in the database
+def preprocess_all_content():
+    unprocessed_media = get_unprocessed_media()
+    for row in unprocessed_media:
+        media_id = row[0]
+        content = row[1]
+        media_type = row[2]
+        collection_name = f"{media_type}_{media_id}"
+        # FIXME
+        # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
+        #                               create_embeddings: bool = False, create_summary: bool = False,
+        #                               api_name: str = None):
+        process_and_store_content(content, collection_name, media_id, "")
+
+
+def perform_vector_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
+    all_collections = chroma_client.list_collections()
+    vector_results = []
+    for collection in all_collections:
+        collection_results = vector_search(collection.name, query, k=5)
+        filtered_results = [
+            result for result in collection_results
+            if relevant_media_ids is None or result['metadata'].get('media_id') in relevant_media_ids
+        ]
+        vector_results.extend(filtered_results)
+    return vector_results
+
+
+def perform_full_text_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
+    fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+    filtered_fts_results = [
+        {
+            "content": result['content'],
+            "metadata": {"media_id": result['id']}
+        }
+        for result in fts_results
+        if relevant_media_ids is None or result['id'] in relevant_media_ids
+    ]
+    return filtered_fts_results
+
+
+def fetch_relevant_media_ids(keywords: List[str]) -> List[int]:
+    relevant_ids = set()
+    try:
+        for keyword in keywords:
+            media_ids = fetch_keywords_for_media(keyword)
+            relevant_ids.update(media_ids)
+    except Exception as e:
+        logging.error(f"Error fetching relevant media IDs: {str(e)}")
+    return list(relevant_ids)
+
+
+def filter_results_by_keywords(results: List[Dict[str, Any]], keywords: List[str]) -> List[Dict[str, Any]]:
+    if not keywords:
+        return results
+
+    filtered_results = []
+    for result in results:
+        try:
+            metadata = result.get('metadata', {})
+            if metadata is None:
+                logging.warning(f"No metadata found for result: {result}")
+                continue
+            if not isinstance(metadata, dict):
+                logging.warning(f"Unexpected metadata type: {type(metadata)}. Expected dict.")
+                continue
+
+            media_id = metadata.get('media_id')
+            if media_id is None:
+                logging.warning(f"No media_id found in metadata: {metadata}")
+                continue
+
+            media_keywords = fetch_keywords_for_media(media_id)
+            if any(keyword.lower() in [mk.lower() for mk in media_keywords] for keyword in keywords):
+                filtered_results.append(result)
+        except Exception as e:
+            logging.error(f"Error processing result: {result}. Error: {str(e)}")
+
+    return filtered_results
+
+# FIXME: to be implememted
+def extract_media_id_from_result(result: str) -> Optional[int]:
+    # Implement this function based on how you store the media_id in your results
+    # For example, if it's stored at the beginning of each result:
+    try:
+        return int(result.split('_')[0])
+    except (IndexError, ValueError):
+        logging.error(f"Failed to extract media_id from result: {result}")
+        return None
+
+
+
+
+# Example usage:
+# 1. Initialize the system:
+# create_tables(db) # Ensure FTS tables are set up
+#
+# 2. Create ChromaDB
+# chroma_client = ChromaDBClient()
+#
+# 3. Create Embeddings
+# Store embeddings in ChromaDB
+# preprocess_all_content() or create_embeddings()
+#
+# 4. Perform RAG search across all content:
+# result = rag_search("What are the key points about climate change?")
+# print(result['answer'])
+#
+# (Extra)5. Perform RAG on a specific URL:
+# result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
+# print(result['answer'])
+#
+########################################################################################################################
+
+
+############################################################################################################
+#
+# ElasticSearch Retriever
+
+# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
+#
+# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query
+
+#
+# End of RAG_Library_2.py
+############################################################################################################
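
A short, hypothetical driver for the two pipeline entry points defined above; the URL, query strings, keyword list, and API choice are placeholders rather than values from this commit.

# Hypothetical usage sketch - URL, queries, and api_choice are placeholders.
from App_Function_Libraries.RAG.RAG_Libary_2 import rag_pipeline, enhanced_rag_pipeline

# Scrape, store, and answer a question about a single page.
single = rag_pipeline("https://example.com/article", "What is the main topic of this article?", api_choice="OpenAI")
print(single.get("answer"))

# Keyword-filtered RAG over media already in the database.
filtered = enhanced_rag_pipeline("What are the key points about climate change?", "OpenAI", keywords="climate,energy")
print(filtered.get("answer"))
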
App_Function_Libraries/Summarization/Summarization_General_Lib.py
CHANGED
@@ -710,11 +710,9 @@ def summarize_with_openrouter(api_key, input_data, custom_prompt_arg, temp=None,
 
 def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None):
     loaded_config_data = load_and_log_configs()
-    global huggingface_api_key
     logging.debug("HuggingFace: Summarization process starting...")
     try:
         logging.debug("HuggingFace: Loading and validating configurations")
-        loaded_config_data = load_and_log_configs()
         if loaded_config_data is None:
             logging.error("Failed to load configuration data")
             huggingface_api_key = None
@@ -726,6 +724,7 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None
         else:
             # If no parameter is provided, use the key from the config
             huggingface_api_key = loaded_config_data['api_keys'].get('huggingface')
+            logging.debug(f"HuggingFace: API key from config: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")
             if huggingface_api_key:
                 logging.info("HuggingFace: Using API key from config file")
             else:
@@ -738,7 +737,6 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None
         # FIXME
         # For example: raise ValueError("No valid Anthropic API key available")
 
-
         logging.debug(f"HuggingFace: Using API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")
 
         if isinstance(input_data, str) and os.path.isfile(input_data):
@@ -775,21 +773,24 @@ def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None
         if temp is None:
             temp = 0.1
         temp = float(temp)
-        huggingface_prompt = f"{
+        huggingface_prompt = f"{custom_prompt_arg}\n\n\n{text}"
         logging.debug("huggingface: Prompt being sent is {huggingface_prompt}")
         data = {
-            "inputs":
-            "
+            "inputs": huggingface_prompt,
+            "max_tokens": 4096,
+            "stream": False,
+            "temperature": temp
         }
 
         logging.debug("huggingface: Submitting request...")
         response = requests.post(API_URL, headers=headers, json=data)
 
         if response.status_code == 200:
+            print(response.json())
+            chat_response = response.json()[0]['generated_text'].strip()
             logging.debug("huggingface: Summarization successful")
-            print("
-            return
+            print("Chat request successful.")
+            return chat_response
         else:
             logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
             return f"Failed to process summary, status code {response.status_code}: {response.text}"
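
For context, a hedged sketch of how the reworked HuggingFace summarizer might be invoked; passing None for the key relies on the config fallback shown above, and the sample text and prompt are placeholders.

# Hypothetical usage sketch - input text and prompt are placeholders.
from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_huggingface

summary = summarize_with_huggingface(
    api_key=None,  # falls back to the key loaded from config, per the branch above
    input_data="Long transcript text to condense...",
    custom_prompt_arg="Summarize the following text:",
    temp=0.1,
)
print(summary)
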
App_Function_Libraries/Utils/Utils.py
CHANGED
@@ -18,6 +18,7 @@
 #
 #
 ####################
+#
 # Import necessary libraries
 import configparser
 import hashlib
@@ -29,14 +30,15 @@ import time
 from datetime import timedelta
 from typing import Union, AnyStr
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
-
+#
+# Non-Local Imports
 import requests
 import unicodedata
 from tqdm import tqdm
-
+#
 #######################################################################################################################
-# Function Definitions
 #
+# Function Definitions
 
 def extract_text_from_segments(segments):
     logging.debug(f"Segments received: {segments}")
@@ -63,10 +65,6 @@ def extract_text_from_segments(segments):
     logging.error(f"Unable to extract text from segments: {segments}")
     return "Error: Unable to extract transcription"
 
-def import_data(file):
-    # Implement this function to import data from a file
-    pass
-
 #
 #
 #######################
@@ -124,23 +122,35 @@ def load_comprehensive_config():
 
 
 def get_project_root():
+    # Get the directory of the current file (Utils.py)
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    # Go up two levels to reach the project root
+    # Assuming the structure is: project_root/App_Function_Libraries/Utils/Utils.py
+    project_root = os.path.dirname(os.path.dirname(current_dir))
+    return project_root
 
 def get_database_dir():
+    """Get the database directory (/tldw/Databases/)."""
+    db_dir = os.path.join(get_project_root(), 'Databases')
+    logging.debug(f"Database directory: {db_dir}")
+    return db_dir
 
 def get_database_path(db_name: Union[str, os.PathLike[AnyStr]]) -> str:
     """Get the full path for a database file."""
+    path = os.path.join(get_database_dir(), str(db_name))
+    logging.debug(f"Database path for {db_name}: {path}")
+    return path
 
 def get_project_relative_path(relative_path: Union[str, os.PathLike[AnyStr]]) -> str:
     """Convert a relative path to a path relative to the project root."""
+    path = os.path.join(get_project_root(), str(relative_path))
+    logging.debug(f"Project relative path for {relative_path}: {path}")
+    return path
 
 def get_chromadb_path():
+    path = os.path.join(get_project_root(), 'Databases', 'chroma_db')
+    logging.debug(f"ChromaDB path: {path}")
+    return path
 
 def ensure_directory_exists(path):
     """Ensure that a directory exists, creating it if necessary."""
@@ -676,3 +686,18 @@ def get_db_config():
 #
 # End of DB Config Loading
 #######################################################################################################################
+
+def format_text_with_line_breaks(text):
+    # Split the text into sentences and add line breaks
+    sentences = text.replace('. ', '.<br>').replace('? ', '?<br>').replace('! ', '!<br>')
+    return sentences
+
+#######################################################################################################################
+#
+# File Handling Functions
+
+
+
+#
+# End of File Handling Functions
+#######################################################################################################################
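
A small, assumed example of the new path helpers in Utils.py; the database filename is illustrative and not part of this commit.

# Hypothetical usage sketch - 'media_summary.db' is an illustrative filename.
from App_Function_Libraries.Utils.Utils import get_database_path, get_chromadb_path, ensure_directory_exists

media_db = get_database_path("media_summary.db")  # <project_root>/Databases/media_summary.db
chroma_dir = get_chromadb_path()                  # <project_root>/Databases/chroma_db
ensure_directory_exists(chroma_dir)
print(media_db, chroma_dir)
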
App_Function_Libraries/Utils/__init__.py
CHANGED
@@ -1,5 +0,0 @@
-from .Utils import is_valid_url, load_and_log_configs, extract_text_from_segments, load_comprehensive_config, format_metadata_as_text, convert_to_seconds, save_to_file,\
-    save_segments_to_json, download_file, create_download_directory, safe_read_file, generate_unique_filename, generate_unique_identifier, is_valid_url, verify_checksum,\
-    normalize_title, clean_youtube_url, sanitize_filename, format_transcription, format_file_path, get_db_config
-
-downloaded_files = []