Spaces:

oceansweep
/

tldw

Running

App Files Files Community

oceansweep committed on Sep 27, 2024

Commit

a324812

verified ·

1 Parent(s): 04171c3

Upload 3 files

Browse files

Files changed (3) hide show

App_Function_Libraries/RAG/ChromaDB_Library.py +87 -16
App_Function_Libraries/RAG/Embeddings_Create.py +1 -35
App_Function_Libraries/RAG/RAG_Libary_2.py +103 -99

App_Function_Libraries/RAG/ChromaDB_Library.py CHANGED Viewed

@@ -11,6 +11,7 @@ from itertools import islice
 #
 # Local Imports:
 from App_Function_Libraries.Chunk_Lib import chunk_for_embedding, chunk_options
 from App_Function_Libraries.DB.SQLite_DB import process_chunks
 from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
 # FIXME - related to Chunking
@@ -47,6 +48,40 @@ embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
 #
 # Functions:
 def batched(iterable, n):
     "Batch data into lists of length n. The last batch may be shorter."
     it = iter(iterable)
@@ -57,27 +92,55 @@ def batched(iterable, n):
         yield batch
-# FIXME - Fix summarization of entire document/storing in chunk issue
 # FIXME - update all uses to reflect 'api_name' parameter
 def process_and_store_content(database, content: str, collection_name: str, media_id: int, file_name: str,
-                              create_embeddings: bool = False, create_summary: bool = False, api_name: str = None,
-                              chunk_options: Dict = None, embedding_provider: str = None,
                               embedding_model: str = None, embedding_api_url: str = None):
     try:
         logger.info(f"Processing content for media_id {media_id} in collection {collection_name}")
-        full_summary = None
-        if create_summary and api_name:
-            full_summary = summarize(content, None, api_name, None, None, None)
-        chunks = chunk_for_embedding(content, file_name, full_summary, chunk_options)
         # Process chunks synchronously
         process_chunks(database, chunks, media_id)
         if create_embeddings:
-            texts = [chunk['text'] for chunk in chunks]
-            embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
             ids = [f"{media_id}_chunk_{i}" for i in range(1, len(chunks) + 1)]
             metadatas = [{
                 "media_id": str(media_id),
@@ -85,11 +148,17 @@ def process_and_store_content(database, content: str, collection_name: str, medi
                 "total_chunks": len(chunks),
                 "start_index": int(chunk['metadata']['start_index']),
                 "end_index": int(chunk['metadata']['end_index']),
-                "file_name": str(file_name),
-                "relative_position": float(chunk['metadata']['relative_position'])
             } for i, chunk in enumerate(chunks, 1)]
-            store_in_chroma(collection_name, texts, embeddings, ids, metadatas)
         # Update full-text search index
         database.execute_query(
@@ -168,11 +237,13 @@ def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[Lis
         # Verify storage
         for doc_id in ids:
-            result = collection.get(ids=[doc_id], include=["embeddings"])
             if not result['embeddings'] or result['embeddings'][0] is None:
                 logging.error(f"Failed to store embedding for {doc_id}")
             else:
                 logging.info(f"Embedding stored successfully for {doc_id}")
     except Exception as e:
         logging.error(f"Error storing embeddings in ChromaDB: {str(e)}")
@@ -194,9 +265,9 @@ def vector_search(collection_name: str, query: str, k: int = 10) -> List[Dict[st
         logging.error(f"Error in vector_search: {str(e)}")
         raise
-def schedule_embedding(media_id: int, content: str, media_name: str, summary: str):
     try:
-        chunks = chunk_for_embedding(content, media_name, summary, chunk_options)
         texts = [chunk['text'] for chunk in chunks]
         embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
         ids = [f"{media_id}_chunk_{i}" for i in range(len(chunks))]

 #
 # Local Imports:
 from App_Function_Libraries.Chunk_Lib import chunk_for_embedding, chunk_options
+from App_Function_Libraries.DB.DB_Manager import get_unprocessed_media, mark_media_as_processed
 from App_Function_Libraries.DB.SQLite_DB import process_chunks
 from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
 # FIXME - related to Chunking
 #
 # Functions:
+# Function to preprocess and store all existing content in the database
+def preprocess_all_content(database, create_contextualized=True, api_name="gpt-3.5-turbo"):
+    unprocessed_media = get_unprocessed_media(db=database)
+    total_media = len(unprocessed_media)
+    for index, row in enumerate(unprocessed_media, 1):
+        media_id, content, media_type, file_name = row
+        collection_name = f"{media_type}_{media_id}"
+        logger.info(f"Processing media {index} of {total_media}: ID {media_id}, Type {media_type}")
+        try:
+            process_and_store_content(
+                database=database,
+                content=content,
+                collection_name=collection_name,
+                media_id=media_id,
+                file_name=file_name or f"{media_type}_{media_id}",
+                create_embeddings=True,
+                create_contextualized=create_contextualized,
+                api_name=api_name
+            )
+            # Mark the media as processed in the database
+            mark_media_as_processed(database, media_id)
+            logger.info(f"Successfully processed media ID {media_id}")
+        except Exception as e:
+            logger.error(f"Error processing media ID {media_id}: {str(e)}")
+    logger.info("Finished preprocessing all unprocessed content")
 def batched(iterable, n):
     "Batch data into lists of length n. The last batch may be shorter."
     it = iter(iterable)
         yield batch
+def situate_context(api_name, doc_content: str, chunk_content: str) -> str:
+    doc_content_prompt = f"""
+    <document>
+    {doc_content}
+    </document>
+    """
+    chunk_context_prompt = f"""
+    \n\n\n\n\n
+    Here is the chunk we want to situate within the whole document
+    <chunk>
+    {chunk_content}
+    </chunk>
+    Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
+    Answer only with the succinct context and nothing else.
+    """
+    response = summarize(chunk_context_prompt, doc_content_prompt, api_name, api_key=None, temp=0, system_message=None)
+    return response
 # FIXME - update all uses to reflect 'api_name' parameter
 def process_and_store_content(database, content: str, collection_name: str, media_id: int, file_name: str,
+                              create_embeddings: bool = True, create_contextualized: bool = True, api_name: str = "gpt-3.5-turbo",
+                              chunk_options = None, embedding_provider: str = None,
                               embedding_model: str = None, embedding_api_url: str = None):
     try:
         logger.info(f"Processing content for media_id {media_id} in collection {collection_name}")
+        chunks = chunk_for_embedding(content, file_name, chunk_options)
         # Process chunks synchronously
         process_chunks(database, chunks, media_id)
         if create_embeddings:
+            texts = []
+            contextualized_chunks = []
+            for chunk in chunks:
+                chunk_text = chunk['text']
+                if create_contextualized:
+                    context = situate_context(api_name, content, chunk_text)
+                    contextualized_text = f"{chunk_text}\n\nContextual Summary: {context}"
+                    contextualized_chunks.append(contextualized_text)
+                else:
+                    contextualized_chunks.append(chunk_text)
+                texts.append(chunk_text)  # Store original text for database
+            embeddings = create_embeddings_batch(contextualized_chunks, embedding_provider, embedding_model, embedding_api_url)
             ids = [f"{media_id}_chunk_{i}" for i in range(1, len(chunks) + 1)]
             metadatas = [{
                 "media_id": str(media_id),
                 "total_chunks": len(chunks),
                 "start_index": int(chunk['metadata']['start_index']),
                 "end_index": int(chunk['metadata']['end_index']),
+                "file_name": str(chunk['metadata']['file_name']),
+                "relative_position": float(chunk['metadata']['relative_position']),
+                "contextualized": create_contextualized,
+                "original_text": chunk['text'],
+                "contextual_summary": contextualized_chunks[i-1].split("\n\nContextual Summary: ")[-1] if create_contextualized else ""
             } for i, chunk in enumerate(chunks, 1)]
+            store_in_chroma(collection_name, contextualized_chunks, embeddings, ids, metadatas)
+            # Mark the media as processed
+            mark_media_as_processed(database, media_id)
         # Update full-text search index
         database.execute_query(
         # Verify storage
         for doc_id in ids:
+            result = collection.get(ids=[doc_id], include=["documents", "embeddings", "metadatas"])
             if not result['embeddings'] or result['embeddings'][0] is None:
                 logging.error(f"Failed to store embedding for {doc_id}")
             else:
                 logging.info(f"Embedding stored successfully for {doc_id}")
+                logging.debug(f"Stored document: {result['documents'][0][:100]}...")
+                logging.debug(f"Stored metadata: {result['metadatas'][0]}")
     except Exception as e:
         logging.error(f"Error storing embeddings in ChromaDB: {str(e)}")
         logging.error(f"Error in vector_search: {str(e)}")
         raise
+def schedule_embedding(media_id: int, content: str, media_name: str):
     try:
+        chunks = chunk_for_embedding(content, media_name, chunk_options)
         texts = [chunk['text'] for chunk in chunks]
         embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
         ids = [f"{media_id}_chunk_{i}" for i in range(len(chunks))]

App_Function_Libraries/RAG/Embeddings_Create.py CHANGED Viewed

@@ -35,7 +35,6 @@ overlap = loaded_config['Embeddings']['overlap']
 # FIXME - Add logging
 class HuggingFaceEmbedder:
     def __init__(self, model_name, timeout_seconds=120):  # Default timeout of 2 minutes
         self.model_name = model_name
@@ -154,6 +153,7 @@ List[List[float]]:
     else:
         raise ValueError(f"Unsupported embedding provider: {provider}")
 def create_embedding(text: str, provider: str, model: str, api_url: str) -> List[float]:
     return create_embeddings_batch([text], provider, model, api_url)[0]
@@ -185,40 +185,6 @@ def create_openai_embedding(text: str, model: str) -> List[float]:
     embedding = get_openai_embeddings(text, model)
     return embedding
-#Dead
-# def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
-#     response = requests.post(
-#         api_url,
-#         json={"text": text, "model": model},
-#         headers={"Authorization": f"Bearer {api_key}"}
-#     )
-#     response.raise_for_status()
-#     return response.json().get('embedding', None)
-# Dead
-# def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
-#     response = requests.post(
-#         api_url,
-#         json={"input": text}
-#     )
-#     response.raise_for_status()
-#     return response.json()['embedding']
-# dead
-# def create_huggingface_embedding(text: str, model: str) -> List[float]:
-#     tokenizer = AutoTokenizer.from_pretrained(model)
-#     model = AutoModel.from_pretrained(model)
-#
-#     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-#
-#     embeddings = outputs.last_hidden_state.mean(dim=1)
-#     return embeddings[0].tolist()
 #
 # End of File.
 #######################################################################################################################

 # FIXME - Add logging
 class HuggingFaceEmbedder:
     def __init__(self, model_name, timeout_seconds=120):  # Default timeout of 2 minutes
         self.model_name = model_name
     else:
         raise ValueError(f"Unsupported embedding provider: {provider}")
 def create_embedding(text: str, provider: str, model: str, api_url: str) -> List[float]:
     return create_embeddings_batch([text], provider, model, api_url)[0]
     embedding = get_openai_embeddings(text, model)
     return embedding
 #
 # End of File.
 #######################################################################################################################

App_Function_Libraries/RAG/RAG_Libary_2.py CHANGED Viewed

@@ -9,8 +9,7 @@ from typing import Dict, Any, List, Optional
 # Local Imports
 from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
 from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
-from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
-    fetch_keywords_for_media
 from App_Function_Libraries.Utils.Utils import load_comprehensive_config
 #
 # 3rd-Party Imports
@@ -32,71 +31,79 @@ config = configparser.ConfigParser()
 # Read the configuration file
 config.read('config.txt')
-# Main RAG pipeline function
-def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
-    try:
-        # Extract content
-        try:
-            article_data = scrape_article(url)
-            content = article_data['content']
-            title = article_data['title']
-        except Exception as e:
-            logging.error(f"Error scraping article: {str(e)}")
-            return {"error": "Failed to scrape article", "details": str(e)}
-        # Store the article in the database and get the media_id
-        try:
-            media_id = add_media_to_database(url, title, 'article', content)
-        except Exception as e:
-            logging.error(f"Error adding article to database: {str(e)}")
-            return {"error": "Failed to store article in database", "details": str(e)}
-        # Process and store content
-        collection_name = f"article_{media_id}"
-        try:
-            # FIXME
-            # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
-            #                               create_embeddings: bool = False, create_summary: bool = False,
-            #                               api_name: str = None):
-            process_and_store_content(content, collection_name, media_id, title)
-        except Exception as e:
-            logging.error(f"Error processing and storing content: {str(e)}")
-            return {"error": "Failed to process and store content", "details": str(e)}
-        # Perform searches
-        try:
-            vector_results = vector_search(collection_name, query, k=5)
-            fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
-        except Exception as e:
-            logging.error(f"Error performing searches: {str(e)}")
-            return {"error": "Failed to perform searches", "details": str(e)}
-        # Combine results with error handling for missing 'content' key
-        all_results = []
-        for result in vector_results + fts_results:
-            if isinstance(result, dict) and 'content' in result:
-                all_results.append(result['content'])
-            else:
-                logging.warning(f"Unexpected result format: {result}")
-                all_results.append(str(result))
-        context = "\n".join(all_results)
-        # Generate answer using the selected API
-        try:
-            answer = generate_answer(api_choice, context, query)
-        except Exception as e:
-            logging.error(f"Error generating answer: {str(e)}")
-            return {"error": "Failed to generate answer", "details": str(e)}
-        return {
-            "answer": answer,
-            "context": context
-        }
-    except Exception as e:
-        logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
-        return {"error": "An unexpected error occurred", "details": str(e)}
@@ -213,21 +220,6 @@ def generate_answer(api_choice: str, context: str, query: str) -> str:
     else:
         raise ValueError(f"Unsupported API choice: {api_choice}")
-# Function to preprocess and store all existing content in the database
-def preprocess_all_content():
-    unprocessed_media = get_unprocessed_media()
-    for row in unprocessed_media:
-        media_id = row[0]
-        content = row[1]
-        media_type = row[2]
-        collection_name = f"{media_type}_{media_id}"
-        # FIXME
-        # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
-        #                               create_embeddings: bool = False, create_summary: bool = False,
-        #                               api_name: str = None):
-        process_and_store_content(content, collection_name, media_id, "")
 def perform_vector_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
     all_collections = chroma_client.list_collections()
     vector_results = []
@@ -303,30 +295,42 @@ def extract_media_id_from_result(result: str) -> Optional[int]:
         logging.error(f"Failed to extract media_id from result: {result}")
         return None
-# Example usage:
-# 1. Initialize the system:
-# create_tables(db)  # Ensure FTS tables are set up
 #
-# 2. Create ChromaDB
-# chroma_client = ChromaDBClient()
 #
-# 3. Create Embeddings
-# Store embeddings in ChromaDB
-# preprocess_all_content() or create_embeddings()
 #
-# 4. Perform RAG search across all content:
-# result = rag_search("What are the key points about climate change?")
-# print(result['answer'])
 #
-# (Extra)5. Perform RAG on a specific URL:
-# result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
-# print(result['answer'])
 #
-########################################################################################################################
 ############################################################################################################
 #

 # Local Imports
 from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
 from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
+from App_Function_Libraries.DB.DB_Manager import search_db, fetch_keywords_for_media
 from App_Function_Libraries.Utils.Utils import load_comprehensive_config
 #
 # 3rd-Party Imports
 # Read the configuration file
 config.read('config.txt')
+# RAG pipeline function for web scraping
+# def rag_web_scraping_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
+#     try:
+#         # Extract content
+#         try:
+#             article_data = scrape_article(url)
+#             content = article_data['content']
+#             title = article_data['title']
+#         except Exception as e:
+#             logging.error(f"Error scraping article: {str(e)}")
+#             return {"error": "Failed to scrape article", "details": str(e)}
+#
+#         # Store the article in the database and get the media_id
+#         try:
+#             media_id = add_media_to_database(url, title, 'article', content)
+#         except Exception as e:
+#             logging.error(f"Error adding article to database: {str(e)}")
+#             return {"error": "Failed to store article in database", "details": str(e)}
+#
+#         # Process and store content
+#         collection_name = f"article_{media_id}"
+#         try:
+#             # Assuming you have a database object available, let's call it 'db'
+#             db = get_database_connection()
+#
+#             process_and_store_content(
+#                 database=db,
+#                 content=content,
+#                 collection_name=collection_name,
+#                 media_id=media_id,
+#                 file_name=title,
+#                 create_embeddings=True,
+#                 create_contextualized=True,
+#                 api_name=api_choice
+#             )
+#         except Exception as e:
+#             logging.error(f"Error processing and storing content: {str(e)}")
+#             return {"error": "Failed to process and store content", "details": str(e)}
+#
+#         # Perform searches
+#         try:
+#             vector_results = vector_search(collection_name, query, k=5)
+#             fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+#         except Exception as e:
+#             logging.error(f"Error performing searches: {str(e)}")
+#             return {"error": "Failed to perform searches", "details": str(e)}
+#
+#         # Combine results with error handling for missing 'content' key
+#         all_results = []
+#         for result in vector_results + fts_results:
+#             if isinstance(result, dict) and 'content' in result:
+#                 all_results.append(result['content'])
+#             else:
+#                 logging.warning(f"Unexpected result format: {result}")
+#                 all_results.append(str(result))
+#
+#         context = "\n".join(all_results)
+#
+#         # Generate answer using the selected API
+#         try:
+#             answer = generate_answer(api_choice, context, query)
+#         except Exception as e:
+#             logging.error(f"Error generating answer: {str(e)}")
+#             return {"error": "Failed to generate answer", "details": str(e)}
+#
+#         return {
+#             "answer": answer,
+#             "context": context
+#         }
+#
+#     except Exception as e:
+#         logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
+#         return {"error": "An unexpected error occurred", "details": str(e)}
     else:
         raise ValueError(f"Unsupported API choice: {api_choice}")
 def perform_vector_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
     all_collections = chroma_client.list_collections()
     vector_results = []
         logging.error(f"Failed to extract media_id from result: {result}")
         return None
+#
+#
+########################################################################################################################
+# Function to preprocess and store all existing content in the database
+# def preprocess_all_content(database, create_contextualized=True, api_name="gpt-3.5-turbo"):
+#     unprocessed_media = get_unprocessed_media()
+#     total_media = len(unprocessed_media)
 #
+#     for index, row in enumerate(unprocessed_media, 1):
+#         media_id, content, media_type, file_name = row
+#         collection_name = f"{media_type}_{media_id}"
 #
+#         logger.info(f"Processing media {index} of {total_media}: ID {media_id}, Type {media_type}")
 #
+#         try:
+#             process_and_store_content(
+#                 database=database,
+#                 content=content,
+#                 collection_name=collection_name,
+#                 media_id=media_id,
+#                 file_name=file_name or f"{media_type}_{media_id}",
+#                 create_embeddings=True,
+#                 create_contextualized=create_contextualized,
+#                 api_name=api_name
+#             )
 #
+#             # Mark the media as processed in the database
+#             mark_media_as_processed(database, media_id)
 #
+#             logger.info(f"Successfully processed media ID {media_id}")
+#         except Exception as e:
+#             logger.error(f"Error processing media ID {media_id}: {str(e)}")
+#
+#     logger.info("Finished preprocessing all unprocessed content")
 ############################################################################################################
 #