Spaces:

spark-ds549
/

LibRAG

Running

Daniel Foley committed 28 days ago

Commit

364893a

1 Parent(s): a9e136f

Forgot to include everyone on last commit + old scripts

Co-authored By: Daniel dfoley3838@gmail.com
Co-authored By: Brandon bmv2021@bu.edu
Co-authored By: Enrico enricoll@bu.edu
Co-authored By: Jinanshi jinanshi@bu.edu

Files changed (8) hide show

old_scripts/app.py +213 -0
old_scripts/app1.1.py +85 -0
old_scripts/bpl_scraper.py +177 -0
old_scripts/faiss_migrate.ipynb +179 -0
old_scripts/new_streamlit.app +152 -0
old_scripts/new_streamlit.py +188 -0
old_scripts/streamlit-rag-app.py +185 -0
old_scripts/test_streamlit.py +15 -0

old_scripts/app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import os
+from typing import List
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.chains import (
+    ConversationalRetrievalChain,
+)
+from langchain.chat_models import ChatOpenAI
+from langchain.docstore.document import Document
+from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+import chainlit as cl
+os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+@cl.on_chat_start
+async def on_chat_start():
+    files = None
+    # Wait for the user to upload a file
+    while files == None:
+        files = await cl.AskFileMessage(
+            content="Please upload a text file to begin!",
+            accept=["text/plain"],
+            max_size_mb=20,
+            timeout=180,
+        ).send()
+    file = files[0]
+    msg = cl.Message(content=f"Processing `{file.name}`...")
+    await msg.send()
+    with open(file.path, "r", encoding="utf-8") as f:
+        text = f.read()
+    # Split the text into chunks
+    texts = text_splitter.split_text(text)
+    # Create a metadata for each chunk
+    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
+    # Create a Chroma vector store
+    embeddings = OpenAIEmbeddings()
+    docsearch = await cl.make_async(Chroma.from_texts)(
+        texts, embeddings, metadatas=metadatas
+    )
+    message_history = ChatMessageHistory()
+    memory = ConversationBufferMemory(
+        memory_key="chat_history",
+        output_key="answer",
+        chat_memory=message_history,
+        return_messages=True,
+    )
+    # Create a chain that uses the Chroma vector store
+    chain = ConversationalRetrievalChain.from_llm(
+        ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
+        chain_type="stuff",
+        retriever=docsearch.as_retriever(),
+        memory=memory,
+        return_source_documents=True,
+    )
+    # Let the user know that the system is ready
+    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
+    await msg.update()
+    cl.user_session.set("chain", chain)
+@cl.on_message
+async def main(message: cl.Message):
+    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
+    cb = cl.AsyncLangchainCallbackHandler()
+    res = await chain.acall(message.content, callbacks=[cb])
+    answer = res["answer"]
+    source_documents = res["source_documents"]  # type: List[Document]
+    text_elements = []  # type: List[cl.Text]
+    if source_documents:
+        for source_idx, source_doc in enumerate(source_documents):
+            source_name = f"source_{source_idx}"
+            # Create the text element referenced in the message
+            text_elements.append(
+                cl.Text(content=source_doc.page_content, name=source_name, display="side")
+            )
+        source_names = [text_el.name for text_el in text_elements]
+        if source_names:
+            answer += f"\nSources: {', '.join(source_names)}"
+        else:
+            answer += "\nNo sources found"
+    await cl.Message(content=answer, elements=text_elements).send()

old_scripts/app1.1.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from dotenv import load_dotenv  # Import dotenv to load environment variables
+import os
+import chainlit as cl
+from langchain.chains import RetrievalQA
+from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import Document
+from langchain.embeddings import HuggingFaceEmbeddings
+import json
+# Load environment variables from .env file
+load_dotenv()
+# Get the OpenAI API key from the environment
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    raise ValueError("OPENAI_API_KEY is not set. Please add it to your .env file.")
+# Global variables for vector store and QA chain
+vector_store = None
+qa_chain = None
+# Step 1: Load and Process JSON Data
+def load_json_file(file_path):
+    with open(file_path, "r", encoding="utf-8") as file:
+        data = json.load(file)
+    return data
+def setup_vector_store_from_json(json_data):
+    # Create Document objects with URLs and content
+    documents = [Document(page_content=item["content"], metadata={"url": item["url"]}) for item in json_data]
+    # Create embeddings and store them in FAISS
+    #embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+    vector_store = FAISS.from_documents(documents, embeddings)
+    return vector_store
+def setup_qa_chain(vector_store):
+    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)
+    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
+    return qa_chain
+# Initialize Chainlit: Preload data when the chat starts
+@cl.on_chat_start
+async def chat_start():
+    global vector_store, qa_chain
+    # Load and preprocess the JSON file
+    json_data = load_json_file("football_players.json")
+    vector_store = setup_vector_store_from_json(json_data)
+    qa_chain = setup_qa_chain(vector_store)
+    # Send a welcome message
+    await cl.Message(content="Welcome to the RAG app! Ask me any question based on the knowledge base.").send()
+# Process user queries
+@cl.on_message
+async def main(message: cl.Message):
+    global qa_chain
+    # Ensure the QA chain is ready
+    if qa_chain is None:
+        await cl.Message(content="The app is still initializing. Please wait a moment and try again.").send()
+        return
+    # Get query from the user and run the QA chain
+    query = message.content
+    response = qa_chain({"query": query})
+    # Extract the answer and source documents
+    answer = response["result"]
+    sources = response["source_documents"]
+    # Format and send the response
+    await cl.Message(content=f"**Answer:** {answer}").send()
+    if sources:
+        await cl.Message(content="**Sources:**").send()
+        for i, doc in enumerate(sources, 1):
+            url = doc.metadata.get("url", "No URL available")
+            await cl.Message(content=f"**Source {i}:** {doc.page_content}\n**URL:** {url}").send()

old_scripts/bpl_scraper.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import requests
+from bs4 import BeautifulSoup
+import os
+import json
+import re
+from typing import List, Dict
+import logging
+from urllib.parse import urljoin, urlparse
+class DigitalCommonwealthScraper:
+    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
+        """
+        Initialize the scraper with base URL and logging
+        :param base_url: Base URL for Digital Commonwealth
+        """
+        self.base_url = base_url
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+        # Headers to mimic browser request
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+    def fetch_page(self, url: str) -> requests.Response:
+        """
+        Fetch webpage content with error handling
+        :param url: URL to fetch
+        :return: Response object
+        """
+        try:
+            response = requests.get(url, headers=self.headers)
+            response.raise_for_status()
+            return response
+        except requests.RequestException as e:
+            self.logger.error(f"Error fetching {url}: {e}")
+            return None
+    def extract_json_metadata(self, url: str) -> Dict:
+        """
+        Extract JSON metadata from the page
+        :param url: URL of the page
+        :return: Dictionary of metadata
+        """
+        json_url = f"{url}.json"
+        response = self.fetch_page(json_url)
+        if response:
+            try:
+                return response.json()
+            except json.JSONDecodeError:
+                self.logger.error(f"Could not parse JSON from {json_url}")
+                return {}
+        return {}
+    def extract_images(self, url: str) -> List[Dict]:
+        """
+        Extract images from the page
+        :param url: URL of the page to scrape
+        :return: List of image dictionaries
+        """
+        # Fetch page content
+        response = self.fetch_page(url)
+        if not response:
+            return []
+        # Parse HTML
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Extract JSON metadata
+        metadata = self.extract_json_metadata(url)
+        # List to store images
+        images = []
+        # Strategy 1: Look for image viewers or specific image containers
+        image_containers = [
+            soup.find('div', class_='viewer-container'),
+            soup.find('div', class_='image-viewer'),
+            soup.find('div', id='image-container')
+        ]
+        # Strategy 2: Find all image tags
+        img_tags = soup.find_all('img')
+        # Combine image sources
+        for img in img_tags:
+            # Get image source
+            src = img.get('src')
+            if not src:
+                continue
+            # Resolve relative URLs
+            full_src = urljoin(url, src)
+            # Extract alt text or use filename
+            alt = img.get('alt', os.path.basename(urlparse(full_src).path))
+            # Create image dictionary
+            image_info = {
+                'url': full_src,
+                'alt': alt,
+                'source_page': url
+            }
+            # Try to add metadata if available
+            if metadata:
+                try:
+                    # Extract relevant metadata from JSON if possible
+                    image_info['metadata'] = {
+                        'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
+                        'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
+                        'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
+                    }
+                except Exception as e:
+                    self.logger.warning(f"Error extracting metadata: {e}")
+            images.append(image_info)
+        return images
+    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
+        """
+        Download images to local directory
+        :param images: List of image dictionaries
+        :param output_dir: Directory to save images
+        :return: List of downloaded file paths
+        """
+        # Create output directory
+        os.makedirs(output_dir, exist_ok=True)
+        downloaded_files = []
+        for i, image in enumerate(images):
+            try:
+                response = requests.get(image['url'], headers=self.headers)
+                response.raise_for_status()
+                # Generate filename
+                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
+                filename = os.path.join(output_dir, f'image_{i}{ext}')
+                with open(filename, 'wb') as f:
+                    f.write(response.content)
+                downloaded_files.append(filename)
+                self.logger.info(f"Downloaded: {filename}")
+            except Exception as e:
+                self.logger.error(f"Error downloading {image['url']}: {e}")
+        return downloaded_files
+#def main():
+    # Example usage
+ #   scraper = DigitalCommonwealthScraper()
+  #
+    # Example URL from input
+   # url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
+    # Extract images
+    #images = scraper.extract_images(url)
+    # Print image information
+    #for img in images:
+     #   print(json.dumps(img, indent=2))
+    # Optional: Download images
+    #scraper.download_images(images)
+#if __name__ == "__main__":
+ #   main()

old_scripts/faiss_migrate.ipynb ADDED Viewed

	@@ -0,0 +1,179 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Used this to migrate vectors from our FAISS indices to Pinecone. I recommend you use our scripts to ingest your data directly into Pinecone instead. To run this notebook, point it at a folder containing the index.faiss and index.pkl files that you want to ingest into Pinecone."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\dfole\\Desktop\\CS549\\pinecone_venv\\Lib\\site-packages\\pinecone\\data\\index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from tqdm.autonotebook import tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "import time\n",
+    "from pinecone import Pinecone, ServerlessSpec\n",
+    "\n",
+    "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
+    "\n",
+    "pc = Pinecone(api_key=pinecone_api_key)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_huggingface import HuggingFaceEmbeddings\n",
+    "\n",
+    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 4685/4685 [1:57:28<00:00,  1.50s/it]  \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully migrated 468455 documents to Pinecone index 'bpl-rag'\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from langchain_community.vectorstores import FAISS\n",
+    "from pinecone import Pinecone, ServerlessSpec\n",
+    "from langchain_community.embeddings import OpenAIEmbeddings\n",
+    "from tqdm import tqdm\n",
+    "from langchain_pinecone import PineconeVectorStore\n",
+    "\n",
+    "def migrate_faiss_to_pinecone(\n",
+    "    faiss_index_path: str,\n",
+    "    pinecone_api_key: str,\n",
+    "    index_name: str,\n",
+    "    batch_size: int = 100\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    Migrate a local FAISS index to a Pinecone index.\n",
+    "\n",
+    "    Only documents whose metadata 'field' is one of the title/abstract fields\n",
+    "    listed below and whose text is longer than 3 characters are migrated.\n",
+    "\n",
+    "    Args:\n",
+    "        faiss_index_path: Folder containing the local FAISS index (index.faiss / index.pkl)\n",
+    "        pinecone_api_key: Your Pinecone API key\n",
+    "        index_name: Name of the Pinecone index to write to (this function does not create it)\n",
+    "        batch_size: Number of vectors to upload in each batch\n",
+    "\n",
+    "    Returns:\n",
+    "        The PineconeVectorStore wrapping the target index.\n",
+    "    \"\"\"\n",
+    "    wanted_fields = {'abstract_tsi', 'title_info_primary_tsi', 'title_info_primary_subtitle_tsi', 'title_info_alternative_tsim'}\n",
+    "\n",
+    "    # Load the local FAISS index\n",
+    "    embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
+    "    # NOTE: allow_dangerous_deserialization unpickles index.pkl; only load indices you built yourself.\n",
+    "    faiss_vectorstore = FAISS.load_local(faiss_index_path, embeddings,allow_dangerous_deserialization=True)\n",
+    "    pc = Pinecone(api_key=pinecone_api_key)\n",
+    "\n",
+    "    index = pc.Index(index_name)\n",
+    "\n",
+    "    # Keep only the documents we want to migrate\n",
+    "    docs = {\n",
+    "        uuid: doc\n",
+    "        for uuid, doc in faiss_vectorstore.docstore._dict.items()\n",
+    "        if doc.metadata.get('field') in wanted_fields and len(doc.page_content) > 3\n",
+    "    }\n",
+    "    doc_ids = list(docs)\n",
+    "    total_docs = len(doc_ids)\n",
+    "\n",
+    "    # FAISS row numbers refer to the unfiltered index, so look up each kept\n",
+    "    # document's row by id instead of assuming row j is the j-th kept document.\n",
+    "    row_of = {doc_id: row for row, doc_id in faiss_vectorstore.index_to_docstore_id.items()}\n",
+    "\n",
+    "    pinecone_vectorstore = PineconeVectorStore(index=index, embedding=embeddings)\n",
+    "\n",
+    "    # Batch processing\n",
+    "    for i in tqdm(range(0, total_docs, batch_size)):\n",
+    "        batch_ids = doc_ids[i:i + batch_size]\n",
+    "        batch_docs = [docs[doc_id] for doc_id in batch_ids]\n",
+    "        batch_embeddings = [faiss_vectorstore.index.reconstruct(row_of[doc_id]).tolist()\n",
+    "                            for doc_id in batch_ids]\n",
+    "\n",
+    "        # Create metadata for each document\n",
+    "        metadatas = [doc.metadata for doc in batch_docs]\n",
+    "        texts = [doc.page_content for doc in batch_docs]\n",
+    "        # Add vectors to Pinecone\n",
+    "        # NOTE(review): confirm that add_texts honours the `embeddings` argument;\n",
+    "        # if it does not, the texts are re-embedded and the reconstructed vectors are unused.\n",
+    "        pinecone_vectorstore.add_texts(\n",
+    "            texts=texts,\n",
+    "            metadatas=metadatas,\n",
+    "            embeddings=batch_embeddings,\n",
+    "            ids=batch_ids\n",
+    "        )\n",
+    "\n",
+    "    print(f\"Successfully migrated {total_docs} documents to Pinecone index '{index_name}'\")\n",
+    "    return pinecone_vectorstore\n",
+    "\n",
+    "# Example usage:\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Set your credentials and paths\n",
+    "    FAISS_INDEX_PATH = \"faiss_900_1200\"\n",
+    "    # SECURITY: never hard-code API keys in a notebook; read the key from the\n",
+    "    # environment. Any key that was ever committed must be treated as leaked and rotated.\n",
+    "    PINECONE_API_KEY = os.environ[\"PINECONE_API_KEY\"]\n",
+    "    INDEX_NAME = \"bpl-rag\"\n",
+    "    \n",
+    "    # Perform migration\n",
+    "    pinecone_vs = migrate_faiss_to_pinecone(\n",
+    "        faiss_index_path=FAISS_INDEX_PATH,\n",
+    "        pinecone_api_key=PINECONE_API_KEY,\n",
+    "        index_name=INDEX_NAME,\n",
+    "        batch_size=100\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pinecone_venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

old_scripts/new_streamlit.app ADDED Viewed

	@@ -0,0 +1,152 @@

+import streamlit as st
+import os
+from typing import List, Tuple, Optional
+from pinecone import Pinecone
+from langchain_pinecone import PineconeVectorStore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import PromptTemplate
+from dotenv import load_dotenv
+from RAG import RAG
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Page configuration
+st.set_page_config(
+    page_title="RAG Chatbot",
+    page_icon="🤖",
+    layout="wide"
+)
+def initialize_models() -> Tuple[Optional[ChatOpenAI], HuggingFaceEmbeddings]:
+    """Initialize the language model and embeddings."""
+    try:
+        load_dotenv()
+        # Initialize OpenAI model
+        llm = ChatOpenAI(
+            model="gpt-4",  # Changed from gpt-4o-mini which appears to be a typo
+            temperature=0,
+            timeout=60,  # Added reasonable timeout
+            max_retries=2
+        )
+        # Initialize embeddings
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2"
+        )
+        return llm, embeddings
+    except Exception as e:
+        logger.error(f"Error initializing models: {str(e)}")
+        st.error(f"Failed to initialize models: {str(e)}")
+        return None, None
+def process_message(
+    query: str,
+    llm: ChatOpenAI,
+    index_name: str,
+    embeddings: HuggingFaceEmbeddings
+) -> Tuple[str, List]:
+    """Process the user message using the RAG system."""
+    try:
+        response, sources = RAG(
+            query=query,
+            llm=llm,
+            index_name=index_name,
+            embeddings=embeddings
+        )
+        return response, sources
+    except Exception as e:
+        logger.error(f"Error in process_message: {str(e)}")
+        return f"Error processing message: {str(e)}", []
+def display_sources(sources: List) -> None:
+    """Display sources in expandable sections with proper formatting and custom URLs."""
+    if not sources:
+        st.info("No sources available for this response.")
+        return
+    st.subheader("Sources")
+    for i, doc in enumerate(sources, 1):
+        try:
+            with st.expander(f"Source {i}"):
+                if hasattr(doc, 'page_content'):
+                    st.markdown(f"**Content:** {doc.page_content}")
+                    if hasattr(doc, 'metadata'):
+                        # Construct URL from source metadata
+                        # Display other metadata
+                        for key, value in doc.metadata.items():
+                            if key != 'source':  # Skip source since we already used it for URL
+                                st.markdown(f"**{key.title()}:** {value}")
+                else:
+                    st.markdown(f"**Content:** {str(doc)}")
+        except Exception as e:
+            logger.error(f"Error displaying source {i}: {str(e)}")
+            st.error(f"Error displaying source {i}")
+def main():
+    st.title("RAG Chatbot")
+    # Initialize session state
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    # Initialize models
+    llm, embeddings = initialize_models()
+    if not llm or not embeddings:
+        st.error("Failed to initialize the application. Please check the logs.")
+        return
+    # Constants
+    INDEX_NAME = 'bpl-rag'
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # Chat input
+    user_input = st.chat_input("Type your message here...")
+    if user_input:
+        # Display user message
+        with st.chat_message("user"):
+            st.markdown(user_input)
+        st.session_state.messages.append({"role": "user", "content": user_input})
+        # Process and display assistant response
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                response, sources = process_message(
+                    query=user_input,
+                    llm=llm,
+                    index_name=INDEX_NAME,
+                    embeddings=embeddings
+                )
+                if isinstance(response, str):
+                    st.markdown(response)
+                    st.session_state.messages.append({
+                        "role": "assistant",
+                        "content": response
+                    })
+                    # Display sources
+                    display_sources(sources)
+                else:
+                    st.error("Received an invalid response format")
+    # Footer
+    st.markdown("---")
+    st.markdown(
+        "Built with ❤️ using Streamlit + LangChain + OpenAI",
+        help="An AI-powered chatbot with RAG capabilities"
+    )
+if __name__ == "__main__":
+    main()

old_scripts/new_streamlit.py ADDED Viewed

	@@ -0,0 +1,188 @@

+import streamlit as st
+import os
+from typing import List, Tuple, Optional
+from pinecone import Pinecone
+from langchain_pinecone import PineconeVectorStore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import PromptTemplate
+from dotenv import load_dotenv
+from RAG import RAG
+from bpl_scraper import DigitalCommonwealthScraper
+import logging
+import json
+import shutil
+from PIL import Image
+import io
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Page configuration
+st.set_page_config(
+    page_title="Boston Public Library Chatbot",
+    page_icon="🤖",
+    layout="wide"
+)
+def initialize_models() -> Tuple[Optional[ChatOpenAI], HuggingFaceEmbeddings]:
+    """Initialize the language model and embeddings."""
+    try:
+        load_dotenv()
+        # Initialize OpenAI model
+        llm = ChatOpenAI(
+            model="gpt-4",  # Changed from gpt-4o-mini which appears to be a typo
+            temperature=0,
+            timeout=60,  # Added reasonable timeout
+            max_retries=2
+        )
+        # Initialize embeddings
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2"
+        )
+        return llm, embeddings
+    except Exception as e:
+        logger.error(f"Error initializing models: {str(e)}")
+        st.error(f"Failed to initialize models: {str(e)}")
+        return None, None
+def process_message(
+    query: str,
+    llm: ChatOpenAI,
+    index_name: str,
+    embeddings: HuggingFaceEmbeddings
+) -> Tuple[str, List]:
+    """Process the user message using the RAG system."""
+    try:
+        response, sources = RAG(
+            query=query,
+            llm=llm,
+            index_name=index_name,
+            embeddings=embeddings
+        )
+        return response, sources
+    except Exception as e:
+        logger.error(f"Error in process_message: {str(e)}")
+        return f"Error processing message: {str(e)}", []
+def display_sources(sources: List) -> None:
+    """Display sources in expandable sections with proper formatting."""
+    if not sources:
+        st.info("No sources available for this response.")
+        return
+    st.subheader("Sources")
+    for i, doc in enumerate(sources, 1):
+        try:
+            with st.expander(f"Source {i}"):
+                if hasattr(doc, 'page_content'):
+                    st.markdown(f"**Content:** {doc.page_content[0:100] + ' ...'}")
+                    if hasattr(doc, 'metadata'):
+                        for key, value in doc.metadata.items():
+                            st.markdown(f"**{key.title()}:** {value}")
+                        # Web Scraper to display images of sources
+                        # Especially helpful if the sources are images themselves
+                        # or are OCR'd text files
+                        scraper = DigitalCommonwealthScraper()
+                        images = scraper.extract_images(doc.metadata["URL"])
+                        images = images[:1]
+                        # If there are no images then don't display them
+                        if not images:
+                                st.warning("No images found on the page.")
+                                return
+                        # Download the images
+                        # Delete the directory if it already exists
+                        # to clear the existing cache of images for each listed source
+                        output_dir = 'downloaded_images'
+                        if os.path.exists(output_dir):
+                            shutil.rmtree(output_dir)
+                        # Download the main image to a local directory
+                        downloaded_files = scraper.download_images(images)
+                        # Display the image using st.image
+                        # Display the title of the image using img.get
+                        st.image(downloaded_files, width=400, caption=[
+                            img.get('alt', f'Image {i+1}') for i, img in enumerate(images)
+                            ])
+                else:
+                    st.markdown(f"**Content:** {str(doc)}")
+        except Exception as e:
+            logger.error(f"Error displaying source {i}: {str(e)}")
+            st.error(f"Error displaying source {i}")
+def main():
+    st.title("Boston Public Library RAG Chatbot")
+    # Initialize session state
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    # Initialize models
+    llm, embeddings = initialize_models()
+    if not llm or not embeddings:
+        st.error("Failed to initialize the application. Please check the logs.")
+        return
+    # Constants
+    INDEX_NAME = 'bpl-rag'
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # Chat input
+    user_input = st.chat_input("Type your message here...")
+    if user_input:
+        # Display user message
+        with st.chat_message("user"):
+            st.markdown(user_input)
+        st.session_state.messages.append({"role": "user", "content": user_input})
+        # Process and display assistant response
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                response, sources = process_message(
+                    query=user_input,
+                    llm=llm,
+                    index_name=INDEX_NAME,
+                    embeddings=embeddings
+                )
+                if isinstance(response, str):
+                    st.markdown(response)
+                    st.session_state.messages.append({
+                        "role": "assistant",
+                        "content": response
+                    })
+                    # Display sources
+                    display_sources(sources)
+                else:
+                    st.error("Received an invalid response format")
+    # Footer
+    st.markdown("---")
+    st.markdown(
+        "Built with ❤️ using Streamlit + LangChain + OpenAI",
+        help="An AI-powered chatbot with RAG capabilities"
+    )
+if __name__ == "__main__":
+    main()

old_scripts/streamlit-rag-app.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import streamlit as st
+import os
+import json
+from dotenv import load_dotenv
+# from langchain.chains import RetrievalQA
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings, OpenAI
+from langchain.schema import Document
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain_core.prompts import PromptTemplate
+# Load environment variables
+load_dotenv()
+# Get the OpenAI API key from the environment
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    st.error("OPENAI_API_KEY is not set. Please add it to your .env file.")
+# Initialize session state variables
+if 'vector_store' not in st.session_state:
+    st.session_state.vector_store = None
+# if 'qa_chain' not in st.session_state:
+#     st.session_state.qa_chain = None
+# def setup_qa_chain(vector_store):
+#     """Set up the QA chain with a retriever."""
+#     retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+#     llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)
+#     qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
+#     return qa_chain
+prompt_template = PromptTemplate.from_template("Answer the following query based on a number of context documents Query:{query},Context:{context},Answer:")
+def main():
+    # Set page title and header
+    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)
+    st.set_page_config(page_title="LibRAG", page_icon="📚")
+    st.title("Boston Public Library Database 📚")
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    # Sidebar for initialization
+    # st.sidebar.header("Initialize Knowledge Base")
+    # if st.sidebar.button("Load Data"):
+    #     try:
+    #         st.session_state.vector_store = FAISS.load_local(
+    #                     "vector-store", embeddings, allow_dangerous_deserialization=True
+    #                     )
+    #         st.session_state.qa_chain = setup_qa_chain(st.session_state.vector_store)
+    #         st.sidebar.success("Knowledge base loaded successfully!")
+    #     except Exception as e:
+    #         st.sidebar.error(f"Error loading data: {e}")
+    st.session_state.vector_store = FAISS.load_local("vector-store", embeddings, allow_dangerous_deserialization=True)
+    st.session_state.combine_docs_chain = create_stuff_documents_chain(llm, prompt_template)
+    st.session_stateretrieval_chain = create_retrieval_chain(st.session_state.vector_store.as_retriever(search_kwargs={"k": 3}), combine_docs_chain)
+    # st.session_state.qa_chain = setup_qa_chain(st.session_state.vector_store)
+    # Query input and processing
+    st.header("Ask a Question")
+    query = st.text_input("Enter your question about BPL's database")
+    response = llm.invoke()
+    if query:
+        # Check if vector store and QA chain are initialized
+        if st.session_state.response is None:
+            st.warning("Please load the knowledge base first using the sidebar.")
+        else:
+            # Run the query
+            try:
+                st.session_state.response = retrieval_chain.invoke({"input": f"{query}"})
+                # Display answer
+                st.subheader("Answer")
+                st.write(response["result"])
+                # Display sources
+                st.subheader("Sources")
+                sources = response["source_documents"]
+                for i, doc in enumerate(sources, 1):
+                    with st.expander(f"Source {i}"):
+                        st.write(f"**Content:** {doc.page_content}")
+                        st.write(f"**URL:** {doc.metadata.get('url', 'No URL available')}")
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
+if __name__ == "__main__":
+    main()

old_scripts/test_streamlit.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import streamlit as st
+import sys
+st.set_option('client.showErrorDetails', True)
+def main():
+    try:
+        st.title("Test App")
+        st.write("Hello World!")
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+        print(f"Error: {str(e)}", file=sys.stderr)
+if __name__ == "__main__":
+    main()