Spaces:

Boltuzamaki
/

Chat_with_youtube

Sleeping

App Files Files Community

Boltuzamaki committed on Sep 18, 2024

Commit

b7b243c

1 Parent(s): 525026c

init

Browse files

Files changed (5) hide show

.gitignore +162 -0
app.py +87 -0
src/__init__.py +0 -0
src/qna.py +118 -0
src/youtube_audio_loader.py +39 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,162 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import os
+from datetime import datetime
+import streamlit as st
+from src.qna import ConversationalQA
+from src.youtube_audio_loader import youtube_transcriber
+if "store" not in st.session_state:
+    st.session_state.store = {}
+if "docs" not in st.session_state:
+    st.session_state.docs = None
+if "messages" not in st.session_state:
+    st.session_state.messages = {}
+st.set_page_config(page_title="YouTube Transcriber & Chatbot")
+st.sidebar.title("Configuration")
+openai_api_key = st.sidebar.text_input("OpenAI API Key", type="password")
+os.environ["OPENAI_API_KEY"] = openai_api_key
+model = st.sidebar.selectbox(
+    "Model", options=["gpt-4o", "gpt-4o-mini"], index=0
+)
+use_whisper_api = st.sidebar.checkbox(
+    "Use Whisper API for Transcription", value=False
+)
+if use_whisper_api:
+    st.sidebar.warning("Using OpenAI Whisper API may incur costs.")
+    local = False
+else:
+    local = True
+st.title("YouTube Video Transcriber & Chatbot")
+youtube_link = st.text_input("Enter YouTube Video Link")
+if youtube_link:
+    st.video(youtube_link)
+# Transcription
+if st.button("Transcribe"):
+    if openai_api_key:
+        st.session_state.docs = youtube_transcriber(youtube_link, local=local)
+        st.session_state.messages = []
+        st.success("Transcription completed!")
+    else:
+        st.error("Please enter your OpenAI API key.")
+if st.session_state.docs:
+    qa_system = ConversationalQA(docs=st.session_state.docs)
+    st.write("### Ask me anything!")
+    def display_message(role, content, timestamp):
+        with st.chat_message(role):
+            st.markdown(f"**{role.capitalize()}:** {content}")
+            st.markdown(
+                f"<small><i>{timestamp}</i></small>", unsafe_allow_html=True
+            )
+    if st.session_state.messages:
+        for message in st.session_state.messages:
+            display_message(
+                message["role"], message["content"], message["timestamp"]
+            )
+    if prompt := st.chat_input("Your question here..."):
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        display_message("user", prompt, timestamp)
+        st.session_state.messages.append(
+            {"role": "user", "content": prompt, "timestamp": timestamp}
+        )
+        with st.spinner("Thinking..."):
+            response = qa_system.invoke_chain(
+                session_id="1", user_input=prompt
+            )
+            bot_response = response
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        display_message("bot", bot_response, timestamp)
+        st.session_state.messages.append(
+            {"role": "bot", "content": bot_response, "timestamp": timestamp}
+        )

src/__init__.py ADDED Viewed

File without changes

src/qna.py ADDED Viewed

	@@ -0,0 +1,118 @@

+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains.history_aware_retriever import (
+    create_history_aware_retriever,
+)
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_chroma import Chroma
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.chat_history import BaseChatMessageHistory
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+class ConversationalQA:
+    """
+    A class that handles conversational question-answering using a
+    retrieval-augmented generation approach with session history and
+    document retrieval capabilities.
+    """
+    def __init__(
+        self,
+        docs: list,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+    ):
+        """
+        Initialize the ConversationalQA class with API key, documents, and
+        text splitting configurations.
+        :param openai_api_key: OpenAI API key to access LLM
+        :param docs: List of documents to be used for retrieval and answering
+        :param chunk_size: Maximum size of each text chunk for processing
+        :param chunk_overlap: Number of characters to overlap between chunks
+        """
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        self.splits = self.text_splitter.split_documents(docs)
+        self.llm = ChatOpenAI()
+        self.vectorstore = Chroma.from_documents(
+            documents=self.splits,
+            embedding=OpenAIEmbeddings(),
+            collection_name="youtube",
+        )
+        self.retriever = self.vectorstore.as_retriever()
+        self.qa_system_prompt = """You are an assistant for question-answering
+        tasks. Use the following pieces of retrieved context to answer the
+        question. If you don't know the answer, just say that you don't know.
+        Use three sentences maximum and keep the answer
+        concise.\n\n{context}"""
+        self.qa_prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", self.qa_system_prompt),
+                MessagesPlaceholder("chat_history"),
+                ("human", "{input}"),
+            ]
+        )
+        self.contextualize_q_system_prompt = """Given a chat history and the
+        latest user question which might reference context in the chat
+        history, formulate a standalone question which can be understood
+        without the chat history. Do NOT answer the question, just
+        reformulate it if needed and otherwise return it as is."""
+        self.contextualize_q_prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", self.contextualize_q_system_prompt),
+                MessagesPlaceholder("chat_history"),
+                ("human", "{input}"),
+            ]
+        )
+        self.question_answer_chain = create_stuff_documents_chain(
+            self.llm, self.qa_prompt
+        )
+        self.history_aware_chain = create_history_aware_retriever(
+            self.llm, self.retriever, self.contextualize_q_prompt
+        )
+        self.rag_chain = create_retrieval_chain(
+            self.history_aware_chain, self.question_answer_chain
+        )
+        self.store = {}
+    def get_session_history(self, session_id: str) -> BaseChatMessageHistory:
+        """
+        Retrieve or create a chat history for a given session ID.
+        :param session_id: Unique session identifier
+        :return: ChatMessageHistory object for the session
+        """
+        if session_id not in self.store:
+            self.store[session_id] = ChatMessageHistory()
+        return self.store[session_id]
+    def invoke_chain(self, session_id: str, user_input: str) -> str:
+        """
+        Invoke the conversational question-answering chain with user input
+        and session history.
+        :param session_id: Unique session identifier
+        :param user_input: User's question input
+        :return: Answer generated by the system
+        """
+        conversational_rag_chain = RunnableWithMessageHistory(
+            self.rag_chain,
+            self.get_session_history,
+            input_messages_key="input",
+            history_messages_key="chat_history",
+            output_messages_key="answer",
+        )
+        return conversational_rag_chain.invoke(
+            {"input": user_input},
+            config={"configurable": {"session_id": session_id}},
+        )["answer"]

src/youtube_audio_loader.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os
+from langchain.document_loaders.parsers.audio import (
+    OpenAIWhisperParser,
+    OpenAIWhisperParserLocal,
+)
+from langchain_community.document_loaders.blob_loaders.youtube_audio import (
+    YoutubeAudioLoader,
+)
+from langchain_community.document_loaders.generic import GenericLoader
+def youtube_transcriber(youtube_video_link, local=True):
+    urls = [youtube_video_link]
+    save_dir = os.path.expanduser("~/Downloads/YouTube")
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    if local:
+        loader = GenericLoader(
+            YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParserLocal()
+        )
+    else:
+        loader = GenericLoader(
+            YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParser()
+        )
+    docs = loader.load()
+    for file_name in os.listdir(save_dir):
+        file_path = os.path.join(save_dir, file_name)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+    if not os.listdir(save_dir):
+        os.rmdir(save_dir)
+    return docs