Spaces:
Runtime error
Runtime error
Commit
·
150092a
1
Parent(s):
bc3b8cd
Delete Web Application
Browse files
- Web Application/.env +0 -1
- Web Application/10K_Annual_Reports/Alphabet.pdf +0 -0
- Web Application/10K_Annual_Reports/Amazon.pdf +0 -0
- Web Application/10K_Annual_Reports/Apple.pdf +0 -0
- Web Application/10K_Annual_Reports/Meta.pdf +0 -3
- Web Application/10K_Annual_Reports/Microsoft.pdf +0 -3
- Web Application/10K_Annual_Reports/Netflix.pdf +0 -3
- Web Application/10K_Annual_Reports/Tesla.pdf +0 -3
- Web Application/Dockerfile +0 -22
- Web Application/Pipfile +0 -21
- Web Application/Pipfile.lock +0 -0
- Web Application/VectorStoreIndex.zip +0 -3
- Web Application/VectorStoreIndex/chroma-collections.parquet +0 -3
- Web Application/VectorStoreIndex/chroma-embeddings.parquet +0 -3
- Web Application/VectorStoreIndex/index/id_to_uuid_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl +0 -3
- Web Application/VectorStoreIndex/index/index_4687da76-fa8c-47cd-96a2-c9f3fc08313a.bin +0 -3
- Web Application/VectorStoreIndex/index/index_metadata_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl +0 -3
- Web Application/VectorStoreIndex/index/uuid_to_id_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl +0 -3
- Web Application/__pycache__/vectorstore.cpython-310.pyc +0 -0
- Web Application/app.py +0 -46
- Web Application/requirements.txt +0 -7
- Web Application/vectorstore.py +0 -86
Web Application/.env
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
OPENAI_API_KEY="<REDACTED: secret key removed; the exposed key must be revoked>"
|
|
|
|
Web Application/10K_Annual_Reports/Alphabet.pdf
DELETED
The diff for this file is too large to render.
See raw diff
|
|
Web Application/10K_Annual_Reports/Amazon.pdf
DELETED
The diff for this file is too large to render.
See raw diff
|
|
Web Application/10K_Annual_Reports/Apple.pdf
DELETED
The diff for this file is too large to render.
See raw diff
|
|
Web Application/10K_Annual_Reports/Meta.pdf
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4a2bebf058c6e947c09f9fdb510010a92f6698b458941956ad0bbdaa043ae6de
|
3 |
-
size 1111637
|
|
|
|
|
|
|
|
Web Application/10K_Annual_Reports/Microsoft.pdf
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:50507a219c93a452c1a15e1c5bb5d01d53a97d75c1ce91ea0a9703ef7debca95
|
3 |
-
size 1547825
|
|
|
|
|
|
|
|
Web Application/10K_Annual_Reports/Netflix.pdf
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d95d9d4a03473863582a234e8edfd97eac97f1be9e552f9467e95dd8ce61280e
|
3 |
-
size 1410523
|
|
|
|
|
|
|
|
Web Application/10K_Annual_Reports/Tesla.pdf
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3a2bfbae724f9f4a7b28539993ca79c54db6ead3ff5105693a546e5a2134bbde
|
3 |
-
size 2659773
|
|
|
|
|
|
|
|
Web Application/Dockerfile
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
FROM python:3.9
|
2 |
-
|
3 |
-
RUN pip install virtualenv && virtualenv venv -p python3
|
4 |
-
ENV VIRTUAL_ENV=/venv
|
5 |
-
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
6 |
-
|
7 |
-
WORKDIR /app
|
8 |
-
COPY requirements.txt ./
|
9 |
-
RUN pip install -r requirements.txt
|
10 |
-
|
11 |
-
RUN git clone https://github.com/facebookresearch/detectron2.git
|
12 |
-
RUN python -m pip install -e detectron2
|
13 |
-
|
14 |
-
# Install dependencies
|
15 |
-
RUN apt-get update && apt-get install libgl1 -y
|
16 |
-
RUN pip install -U nltk
|
17 |
-
RUN [ "python3", "-c", "import nltk; nltk.download('punkt', download_dir='/usr/local/nltk_data')" ]
|
18 |
-
|
19 |
-
COPY . /app
|
20 |
-
|
21 |
-
# Run the application:
|
22 |
-
CMD ["python", "-u", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Web Application/Pipfile
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
[[source]]
|
2 |
-
url = "https://pypi.org/simple"
|
3 |
-
verify_ssl = true
|
4 |
-
name = "pypi"
|
5 |
-
|
6 |
-
[packages]
|
7 |
-
langchain = "*"
|
8 |
-
openai = "*"
|
9 |
-
pybind11 = "*"
|
10 |
-
chromadb = "*"
|
11 |
-
cython = "*"
|
12 |
-
unstructured = {extras = ["local-inference"], version = "*"}
|
13 |
-
layoutparser = {extras = ["layoutmodels", "tesseract"], version = "*"}
|
14 |
-
pytesseract = "*"
|
15 |
-
pillow = "==9.0.0"
|
16 |
-
tiktoken = "*"
|
17 |
-
|
18 |
-
[dev-packages]
|
19 |
-
|
20 |
-
[requires]
|
21 |
-
python_version = "3.11"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Web Application/Pipfile.lock
DELETED
The diff for this file is too large to render.
See raw diff
|
|
Web Application/VectorStoreIndex.zip
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:21120b3b81f1396478c8c377dcebe5a686ee501de7b461a3bf198f8da0eef09c
|
3 |
-
size 106261438
|
|
|
|
|
|
|
|
Web Application/VectorStoreIndex/chroma-collections.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:250cf833cc54545b03d2454a5ff23eda3e047f8a3c465d29243f2e697b095848
|
3 |
-
size 557
|
|
|
|
|
|
|
|
Web Application/VectorStoreIndex/chroma-embeddings.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:32c73aa28836865bbc6964cb3f8f0a540b9639828e39b6fa3c4ae0cb7fc7a1a3
|
3 |
-
size 114611418
|
|
|
|
|
|
|
|
Web Application/VectorStoreIndex/index/id_to_uuid_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:c839634ab7858bf13401325e4055d5a3df0dcd5984705ecd5d83a79966363e0e
|
3 |
-
size 150307
|
|
|
|
|
|
|
|
Web Application/VectorStoreIndex/index/index_4687da76-fa8c-47cd-96a2-c9f3fc08313a.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4133367b79c8a0bed4a21a4885f7d35008f9bc69c9fd0b513eafcfb59faddb0b
|
3 |
-
size 29136520
|
|
|
|
|
|
|
|
Web Application/VectorStoreIndex/index/index_metadata_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4be5c2f38188c24bc82c3ae21db9bcbf876838e71e8b95c31435a90f960c26f2
|
3 |
-
size 74
|
|
|
|
|
|
|
|
Web Application/VectorStoreIndex/index/uuid_to_id_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:ecdc7842716c5b96c8438096e1d1f5a276da5742cf13ea2101f83de45c0f5456
|
3 |
-
size 175727
|
|
|
|
|
|
|
|
Web Application/__pycache__/vectorstore.cpython-310.pyc
DELETED
Binary file (4.1 kB)
|
|
Web Application/app.py
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
import os, gradio
|
2 |
-
from langchain.document_loaders import UnstructuredPDFLoader
|
3 |
-
from langchain.indexes import VectorstoreIndexCreator
|
4 |
-
from vectorstore import VectorstoreIndexCreator
|
5 |
-
|
6 |
-
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
|
7 |
-
|
8 |
-
text_folder = '10K_Annual_Reports'
|
9 |
-
loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in os.listdir(text_folder)]
|
10 |
-
|
11 |
-
# Create the index, if it does not exist, and save it
|
12 |
-
if not os.path.isfile('VectorStoreIndex/chroma-embeddings.parquet'):
|
13 |
-
from langchain.vectorstores import Chroma
|
14 |
-
index = VectorstoreIndexCreator(vectorstore_cls=Chroma, vectorstore_kwargs={ "persist_directory": "VectorStoreIndex/"}).from_loaders(loaders)
|
15 |
-
index.vectorstore.persist()
|
16 |
-
|
17 |
-
# Load the saved index
|
18 |
-
index_saved = VectorstoreIndexCreator().from_persistent_index("VectorStoreIndex/")
|
19 |
-
|
20 |
-
description = """This is an AI conversational agent where you provide it with the annual reports of companies, and it can study it and answer any questions
|
21 |
-
you have about it. Currently, the LLM has been trained on the following companies' 10-K reports: Amazon, Apple, Alphabet (Google), Meta (Facebook), Microsoft,
|
22 |
-
Netflix and Tesla.' I plan to include more companies' 10-K reports in future.
|
23 |
-
|
24 |
-
Once the LLM is trained on a new 10-K report, it stores the vector embeddings of the document locally using ChromaDB to make the querying faster and also to
|
25 |
-
save time and money on creating the vector embeddings for the same document in future.
|
26 |
-
|
27 |
-
The LLM's universe is only the 10-K reports it has been trained on; it cannot pull information from the internet. So, you can ask it about anything that's
|
28 |
-
contained in their 10-K reports. If it cannot find an answer to your query within the 10-K reports, it will reply with "I don't know". Some example of questions
|
29 |
-
you can ask are:
|
30 |
-
|
31 |
-
- What are the risks for Tesla?
|
32 |
-
- What was Google's earnings for the last fiscal year?
|
33 |
-
- Who are the competetors of Apple?
|
34 |
-
|
35 |
-
An example of querying about something the LLM's training did not include:
|
36 |
-
|
37 |
-
- Query: "What is Tesco?"
|
38 |
-
- Response: " Tesco is not mentioned in the context, so I don't know."
|
39 |
-
"""
|
40 |
-
|
41 |
-
def chat_response(query):
|
42 |
-
return index_saved.query(query)
|
43 |
-
|
44 |
-
interface = gradio.Interface(fn=chat_response, inputs="text", outputs="text", title='Annual Reports GPT', description=description)
|
45 |
-
|
46 |
-
interface.launch() #server_name="0.0.0.0", server_port=8080, share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Web Application/requirements.txt
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
gradio
|
2 |
-
langchain
|
3 |
-
unstructured
|
4 |
-
openai
|
5 |
-
chromadb
|
6 |
-
unstructured
|
7 |
-
tiktoken
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Web Application/vectorstore.py
DELETED
@@ -1,86 +0,0 @@
|
|
1 |
-
from typing import Any, List, Optional, Type
|
2 |
-
|
3 |
-
from pydantic import BaseModel, Extra, Field
|
4 |
-
|
5 |
-
from langchain.base_language import BaseLanguageModel
|
6 |
-
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
7 |
-
from langchain.chains.retrieval_qa.base import RetrievalQA
|
8 |
-
from langchain.document_loaders.base import BaseLoader
|
9 |
-
from langchain.embeddings.base import Embeddings
|
10 |
-
from langchain.embeddings.openai import OpenAIEmbeddings
|
11 |
-
from langchain.llms.openai import OpenAI
|
12 |
-
from langchain.schema import Document
|
13 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
14 |
-
from langchain.vectorstores.base import VectorStore
|
15 |
-
from langchain.vectorstores.chroma import Chroma
|
16 |
-
|
17 |
-
|
18 |
-
def _get_default_text_splitter() -> TextSplitter:
|
19 |
-
return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
20 |
-
|
21 |
-
|
22 |
-
class VectorStoreIndexWrapper(BaseModel):
|
23 |
-
"""Wrapper around a vectorstore for easy access."""
|
24 |
-
|
25 |
-
vectorstore: VectorStore
|
26 |
-
|
27 |
-
class Config:
|
28 |
-
"""Configuration for this pydantic object."""
|
29 |
-
|
30 |
-
extra = Extra.forbid
|
31 |
-
arbitrary_types_allowed = True
|
32 |
-
|
33 |
-
def query(
|
34 |
-
self, question: str, llm: Optional[BaseLanguageModel] = None, **kwargs: Any
|
35 |
-
) -> str:
|
36 |
-
"""Query the vectorstore."""
|
37 |
-
llm = llm or OpenAI(temperature=0)
|
38 |
-
chain = RetrievalQA.from_chain_type(
|
39 |
-
llm, retriever=self.vectorstore.as_retriever(), **kwargs
|
40 |
-
)
|
41 |
-
return chain.run(question)
|
42 |
-
|
43 |
-
def query_with_sources(
|
44 |
-
self, question: str, llm: Optional[BaseLanguageModel] = None, **kwargs: Any
|
45 |
-
) -> dict:
|
46 |
-
"""Query the vectorstore and get back sources."""
|
47 |
-
llm = llm or OpenAI(temperature=0)
|
48 |
-
chain = RetrievalQAWithSourcesChain.from_chain_type(
|
49 |
-
llm, retriever=self.vectorstore.as_retriever(), **kwargs
|
50 |
-
)
|
51 |
-
return chain({chain.question_key: question})
|
52 |
-
|
53 |
-
|
54 |
-
class VectorstoreIndexCreator(BaseModel):
|
55 |
-
"""Logic for creating indexes."""
|
56 |
-
|
57 |
-
vectorstore_cls: Type[VectorStore] = Chroma
|
58 |
-
embedding: Embeddings = Field(default_factory=OpenAIEmbeddings)
|
59 |
-
text_splitter: TextSplitter = Field(default_factory=_get_default_text_splitter)
|
60 |
-
vectorstore_kwargs: dict = Field(default_factory=dict)
|
61 |
-
|
62 |
-
class Config:
|
63 |
-
"""Configuration for this pydantic object."""
|
64 |
-
|
65 |
-
extra = Extra.forbid
|
66 |
-
arbitrary_types_allowed = True
|
67 |
-
|
68 |
-
def from_loaders(self, loaders: List[BaseLoader]) -> VectorStoreIndexWrapper:
|
69 |
-
"""Create a vectorstore index from loaders."""
|
70 |
-
docs = []
|
71 |
-
for loader in loaders:
|
72 |
-
docs.extend(loader.load())
|
73 |
-
return self.from_documents(docs)
|
74 |
-
|
75 |
-
def from_documents(self, documents: List[Document]) -> VectorStoreIndexWrapper:
|
76 |
-
"""Create a vectorstore index from documents."""
|
77 |
-
sub_docs = self.text_splitter.split_documents(documents)
|
78 |
-
vectorstore = self.vectorstore_cls.from_documents(
|
79 |
-
sub_docs, self.embedding, **self.vectorstore_kwargs
|
80 |
-
)
|
81 |
-
return VectorStoreIndexWrapper(vectorstore=vectorstore)
|
82 |
-
|
83 |
-
def from_persistent_index(self, path: str) -> VectorStoreIndexWrapper:
|
84 |
-
"""Load a vectorstore index from a persistent index."""
|
85 |
-
vectorstore = self.vectorstore_cls(persist_directory=path, embedding_function=self.embedding)
|
86 |
-
return VectorStoreIndexWrapper(vectorstore=vectorstore)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|