hamxahbhattii commited on
Commit
6330947
1 Parent(s): 04be7ab

added Jine

Browse files
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ Data/
3
+ Front-end/
4
+ .env
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+
4
+ WORKDIR /code
5
+
6
+ COPY ./requirements.txt /code/requirements.txt
7
+
8
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
9
+
10
+ RUN useradd -m -u 1000 user
11
+ USER user
12
+ ENV HOME=/home/user \
13
+ PATH=/home/user/.local/bin:$PATH
14
+
15
+ WORKDIR $HOME/app
16
+
17
+ COPY --chown=user . $HOME/app
18
+
19
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
Langchain_bot.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Logs/chatbot.log ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,10 +1,2 @@
1
- ---
2
- title: Jin E
3
- emoji: 👀
4
- colorFrom: red
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # JIN-e
2
+ This is a chatbot built on ChatGPT using LangChain that answers questions related to policies.
 
 
 
 
 
 
 
 
Requirements Documents/Requirement specification Questionier.docx ADDED
Binary file (498 kB). View file
 
Vector Store/chroma.sqlite3 ADDED
Binary file (127 kB). View file
 
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
+
3
+ # Import the Jine class and other necessary modules
4
+ from jine import Jine # Replace 'your_module_name' with the actual module name
5
+ from pydantic import BaseModel
6
+
7
+ # Load your environment variables
8
+ from dotenv import load_dotenv
9
+ import os
10
+
11
+
12
+ load_dotenv()
13
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
14
+ DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
15
+ VECTOR_STORE_DIRECTORY = os.getenv("VECTOR_STORE_DIRCTORY")
16
+ VECTOR_STORE_CHECK = os.getenv("VECTOR_STORE_CHECK")
17
+ DEBUG = os.getenv("DEBUG")
18
+
19
+ # Initialize Jine
20
+ jine = Jine(OPENAI_API_KEY, VECTOR_STORE_DIRECTORY, VECTOR_STORE_CHECK, DATA_DIRECTORY, DEBUG)
21
+ jine.load_model()
22
+
23
+ # Create a FastAPI app
24
+ app = FastAPI()
25
+
26
+ # Define a request model
27
+ class ChatRequest(BaseModel):
28
+ user_question: str
29
+
30
+ # Define a response model
31
+ class ChatResponse(BaseModel):
32
+ user_question: str
33
+ chatbot_response: str
34
+
35
+ # Define the chatbot endpoint
36
@app.post("/chatbot/", response_model=ChatResponse)
def chat_with_bot(request: ChatRequest):
    """Answer one user question with the module-level ``jine`` bot.

    Args:
        request: Request body carrying the ``user_question`` string.

    Returns:
        A ``ChatResponse`` echoing the question together with the bot's reply.

    Raises:
        HTTPException: 400 when ``user_question`` is empty or whitespace only.
    """
    user_question = request.user_question
    # Reject blank input up front instead of forwarding it to the retrieval
    # chain, which would spend an LLM call on a question with no content.
    if not user_question.strip():
        raise HTTPException(status_code=400, detail="user_question must not be empty.")
    chatbot_response = jine.chat(user_question)
    return ChatResponse(user_question=user_question, chatbot_response=chatbot_response)
chainlit_interface.py ADDED
File without changes
environment.yml ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: jine
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - asttokens=2.4.0=pyhd8ed1ab_0
7
+ - backcall=0.2.0=pyh9f0ad1d_0
8
+ - backports=1.0=pyhd8ed1ab_3
9
+ - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0
10
+ - bzip2=1.0.8=he774522_0
11
+ - ca-certificates=2023.7.22=h56e8100_0
12
+ - colorama=0.4.6=pyhd8ed1ab_0
13
+ - comm=0.1.4=pyhd8ed1ab_0
14
+ - debugpy=1.6.7=py310hd77b12b_0
15
+ - decorator=5.1.1=pyhd8ed1ab_0
16
+ - exceptiongroup=1.1.3=pyhd8ed1ab_0
17
+ - executing=1.2.0=pyhd8ed1ab_0
18
+ - importlib-metadata=6.8.0=pyha770c72_0
19
+ - importlib_metadata=6.8.0=hd8ed1ab_0
20
+ - ipykernel=6.25.2=pyh60829e3_0
21
+ - ipython=8.16.1=pyh5737063_0
22
+ - jedi=0.19.1=pyhd8ed1ab_0
23
+ - jupyter_client=8.4.0=pyhd8ed1ab_0
24
+ - jupyter_core=5.4.0=py310h5588dad_0
25
+ - libffi=3.4.4=hd77b12b_0
26
+ - libsodium=1.0.18=h8d14728_1
27
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
28
+ - nest-asyncio=1.5.8=pyhd8ed1ab_0
29
+ - openssl=1.1.1l=h8ffe710_0
30
+ - packaging=23.2=pyhd8ed1ab_0
31
+ - parso=0.8.3=pyhd8ed1ab_0
32
+ - pickleshare=0.7.5=py_1003
33
+ - pip=23.3=py310haa95532_0
34
+ - platformdirs=3.11.0=pyhd8ed1ab_0
35
+ - prompt-toolkit=3.0.39=pyha770c72_0
36
+ - prompt_toolkit=3.0.39=hd8ed1ab_0
37
+ - psutil=5.9.0=py310h2bbff1b_0
38
+ - pure_eval=0.2.2=pyhd8ed1ab_0
39
+ - pygments=2.16.1=pyhd8ed1ab_0
40
+ - python=3.10.0=h96c0403_3
41
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
42
+ - python_abi=3.10=2_cp310
43
+ - pyzmq=23.2.1=py310h73ada01_0
44
+ - setuptools=68.0.0=py310haa95532_0
45
+ - six=1.16.0=pyh6c4a22f_0
46
+ - sqlite=3.41.2=h2bbff1b_0
47
+ - stack_data=0.6.2=pyhd8ed1ab_0
48
+ - tk=8.6.12=h2bbff1b_0
49
+ - tornado=6.2=py310he2412df_0
50
+ - traitlets=5.11.2=pyhd8ed1ab_0
51
+ - typing-extensions=4.8.0=hd8ed1ab_0
52
+ - typing_extensions=4.8.0=pyha770c72_0
53
+ - vc=14.2=h21ff451_1
54
+ - vs2015_runtime=14.27.29016=h5e58377_2
55
+ - wcwidth=0.2.8=pyhd8ed1ab_0
56
+ - wheel=0.41.2=py310haa95532_0
57
+ - xz=5.4.2=h8cc25b3_0
58
+ - zeromq=4.3.4=h0e60522_1
59
+ - zipp=3.17.0=pyhd8ed1ab_0
60
+ - zlib=1.2.13=h8cc25b3_0
61
+ - pip:
62
+ - aiofiles==23.2.1
63
+ - aiohttp==3.8.6
64
+ - aiosignal==1.3.1
65
+ - annotated-types==0.6.0
66
+ - antlr4-python3-runtime==4.9.3
67
+ - anyio==3.7.1
68
+ - async-timeout==4.0.3
69
+ - asyncer==0.0.2
70
+ - attrs==23.1.0
71
+ - backoff==2.2.1
72
+ - beautifulsoup4==4.12.2
73
+ - bidict==0.22.1
74
+ - certifi==2023.7.22
75
+ - cffi==1.16.0
76
+ - chainlit==0.7.301
77
+ - chardet==5.2.0
78
+ - charset-normalizer==3.3.0
79
+ - click==8.1.7
80
+ - contourpy==1.1.1
81
+ - cryptography==41.0.4
82
+ - cycler==0.12.1
83
+ - dataclasses-json==0.5.14
84
+ - deprecated==1.2.14
85
+ - effdet==0.4.1
86
+ - emoji==2.8.0
87
+ - fastapi==0.99.1
88
+ - fastapi-socketio==0.0.10
89
+ - filelock==3.12.4
90
+ - filetype==1.2.0
91
+ - flatbuffers==23.5.26
92
+ - fonttools==4.43.1
93
+ - frozenlist==1.4.0
94
+ - fsspec==2023.10.0
95
+ - googleapis-common-protos==1.61.0
96
+ - greenlet==3.0.0
97
+ - grpcio==1.59.0
98
+ - h11==0.14.0
99
+ - httpcore==0.18.0
100
+ - httptools==0.6.1
101
+ - httpx==0.25.0
102
+ - huggingface-hub==0.17.3
103
+ - humanfriendly==10.0
104
+ - idna==3.4
105
+ - importlib-resources==6.1.0
106
+ - iopath==0.1.10
107
+ - jinja2==3.1.2
108
+ - joblib==1.3.2
109
+ - jsonpatch==1.33
110
+ - jsonpointer==2.4
111
+ - kiwisolver==1.4.5
112
+ - langchain==0.0.320
113
+ - langdetect==1.0.9
114
+ - langsmith==0.0.49
115
+ - layoutparser==0.3.4
116
+ - lazify==0.4.0
117
+ - lxml==4.9.3
118
+ - markupsafe==2.1.3
119
+ - marshmallow==3.20.1
120
+ - matplotlib==3.8.0
121
+ - monotonic==1.6
122
+ - mpmath==1.3.0
123
+ - multidict==6.0.4
124
+ - mypy-extensions==1.0.0
125
+ - networkx==3.2
126
+ - nltk==3.8.1
127
+ - nodeenv==1.8.0
128
+ - numpy==1.26.1
129
+ - omegaconf==2.3.0
130
+ - onnx==1.14.1
131
+ - openai==0.28.1
132
+ - opencv-python==4.8.1.78
133
+ - opentelemetry-api==1.20.0
134
+ - opentelemetry-exporter-otlp==1.20.0
135
+ - opentelemetry-exporter-otlp-proto-common==1.20.0
136
+ - opentelemetry-exporter-otlp-proto-grpc==1.20.0
137
+ - opentelemetry-exporter-otlp-proto-http==1.20.0
138
+ - opentelemetry-instrumentation==0.41b0
139
+ - opentelemetry-proto==1.20.0
140
+ - opentelemetry-sdk==1.20.0
141
+ - opentelemetry-semantic-conventions==0.41b0
142
+ - overrides==7.4.0
143
+ - pdf2image==1.16.3
144
+ - pdfminer-six==20221105
145
+ - pdfplumber==0.10.2
146
+ - pillow==10.1.0
147
+ - portalocker==2.8.2
148
+ - prisma==0.10.0
149
+ - protobuf==4.24.4
150
+ - pulsar-client==3.3.0
151
+ - pycocotools==2.0.7
152
+ - pycparser==2.21
153
+ - pydantic==1.10.13
154
+ - pydantic-core==2.10.1
155
+ - pyjwt==2.8.0
156
+ - pymupdf==1.23.5
157
+ - pymupdfb==1.23.5
158
+ - pyparsing==3.1.1
159
+ - pypdfium2==4.22.0
160
+ - pypika==0.48.9
161
+ - pyreadline3==3.4.1
162
+ - pytesseract==0.3.10
163
+ - python-docx==1.0.1
164
+ - python-dotenv==1.0.0
165
+ - python-engineio==4.8.0
166
+ - python-graphql-client==0.4.3
167
+ - python-iso639==2023.6.15
168
+ - python-magic==0.4.27
169
+ - python-multipart==0.0.6
170
+ - python-socketio==5.10.0
171
+ - pytz==2023.3.post1
172
+ - pywin32==306
173
+ - pyyaml==6.0.1
174
+ - rank-bm25==0.2.2
175
+ - rapidfuzz==3.4.0
176
+ - regex==2023.10.3
177
+ - requests==2.31.0
178
+ - safetensors==0.4.0
179
+ - scipy==1.11.3
180
+ - simple-websocket==1.0.0
181
+ - sniffio==1.3.0
182
+ - soupsieve==2.5
183
+ - sqlalchemy==2.0.22
184
+ - sympy==1.12
185
+ - syncer==2.0.3
186
+ - tabulate==0.9.0
187
+ - tenacity==8.2.3
188
+ - tiktoken==0.5.1
189
+ - timm==0.9.8
190
+ - tokenizers==0.14.1
191
+ - tomli==2.0.1
192
+ - tomlkit==0.12.1
193
+ - torch==2.1.0
194
+ - torchvision==0.16.0
195
+ - tqdm==4.66.1
196
+ - transformers==4.34.1
197
+ - typing-inspect==0.9.0
198
+ - tzdata==2023.3
199
+ - unstructured==0.10.25
200
+ - unstructured-inference==0.7.9
201
+ - unstructured-pytesseract==0.3.12
202
+ - uptrace==1.20.2
203
+ - urllib3==2.0.7
204
+ - uvicorn==0.23.2
205
+ - watchfiles==0.20.0
206
+ - websockets==11.0.3
207
+ - wrapt==1.15.0
208
+ - wsproto==1.2.0
209
+ - yarl==1.9.2
210
+ prefix: D:\anaconda3\envs\jine
jine.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.document_loaders import DirectoryLoader
6
+ from langchain.embeddings import OpenAIEmbeddings
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.retrievers.multi_query import MultiQueryRetriever
9
+ from langchain.prompts import PromptTemplate
10
+ from langchain.chains import RetrievalQA
11
+ from dotenv import load_dotenv
12
+
13
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
14
+ from langchain.llms import OpenAI
15
+ from langchain.embeddings import OpenAIEmbeddings
16
+ from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
17
+
18
+ ## Setting up Log configuration
19
+ logging.basicConfig(
20
+ filename='Logs/chatbot.log', # Name of the log file
21
+ level=logging.INFO, # Logging level (you can use logging.DEBUG for more detailed logs)
22
+ format='%(asctime)s - %(levelname)s - %(message)s'
23
+ )
24
+
25
class Jine:
    """Policy question-answering bot built from a Chroma store and an ensemble retriever.

    Typical use is ``Jine(...)`` followed by ``load_model()`` and then any
    number of ``chat(question)`` calls.
    """

    # Spellings of a configuration flag that mean "off".  Flags arrive from
    # os.getenv() as strings, and a plain truthiness test would treat
    # "False" or "0" as enabled.
    _FALSE_FLAGS = frozenset({"", "0", "false", "no", "off", "none"})

    def __init__(self, OPENAI_API_KEY, VECTOR_STORE_DIRECTORY, VECTOR_STORE_CHECK, DATA_DIRECTORY, DEBUG, USE_HYDE=False):
        """Store the configuration; nothing is loaded until ``load_model()``.

        Args:
            OPENAI_API_KEY: Stored on the instance; the methods of this class
                do not pass it on explicitly.
            VECTOR_STORE_DIRECTORY: Directory handed to Chroma as ``persist_directory``.
            VECTOR_STORE_CHECK: Flag (bool or env-style string).  When on, the
                persisted store is opened; when off, a new store is built
                from ``DATA_DIRECTORY``.
            DATA_DIRECTORY: Directory read with ``DirectoryLoader``.
            DEBUG: Accepted for interface compatibility; not used by this class.
            USE_HYDE: Accepted for interface compatibility; not used by this class.
        """
        self.OPENAI_API_KEY = OPENAI_API_KEY
        self.DATA_DIRECTORY = DATA_DIRECTORY
        self.VECTOR_STORE_DIRECTORY = VECTOR_STORE_DIRECTORY
        self.VECTOR_STORE_CHECK = VECTOR_STORE_CHECK
        self.vectorstore = None
        self.bot = None

    @staticmethod
    def _env_flag(value):
        """Interpret an env-style flag: explicit "off" spellings are False, other text is True."""
        if isinstance(value, str):
            return value.strip().lower() not in Jine._FALSE_FLAGS
        return bool(value)

    def create_vectorstore(self):
        """Populate ``self.vectorstore`` by opening the persisted store or building a new one."""
        if self._env_flag(self.VECTOR_STORE_CHECK):
            print("Loading Vectorstore")
            self.load_vectorstore()
        else:
            print("Creating Vectorstore")
            docs = DirectoryLoader(self.DATA_DIRECTORY).load()
            # NOTE(review): whole documents are indexed here, matching the
            # unsplit documents create_ensemble_retriever() gives to BM25.
            # The earlier code also computed 400-character splits but never
            # used them, so that dead work has been removed.  Confirm that
            # indexing whole documents is the intended behaviour.
            self.vectorstore = Chroma.from_documents(
                documents=docs,
                embedding=OpenAIEmbeddings(),
                persist_directory=self.VECTOR_STORE_DIRECTORY,
            )

    def load_vectorstore(self):
        """Open the Chroma store persisted under ``VECTOR_STORE_DIRECTORY``."""
        self.vectorstore = Chroma(
            persist_directory=self.VECTOR_STORE_DIRECTORY,
            embedding_function=OpenAIEmbeddings(),
        )

    def log(self, user_question, chatbot_reply):
        """Write one question/answer exchange to the root logger at INFO level."""
        logging.info("User: %s", user_question)
        logging.info("JIN-e: %s", chatbot_reply)

    def load_model(self):
        """Prepare the vector store, then build the QA chain on top of it."""
        self.create_vectorstore()
        self.create_ensemble_retriever()

    def chat(self, user_question):
        """Run ``user_question`` through the QA chain, log the exchange, return the answer.

        Raises:
            RuntimeError: If the QA chain has not been built yet.
        """
        if self.bot is None:
            # Fail with an actionable message rather than
            # "'NoneType' object is not callable".
            raise RuntimeError("Jine.load_model() must be called before Jine.chat().")
        result = self.bot({"query": user_question})
        response = result["result"]
        self.log(user_question, response)
        return response

    def create_ensemble_retriever(self):
        """Build ``self.bot``: a RetrievalQA chain over a BM25 + vector-store ensemble."""
        template = """
        You are an Expert Policy Advisor.These Below are the Documents that are extracted from the different Policies.Your Job
        is to Provide the Answer to below question based on the text below.
        Here are few instructions for you to follow when answering a question.
        - When you didnt find the relevant answers from below text Just Say "I dont know this,Please contact your HRBP for more details."
        - These are policy Documents, When answering a question Do Not return in response that "This information is At Annex A/B".Provide a Complete response to request.
        - Try to answer the questions in bullet format if possible.
        - Use three sentences maximum to Answer the question in very concise manner

        {context}
        Question: {question}
        Helpful Answer:
        """

        QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
        print("====================="*10)
        print("Loading Documents for Ensemble Retriver")
        print("====================="*10)

        # BM25 works on the raw (unsplit) documents from the data directory.
        docs = DirectoryLoader(self.DATA_DIRECTORY).load()
        bm25_retriever = BM25Retriever.from_documents(docs)
        # Each retriever contributes its two best matches.
        bm25_retriever.k = 2
        ensemble_retriever = EnsembleRetriever(
            retrievers=[
                bm25_retriever,
                self.vectorstore.as_retriever(search_kwargs={"k": 2}),
            ],
            weights=[0.5, 0.5],
        )

        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

        self.bot = RetrievalQA.from_chain_type(
            llm,
            retriever=ensemble_retriever,
            chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
        )
108
+
109
+
110
+
111
if __name__ == "__main__":
    # Command-line chat loop.  Configuration is read from the environment
    # after load_dotenv() has been called.
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
    # "VECTOR_STORE_DIRCTORY" (sic) is the key name this project reads;
    # it must be spelled the same way wherever the variable is set.
    VECTOR_STORE_DIRECTORY = os.getenv("VECTOR_STORE_DIRCTORY")
    VECTOR_STORE_CHECK = os.getenv("VECTOR_STORE_CHECK")

    DEBUG = os.getenv("DEBUG")
    USE_HYDE = os.getenv("USE_HYDE")

    # Initialize Jine and start chatting
    jine = Jine(OPENAI_API_KEY, VECTOR_STORE_DIRECTORY, VECTOR_STORE_CHECK, DATA_DIRECTORY, DEBUG)
    jine.load_model()
    while True:
        try:
            user_question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
            print()
            break
        if user_question.lower() in ["exit", "quit"]:
            break
        if not user_question:
            # Nothing to ask; do not spend an LLM call on an empty line.
            continue
        response = jine.chat(user_question)
        print("JIN-e:", response)
jine_v1.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.document_loaders import DirectoryLoader
6
+ from langchain.embeddings import OpenAIEmbeddings
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.retrievers.multi_query import MultiQueryRetriever
9
+ from langchain.prompts import PromptTemplate
10
+ from langchain.chains import RetrievalQA
11
+ from dotenv import load_dotenv
12
+
13
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
14
+ from langchain.llms import OpenAI
15
+ from langchain.embeddings import OpenAIEmbeddings
16
+ from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
17
+
18
+ ## Setting up Log configuration
19
+ logging.basicConfig(
20
+ filename='Logs/chatbot.log', # Name of the log file
21
+ level=logging.INFO, # Logging level (you can use logging.DEBUG for more detailed logs)
22
+ format='%(asctime)s - %(levelname)s - %(message)s'
23
+ )
24
+
25
+
26
class Jine:
    """Policy question-answering bot with selectable retrievers and optional HyDE embeddings.

    Typical use is ``Jine(...)`` followed by ``load_model()`` and then any
    number of ``chat(question)`` calls.  ``load_model()`` builds the ensemble
    retriever chain; ``multi_query_retriever()`` and
    ``single_query_retriever()`` can be called instead to build ``self.bot``
    on a different retriever.
    """

    # Spellings of a configuration flag that mean "off".  Flags arrive from
    # os.getenv() as strings, and a plain truthiness test would treat
    # "False" or "0" as enabled.
    _FALSE_FLAGS = frozenset({"", "0", "false", "no", "off", "none"})

    # Prompt shared by every chain this class builds.
    _QA_TEMPLATE = (
        "Use the following pieces of context to answer the question at the end.\n"
        "If you don't know the answer, just say that \"i am unable to answer your query, "
        "for more information contact your HRBP\", don't try to make up an answer.\n"
        "Use three sentences maximum and keep the answer as concise as possible.\n"
        "{context}\n"
        "Question: {question}\n"
        "Helpful Answer:"
    )

    def __init__(self, OPENAI_API_KEY, VECTOR_STORE_DIRECTORY, VECTOR_STORE_CHECK, DATA_DIRECTORY, DEBUG, USE_HYDE=False):
        """Store the configuration; nothing is loaded until ``load_model()``.

        Args:
            OPENAI_API_KEY: Stored on the instance; the methods of this class
                do not pass it on explicitly.
            VECTOR_STORE_DIRECTORY: Directory handed to Chroma as ``persist_directory``.
            VECTOR_STORE_CHECK: Flag (bool or env-style string).  When on, the
                persisted store is opened; when off, a new store is built
                from ``DATA_DIRECTORY``.
            DATA_DIRECTORY: Directory read with ``DirectoryLoader``.
            DEBUG: Stored on the instance; not otherwise used by this class.
            USE_HYDE: Flag (bool or env-style string) selecting
                ``HypotheticalDocumentEmbedder`` over plain ``OpenAIEmbeddings``.
        """
        self.OPENAI_API_KEY = OPENAI_API_KEY
        self.DATA_DIRECTORY = DATA_DIRECTORY
        self.VECTOR_STORE_DIRECTORY = VECTOR_STORE_DIRECTORY
        self.VECTOR_STORE_CHECK = VECTOR_STORE_CHECK
        self.DEBUG = DEBUG
        self.vectorstore = None
        self.bot = None
        self.USE_HYDE = USE_HYDE

    @staticmethod
    def _env_flag(value):
        """Interpret an env-style flag: explicit "off" spellings are False, other text is True."""
        if isinstance(value, str):
            return value.strip().lower() not in Jine._FALSE_FLAGS
        return bool(value)

    def _embeddings(self):
        """Return the embedding object selected by ``USE_HYDE``."""
        base_embeddings = OpenAIEmbeddings()
        if not self._env_flag(self.USE_HYDE):
            return base_embeddings
        llm = OpenAI()
        return HypotheticalDocumentEmbedder.from_llm(llm, base_embeddings, "web_search")

    def _build_bot(self, retriever, model_name="gpt-3.5-turbo"):
        """Set ``self.bot`` to a RetrievalQA chain over ``retriever`` using the shared prompt."""
        prompt = PromptTemplate.from_template(self._QA_TEMPLATE)
        llm = ChatOpenAI(model_name=model_name, temperature=0)
        self.bot = RetrievalQA.from_chain_type(
            llm,
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt},
        )

    def create_vectorstore(self):
        """Populate ``self.vectorstore`` by opening the persisted store or building a new one."""
        if self._env_flag(self.VECTOR_STORE_CHECK):
            print("Loading Vectorstore")
            self.load_vectorstore()
            print('im running')
        else:
            print("Creating Vectorstore")
            docs = DirectoryLoader(self.DATA_DIRECTORY).load()
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=10)
            all_splits = text_splitter.split_documents(docs)
            self.vectorstore = Chroma.from_documents(
                documents=all_splits,
                embedding=self._embeddings(),
                persist_directory=self.VECTOR_STORE_DIRECTORY,
            )

    def multi_query_retriever(self):
        """Build ``self.bot`` on a ``MultiQueryRetriever`` wrapped around the vector store."""
        retriever_from_llm = MultiQueryRetriever.from_llm(
            retriever=self.vectorstore.as_retriever(),
            llm=ChatOpenAI(temperature=0),
        )
        self._build_bot(retriever_from_llm)

    def single_query_retriever(self):
        """Build ``self.bot`` directly on the vector store's default retriever."""
        self._build_bot(self.vectorstore.as_retriever())

    def load_vectorstore(self):
        """Open the Chroma store persisted under ``VECTOR_STORE_DIRECTORY``."""
        if self._env_flag(self.USE_HYDE):
            print("Using HYDE embeddings vectorstore")
        else:
            print("Using Simple embeddings vectorstore")
        self.vectorstore = Chroma(
            persist_directory=self.VECTOR_STORE_DIRECTORY,
            embedding_function=self._embeddings(),
        )

    def log(self, user_question, chatbot_reply):
        """Write one question/answer exchange to the root logger at INFO level."""
        logging.info("User: %s", user_question)
        logging.info("JIN-e: %s", chatbot_reply)

    def load_model(self):
        """Prepare the vector store, then build the ensemble-retriever QA chain."""
        self.create_vectorstore()
        self.create_ensemble_retriever()

    def chat(self, user_question):
        """Run ``user_question`` through the QA chain, log the exchange, return the answer.

        Raises:
            RuntimeError: If no QA chain has been built yet.
        """
        if self.bot is None:
            # Fail with an actionable message rather than
            # "'NoneType' object is not callable".
            raise RuntimeError("Jine.load_model() must be called before Jine.chat().")
        result = self.bot({"query": user_question})
        response = result["result"]
        self.log(user_question, response)
        return response

    def create_ensemble_retriever(self):
        """Build ``self.bot`` on an equal-weight BM25 + vector-store ensemble."""
        print("====================="*10)
        print("Loading Documents for Ensemble Retriver")
        print("====================="*10)

        docs = DirectoryLoader(self.DATA_DIRECTORY).load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=10)
        all_splits = text_splitter.split_documents(docs)

        bm25_retriever = BM25Retriever.from_documents(all_splits)
        # Each retriever contributes its two best matches.
        bm25_retriever.k = 2

        ensemble_retriever = EnsembleRetriever(
            retrievers=[
                bm25_retriever,
                self.vectorstore.as_retriever(search_kwargs={"k": 2}),
            ],
            weights=[0.5, 0.5],
        )

        self._build_bot(ensemble_retriever, model_name="gpt-3.5-turbo-16k")
160
+
161
+
162
+
163
if __name__ == "__main__":
    # Command-line chat loop.  Configuration is read from the environment
    # after load_dotenv() has been called.
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
    # "VECTOR_STORE_DIRCTORY" (sic) is the key name this project reads;
    # it must be spelled the same way wherever the variable is set.
    VECTOR_STORE_DIRECTORY = os.getenv("VECTOR_STORE_DIRCTORY")
    VECTOR_STORE_CHECK = os.getenv("VECTOR_STORE_CHECK")

    DEBUG = os.getenv("DEBUG")
    # Convert the flag to a real bool here: os.getenv() returns a string and
    # a non-empty string such as "False" would otherwise count as enabled.
    USE_HYDE = (os.getenv("USE_HYDE") or "").strip().lower() in {"1", "true", "yes", "on"}

    # Initialize Jine and start chatting.  USE_HYDE is now passed on;
    # before, it was read from the environment and then dropped.
    jine = Jine(OPENAI_API_KEY, VECTOR_STORE_DIRECTORY, VECTOR_STORE_CHECK, DATA_DIRECTORY, DEBUG, USE_HYDE)
    jine.load_model()
    while True:
        try:
            user_question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
            print()
            break
        if user_question.lower() in ["exit", "quit"]:
            break
        if not user_question:
            # Nothing to ask; do not spend an LLM call on an empty line.
            continue
        response = jine.chat(user_question)
        print("JIN-e:", response)
requirements ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ langchain
2
+ chromadb
3
+ "unstructured[all-docs]"
4
+ openai
5
+ fastapi
6
+ uvicorn
requirements.txt ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file may be used to create an environment using:
2
+ # $ conda create --name <env> --file <this file>
3
+ # platform: win-64
4
+ aiohttp==3.8.4
5
+ aiosignal==1.3.1
6
+ altair==4.2.2
7
+ anyio==3.6.2
8
+ argilla==1.5.0
9
+ asttokens==2.0.5
10
+ async-timeout==4.0.2
11
+ attrs==22.2.0
12
+ backcall==0.2.0
13
+ backoff==2.2.1
14
+ beautifulsoup4==4.12.0
15
+ bertopic==0.13.0
16
+ blinker==1.6.2
17
+ blis==0.7.9
18
+ bs4==0.0.1
19
+ cachetools==5.3.0
20
+ catalogue==2.0.8
21
+ certifi==2022.12.7
22
+ cffi==1.15.1
23
+ charset-normalizer==2.1.1
24
+ chromadb==0.3.11
25
+ click==8.1.3
26
+ clickhouse-connect==0.5.16
27
+ colorama==0.4.6
28
+ commonmark==0.9.1
29
+ confection==0.0.3
30
+ cryptography==40.0.0
31
+ cymem==2.0.7
32
+ cython==0.29.32
33
+ dataclasses-json==0.5.7
34
+ debugpy==1.5.1
35
+ decorator==5.1.1
36
+ deprecated==1.2.13
37
+ docx2txt==0.8
38
+ duckdb==0.7.1
39
+ entrypoints==0.4
40
+ et-xmlfile==1.1.0
41
+ executing==0.8.3
42
+ faker==17.6.0
43
+ fastapi==0.95.0
44
+ filelock==3.9.0
45
+ flask==2.3.3
46
+ flask-sqlalchemy==3.0.5
47
+ flatbuffers==23.5.26
48
+ frozenlist==1.3.3
49
+ fst-pso==1.8.1
50
+ funcy==1.17
51
+ future==0.18.2
52
+ fuzzytm==2.0.5
53
+ gensim==4.3.0
54
+ gitdb==4.0.10
55
+ gitpython==3.1.31
56
+ google-search-results==2.4.2
57
+ greenlet==2.0.1
58
+ h11==0.14.0
59
+ hdbscan==0.8.29
60
+ hnswlib==0.7.0
61
+ httpcore==0.16.3
62
+ httptools==0.5.0
63
+ httpx==0.23.3
64
+ huggingface-hub==0.11.1
65
+ humanfriendly==10.0
66
+ idna==3.4
67
+ importlib-metadata==6.1.0
68
+ importlib-resources==6.0.1
69
+ ipykernel==6.15.2
70
+ ipython==8.7.0
71
+ itsdangerous==2.1.2
72
+ jedi==0.18.1
73
+ jinja2==3.1.2
74
+ joblib==1.2.0
75
+ jsonschema==4.17.3
76
+ jupyter_client==7.4.8
77
+ jupyter_core==5.1.1
78
+ langchain==0.0.284
79
+ langcodes==3.3.0
80
+ langsmith==0.0.33
81
+ llama-index==0.5.5
82
+ llvmlite==0.39.1
83
+ lxml==4.9.2
84
+ lz4==4.3.2
85
+ markdown==3.4.3
86
+ markdown-it-py==2.2.0
87
+ markupsafe==2.1.1
88
+ marshmallow==3.19.0
89
+ marshmallow-enum==1.5.1
90
+ matplotlib-inline==0.1.6
91
+ mdurl==0.1.2
92
+ miniful==0.0.6
93
+ monotonic==1.6
94
+ mpmath==1.3.0
95
+ multidict==6.0.4
96
+ murmurhash==1.0.9
97
+ mypy-extensions==1.0.0
98
+ nest-asyncio==1.5.6
99
+ nltk==3.8.1
100
+ numba==0.56.4
101
+ numexpr==2.8.4
102
+ numpy==1.23.5
103
+ openai==0.27.2
104
+ opencv-python==4.7.0.72
105
+ openpyxl==3.1.1
106
+ overrides==7.4.0
107
+ packaging==22.0
108
+ pandas==1.5.2
109
+ parso==0.8.3
110
+ pathy==0.10.1
111
+ pdfminer-six==20221105
112
+ pickleshare==0.7.5
113
+ pillow==9.4.0
114
+ pip==22.3.1
115
+ platformdirs==2.5.2
116
+ plotly==5.11.0
117
+ preshed==3.0.8
118
+ prompt-toolkit==3.0.36
119
+ protobuf==3.20.3
120
+ psutil==5.9.0
121
+ pulsar-client==3.3.0
122
+ pure_eval==0.2.2
123
+ pyarrow==11.0.0
124
+ pycparser==2.21
125
+ pydantic==1.10.4
126
+ pydeck==0.8.0
127
+ pyfume==0.2.25
128
+ pygments==2.14.0
129
+ pyldavis==3.3.1
130
+ pympler==1.0.1
131
+ pynndescent==0.5.8
132
+ pyodbc==4.0.35
133
+ pypandoc==1.11
134
+ pypdf2==3.0.1
135
+ pypika==0.48.9
136
+ pypyodbc==1.3.6
137
+ pyreadline3==3.4.1
138
+ pyrsistent==0.19.3
139
+ python-dateutil==2.8.2
140
+ python-docx==0.8.11
141
+ python-dotenv==1.0.0
142
+ python-magic==0.4.27
143
+ python-pptx==0.6.21
144
+ pytz==2022.7
145
+ pytz-deprecation-shim==0.1.0.post0
146
+ pywin32==305
147
+ pyyaml==6.0
148
+ pyzmq==23.2.0
149
+ regex==2022.10.31
150
+ requests==2.28.1
151
+ rfc3986==1.5.0
152
+ rich==13.0.1
153
+ scikit-learn==1.2.0
154
+ scipy==1.10.0
155
+ semver==2.13.0
156
+ sentence-transformers==2.2.2
157
+ sentencepiece==0.1.97
158
+ setuptools==65.5.0
159
+ simpful==2.9.0
160
+ six==1.16.0
161
+ sklearn==0.0.post1
162
+ smart-open==6.3.0
163
+ smmap==5.0.0
164
+ sniffio==1.3.0
165
+ soupsieve==2.4
166
+ spacy==3.4.4
167
+ spacy-legacy==3.0.11
168
+ spacy-loggers==1.0.4
169
+ sqlalchemy==2.0.20
170
+ sqlite==3.40.0
171
+ srsly==2.4.5
172
+ stack_data==0.2.0
173
+ starlette==0.26.1
174
+ streamlit==1.20.0
175
+ streamlit-chat==0.0.2.2
176
+ sympy==1.12
177
+ tenacity==8.2.2
178
+ thinc==8.1.6
179
+ threadpoolctl==3.1.0
180
+ tiktoken==0.3.2
181
+ tk==8.6.12
182
+ tokenizers==0.13.2
183
+ toml==0.10.2
184
+ toolz==0.12.0
185
+ torch==1.13.1
186
+ torchvision==0.14.1
187
+ tornado==6.2
188
+ tqdm==4.66.1
189
+ traitlets==5.7.1
190
+ transformers==4.25.1
191
+ typer==0.7.0
192
+ typing-extensions==4.7.1
193
+ typing-inspect==0.8.0
194
+ tzdata==2023.3
195
+ tzlocal==4.3
196
+ umap-learn==0.5.3
197
+ unstructured==0.5.7
198
+ urllib3==1.26.13
199
+ uvicorn==0.21.1
200
+ validators==0.20.0
201
+ vc==14.2
202
+ vs2015_runtime==14.27.29016
203
+ wasabi==0.10.1
204
+ watchdog==3.0.0
205
+ watchfiles==0.18.1
206
+ wcwidth==0.2.5
207
+ websockets==10.4
208
+ werkzeug==2.3.7
209
+ wheel==0.37.1
210
+ wincertstore==0.2
211
+ wrapt==1.14.1
212
+ xlsxwriter==3.0.9
213
+ xz==5.2.8
214
+ yarl==1.8.2
215
+ zeromq==4.3.4
216
+ zipp==3.15.0
217
+ zlib==1.2.13
218
+ zstandard==0.20.0
streamlit_interface.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_chat import message
3
+ import os
4
+
5
+ ##### Importing JIN-e
6
+ from jine import Jine
7
+ from dotenv import load_dotenv
8
+ import os
9
+
10
+
11
+ load_dotenv()
12
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
13
+ DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
14
+ VECTOR_STORE_DIRECTORY = os.getenv("VECTOR_STORE_DIRCTORY")
15
+ VECTOR_STORE_CHECK = os.getenv("VECTOR_STORE_CHECK")
16
+ DEBUG = os.getenv("DEBUG")
17
+ USE_HYDE = os.getenv("USE_HYDE")
18
+
19
+ # Initialize Jine
20
+
21
+
22
+ @st.cache_resource()
23
+ def load_model():
24
+ jine = Jine(OPENAI_API_KEY, VECTOR_STORE_DIRECTORY, VECTOR_STORE_CHECK, DATA_DIRECTORY, DEBUG,USE_HYDE)
25
+ jine.load_model()
26
+ return jine
27
+
28
+ jine =load_model()
29
+
30
+ import streamlit as st
31
+ from streamlit_chat import message
32
+
33
+ # st.set_page_config(
34
+ # page_title="JIN-e",
35
+ # page_icon=":robot:"
36
+ # )
37
+ # #
38
+
39
+ st.header("JIN-e")
40
+ st.markdown("Powered by People Analytics")
41
+
42
+ if 'generated' not in st.session_state:
43
+ st.session_state['generated'] = []
44
+
45
+ if 'past' not in st.session_state:
46
+ st.session_state['past'] = []
47
+
48
+ # def query(payload):
49
+ # response = requests.post(API_URL, headers=headers, json=payload)
50
+ # return response.json()
51
+
52
+ def get_text():
53
+ input_text = st.text_input("You: ","Hello, how are you?", key="input")
54
+ return input_text
55
+
56
+
57
+ user_input = get_text()
58
+
59
+ if user_input:
60
+
61
+ response = jine.chat(user_input)
62
+
63
+ st.session_state.past.append(user_input)
64
+ st.session_state.generated.append(response)
65
+
66
+ if st.session_state['generated']:
67
+
68
+ for i in range(len(st.session_state['generated'])-1, -1, -1):
69
+ message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
70
+ message(st.session_state["generated"][i], key=str(i))
71
+
72
+
73
+