Spaces:
Sleeping
Sleeping
cga-telice
commited on
Commit
•
2ca8127
1
Parent(s):
d22e0f0
Upload 7 files
Browse files- .gitignore +160 -0
- Dockerfile +11 -0
- README.md +61 -9
- app.py +132 -0
- chainlit.md +9 -0
- requirements.txt +11 -0
.gitignore
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image for running the SafeMate Chainlit app on a Hugging Face Docker Space.
FROM python:3.9

# Run as a non-root user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Copy only the dependency list first so the pip layer is reused from cache
# when just the application code changes. Docker does not expand "~" in COPY
# destinations, so the path is spelled with $HOME.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install -r requirements.txt

# Copy the application source, owned by the runtime user.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
|
README.md
CHANGED
@@ -1,11 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
-
title: SafeMate
|
3 |
-
emoji: 🦀
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: gray
|
6 |
-
sdk: docker
|
7 |
-
pinned: false
|
8 |
-
license: openrail
|
9 |
-
---
|
10 |
|
11 |
-
|
|
|
|
1 |
+
# SafeMate
|
2 |
+
|
3 |
+
Welcome to **SafeMate**, an occupational health and safety assistant designed to serve construction field workers. This project is currently in the proof of concept stage and aims to provide immediate, accessible safety guidelines and answers derived from authoritative sources. The core of this assistant is built upon a chat interface that draws its responses from the safety manual provided by the Port of Seattle, offering a practical and user-friendly approach to workplace safety.
|
4 |
+
|
5 |
+
## Project Overview
|
6 |
+
|
7 |
+
SafeMate is crafted to transform the way construction workers interact with safety manuals, making it easier than ever to get answers to crucial safety questions on the job. By leveraging state-of-the-art machine learning models and natural language processing techniques, SafeMate delivers relevant safety information through a simple chat interface.
|
8 |
+
|
9 |
+
### Current State of the Project
|
10 |
+
|
11 |
+
This project is in its early stages, and the current implementation serves as a proof of concept. The backbone of SafeMate is a chat interface that allows users to query a safety manual, specifically the one provided by the Port of Seattle, which is publicly available online.
|
12 |
+
|
13 |
+
### Project Structure
|
14 |
+
|
15 |
+
The project is organized as follows:
|
16 |
+
|
17 |
+
```
D:.
├───.chainlit
│   └───translations
├───.files
├───app
├───data
│   ├───processed
│   ├───raw
│   └───results
├───models
├───notebooks
│   └───.ipynb_checkpoints
├───scripts
├───utils
└───__pycache__
```
|
32 |
+
|
33 |
+
Note: Some folders have been created to accommodate future developments and are not in use at the moment.
|
34 |
+
|
35 |
+
### Achievements So Far
|
36 |
+
|
37 |
+
1. **Synthetic QA Development**: Utilization of the RAGAS library to assess embeddings and models for effective question-answering.
|
38 |
+
2. **Data Processing**: Transformation of the raw PDF manual into a structured format, with embeddings stored in a vector store for quick retrieval.
|
39 |
+
3. **User Interface**: Implementation of Chainlit as the user interface, facilitating a seamless interaction where users can prompt questions, and the system retrieves context to provide informed responses.
|
40 |
+
4. **Accessibility**: The project's code is hosted on GitHub, with a live demo available on a Hugging Face space, demonstrating the capabilities and potential of SafeMate.
|
41 |
+
|
42 |
+
## Future Directions
|
43 |
+
|
44 |
+
As SafeMate evolves, we plan to expand the knowledge base beyond the Port of Seattle's safety manual, incorporate feedback mechanisms for continuous improvement, and refine our models for better accuracy and user experience.
|
45 |
+
|
46 |
+
## Contact
|
47 |
+
|
48 |
+
Cesáreo González Alvarez
|
49 |
+
GitHub: cga-telice
|
50 |
+
LinkedIn: [caesaripse](https://www.linkedin.com/in/caesaripse/)
|
51 |
+
|
52 |
+
## License
|
53 |
+
|
54 |
+
SafeMate is released under the [MIT License](LICENSE).
|
55 |
+
|
56 |
+
## Acknowledgments
|
57 |
+
|
58 |
+
A special thank you to the Port of Seattle for making their safety manual publicly accessible and serving as the foundation for this project.
|
59 |
+
|
60 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
+
For more information, updates, and to try out the SafeMate proof of concept, please visit our [GitHub repository](link-to-repo) and [Hugging Face demo space](link-to-demo).
|
63 |
+
|
app.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# You can find this code for Chainlit python streaming here (https://docs.chainlit.io/concepts/streaming/python)
|
2 |
+
|
3 |
+
# OpenAI Chat completion
|
4 |
+
import os
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from getpass import getpass
|
7 |
+
from operator import itemgetter
|
8 |
+
|
9 |
+
import openai
|
10 |
+
from openai import AsyncOpenAI # importing openai for API usage
|
11 |
+
|
12 |
+
import chainlit as cl # importing chainlit for our app
|
13 |
+
from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
|
14 |
+
from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
|
15 |
+
|
16 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
17 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
18 |
+
from langchain_openai import OpenAIEmbeddings
|
19 |
+
from langchain_community.vectorstores import FAISS
|
20 |
+
from langchain import hub
|
21 |
+
from langchain.prompts import ChatPromptTemplate
|
22 |
+
import faiss
|
23 |
+
|
24 |
+
# Working directory
# Resolve the directory containing this script and make it the current working
# directory, so the relative data paths below resolve regardless of where the
# app is launched from.
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# Set the environment variables
# load_dotenv() reads a local .env file (if present) into the process
# environment before the API key is looked up.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the document vector store
file_name = "Seattle"
path_raw = "data//raw//" + file_name + ".pdf"  # The path where the raw documents are stored
path_processed = "data//processed//" + file_name + ".faiss"  # The path where the index is stored

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

# FAISS.load_local is a classmethod, so the saved index is loaded directly.
# Building a placeholder index with FAISS.from_texts([""], ...) first would
# only spend an embeddings request at every startup on data that is thrown away.
# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# serialized data from disk; only point this at index files produced by this
# project.
vector_store = FAISS.load_local(
    path_processed, embeddings, allow_dangerous_deserialization=True
)
vs = vector_store  # both module-level names refer to the loaded index

# Build the retriever
retriever = vs.as_retriever()

# Configure the Prompt templates
# The system prompt fixes the assistant's role; the user prompt is filled with
# the retrieved context and the user's question for each message.
system_template = """You are a helpful but prudent occupational health and
safety assistant. Your anwswers will be grounded on the context.
If you don´t know an answer you will say that you don´t know.
"""

user_template = """
# Context:
{context}

# Question:
{question}
"""
|
68 |
+
|
69 |
+
@cl.on_chat_start  # marks a function that will be executed at the start of a user session
async def start_chat():
    """Store the OpenAI chat-completion settings in the new user session.

    The settings are saved under the ``"settings"`` session key.
    """
    cl.user_session.set(
        "settings",
        dict(
            model="gpt-3.5-turbo",
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        ),
    )
|
81 |
+
|
82 |
+
|
83 |
+
@cl.on_message  # marks a function that should be run each time the chatbot receives a message from a user
async def main(message: cl.Message):
    """Answer a user message using passages retrieved from the vector store.

    Retrieves documents related to the message through the module-level
    ``retriever``, fills the system/user prompt templates with them, streams
    the OpenAI chat completion back to the UI token by token, and attaches
    the prompt (with its completion) to the message that is sent.

    Args:
        message: The incoming Chainlit message. Its ``content`` is used both
            as the retrieval query and as the question in the prompt.
    """
    settings = cl.user_session.get("settings")  # gets the settings of the started session

    client = AsyncOpenAI()

    # Use the retriever's async entry point so the embedding request made for
    # the query does not block the event loop shared by all chat sessions.
    document_list = await retriever.ainvoke(message.content)
    context = " ".join(doc.page_content for doc in document_list)

    print(message.content)

    print(context)

    prompt = Prompt(
        provider=ChatOpenAI.id,
        messages=[
            PromptMessage(
                role="system",
                template=system_template,
                formatted=system_template,
            ),
            PromptMessage(
                role="user",
                template=user_template,
                formatted=user_template.format(question=message.content, context=context),
            ),
        ],
        inputs={"question": message.content, "context": context},
        settings=settings,
    )

    # Build the OpenAI-format message list once; it is logged and then sent.
    openai_messages = [m.to_openai() for m in prompt.messages]
    print(openai_messages)

    msg = cl.Message(content="")

    # Call OpenAI
    stream = await client.chat.completions.create(
        messages=openai_messages, stream=True, **settings
    )
    async for stream_resp in stream:
        # Skip chunks that carry no choices instead of failing on choices[0].
        if not stream_resp.choices:
            continue
        # delta.content may be None; stream an empty string in that case.
        token = stream_resp.choices[0].delta.content or ""
        await msg.stream_token(token)

    # Update the prompt object with the completion
    prompt.completion = msg.content
    msg.prompt = prompt

    # Send and close the message stream
    await msg.send()
|
chainlit.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Welcome to SafeMate! 🚀🤖
|
2 |
+
|
3 |
+
Hi there! 👋 We're excited to have you on board.
|
4 |
+
|
5 |
+
**SafeMate** is a proof-of-concept app designed to assist construction field workers with occupational health and safety documentation.
|
6 |
+
|
7 |
+
At this point, the app lets you chat with a single document: the [Construction Safety Manual from the Port of Seattle](https://www.portseattle.org/sites/default/files/2018-03/Construction_Safety_Manual.pdf).
|
8 |
+
|
9 |
+
**Go ahead and chat with the Safety Manual!**
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
chainlit==0.7.700
|
2 |
+
cohere==4.37
|
3 |
+
openai
|
4 |
+
tiktoken==0.5.2
|
5 |
+
python-dotenv==1.0.0
|
6 |
+
langchain
|
7 |
+
langchain-core
|
8 |
+
langchain-community
|
9 |
+
langchain-openai
|
10 |
+
langchainhub
|
11 |
+
faiss_cpu
|