axionable committed
Commit
03d828b
1 Parent(s): d33391f

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf filter=lfs diff=lfs merge=lfs -text
+PDF/deu-2023.pdf filter=lfs diff=lfs merge=lfs -text
+PDF/memo_risques_physiques_focus_batiment_2022.pdf filter=lfs diff=lfs merge=lfs -text
+vectors/index.annoy filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,236 @@
+setAPIKEY.sh
+
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Conversation history with the chatbot
+*.json
+
+# Icon must end with two \r
+Icon
+
+# Files for RAG
+sources/*
+categories.csv
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
PDF/.gitignore ADDED
@@ -0,0 +1,2 @@
+*
+!.gitignore
PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be9d2d29a6545fc1949b10eb8428e6fac632aa84020fa61f4f76600817a21cd5
+size 2079496
PDF/deu-2023.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09ea20da6494b2de2ae4d1f45dd309ee72700acf676a3d5dfdbf4f2cec8408bb
+size 9714830
PDF/memo_risques_physiques_focus_batiment_2022.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c3f8c224d1e3d269e7688b1a49cff025f24a67bfa156306ce94ed5d3ede0720
+size 5330523
app.py ADDED
@@ -0,0 +1,336 @@
+
+
+# , get_pinecone_vectorstore, find_similar_vectors
+from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP, get_categories_files
+from climateqa.engine.text_retriever import ClimateQARetriever
+from climateqa.engine.rag import make_rag_chain
+from climateqa.engine.llm import get_llm
+from utils import create_user_id
+from datetime import datetime
+import json
+import re
+import gradio as gr
+from sentence_transformers import CrossEncoder
+
+reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
+
+# Load environment variables in local mode
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except Exception as e:
+    pass
+
+# Set up Gradio Theme
+theme = gr.themes.Soft(
+    primary_hue="yellow",
+    secondary_hue="orange",
+    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
+          "system-ui", "sans-serif"],
+)
+
+
+init_prompt = ""
+
+system_template = {
+    "role": "system",
+    "content": init_prompt,
+}
+
+user_id = create_user_id()
+
+list_categorie = get_categories_files()
+categories = list_categorie["AllCat"]
+
+def parse_output_llm_with_sources(output):
+    # Split the content into a list of text and "[Doc X]" references
+    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
+    parts = []
+    for part in content_parts:
+        if part.startswith("Doc"):
+            subparts = part.split(",")
+            subparts = [subpart.lower().replace("doc", "").strip()
+                        for subpart in subparts]
+            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup style="color:#FFC000 !important;">({subpart})</sup></span></a>""" for subpart in subparts]
+            parts.append("".join(subparts))
+        else:
+            parts.append(part)
+    content_parts = "".join(parts)
+    return content_parts
+
+
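+# Illustrative example (assumed, not part of the commit): for the model output
+# "Les risques augmentent [Doc 1, Doc 2]", parse_output_llm_with_sources keeps
+# the text and turns the bracketed reference into two superscript
+# <a href="#doc1">/<a href="#doc2"> links pointing at the source cards below.
+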
+def serialize_docs(docs):
+    new_docs = []
+    for doc in docs:
+        new_doc = {}
+        new_doc["page_content"] = doc.page_content
+        new_doc["metadata"] = doc.metadata
+        new_docs.append(new_doc)
+    return new_docs
+
+
+# Create vectorstore and retriever
+vectorstore = build_vectores_stores("./sources")
+llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
+
+
+async def chat(query, history, categories, src_nb_max, src_pertinence):
+    """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
+    (messages in gradio format, messages in langchain format, source documents)"""
+
+    print(f">> NEW QUESTION : {query} -> sources max:{src_nb_max} - pertinence: {src_pertinence}")
+
+    filter = None
+    if len(categories):
+        filter = {"$or": []}
+        for cat in categories:
+            for fich in list_categorie[cat]:
+                filter["$or"].append({"ax_name": fich})
+
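+    # Illustrative shape (assumed): if the selected categories map to the
+    # single file "deu-2023.pdf", this builds
+    # filter = {"$or": [{"ax_name": "deu-2023.pdf"}]}, a metadata filter that
+    # restricts vector search to the chosen documents.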
+    print(">> Filter :" + str(filter))
+    print(">> nb sources :" + str(src_nb_max))
+    print(">> pertinence :" + str(src_pertinence))
+
+    retriever = ClimateQARetriever(
+        vectorstore=vectorstore, sources=["Custom"], reports=[],
+        threshold=src_pertinence, k_total=src_nb_max, filter=filter
+    )
+    rag_chain = make_rag_chain(retriever, llm)
+
+    inputs = {"query": query, "audience": None}
+    result = rag_chain.astream_log(inputs)
+
+    path_reformulation = "/logs/reformulation/final_output"
+    path_keywords = "/logs/keywords/final_output"
+    path_retriever = "/logs/find_documents/final_output"
+    path_answer = "/logs/answer/streamed_output_str/-"
+
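+    # These JSONPatch paths correspond to the run names given to each
+    # sub-chain in climateqa.engine (prepare_chain / rename_chain);
+    # astream_log streams ops whose "path" identifies which step produced a value.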
+    docs_html = ""
+    output_query = ""
+    output_language = ""
+    output_keywords = ""
+    gallery = []
+
+    try:
+        async for op in result:
+
+            op = op.ops[0]
+
+            if op['path'] == path_reformulation:  # reformulated question
+                try:
+                    output_language = op['value']["language"]  # str
+                    output_query = op["value"]["question"]
+                except Exception as e:
+                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
+
+            if op["path"] == path_keywords:
+                try:
+                    output_keywords = op['value']["keywords"]  # str
+                    output_keywords = " AND ".join(output_keywords)
+                except Exception as e:
+                    pass
+
+            elif op['path'] == path_retriever:  # documents
+                try:
+                    docs = op['value']['docs']  # List[Document]
+                    docs_html = []
+                    for i, d in enumerate(docs, 1):
+                        docs_html.append(make_html_source(d, i))
+                    docs_html = "".join(docs_html)
+                except TypeError:
+                    print("No documents found")
+                    print("op: ", op)
+                    continue
+
+            elif op['path'] == path_answer:  # final answer
+                new_token = op['value']  # str
+                # time.sleep(0.01)
+                previous_answer = history[-1][1]
+                previous_answer = previous_answer if previous_answer is not None else ""
+                answer_yet = previous_answer + new_token
+                answer_yet = parse_output_llm_with_sources(answer_yet)
+                history[-1] = (query, answer_yet)
+
+            else:
+                continue
+
+            history = [tuple(x) for x in history]
+            yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
+
+    except Exception as e:
+        raise gr.Error(f"{e}")
+
+    timestamp = str(datetime.now().timestamp())
+    log_file = "logs/" + timestamp + ".json"
+    prompt = history[-1][0]
+    logs = {
+        "user_id": str(user_id),
+        "prompt": prompt,
+        "query": prompt,
+        "question": output_query,
+        "sources": ["Custom"],
+        "docs": serialize_docs(docs),
+        "answer": history[-1][1],
+        "time": timestamp,
+    }
+    # log_locally(log_file, logs)
+
+    yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
+
+
+def make_html_source(source, i):
+    # Build the HTML card for one text extract
+    text_content = source.page_content.strip()
+    meta = source.metadata
+    # Source name
+    name = f"<b>Document {i}</b>"
+
+    # HTML content of the card
+    card = f"""
+    <div class="card" id="doc{i}">
+        <div class="card-content">
+            <div>
+                <div style="float:right;width:10%;position:relative;top:0px">
+                    <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
+                </div>
+                <div>
+                    <h2>Extrait {i} (Score:{float(meta['similarity_score'])})</h2>
+                    <h2> {meta['ax_name']} - Page {int(meta['ax_page'])}</h2>
+                </div>
+            </div>
+            <p>{text_content}</p>
+
+        </div>
+        <!-- <div class="card-footer">
+            <span>{name}</span>
+        </div> -->
+    </div>
+    """
+
+    return card
+
+def log_locally(file, logs):
+    # Convert the logs to JSON
+    logs_json = json.dumps(logs)
+
+    # Write the logs to a local file
+    with open(file, 'w') as f:
+        f.write(logs_json)
+
+
+# --------------------------------------------------------------------
+# Gradio
+# --------------------------------------------------------------------
+
+init_prompt = """
+Hello, I am Clara, an AI Assistant created by Axionable. My purpose is to answer your questions using the provided extracted passages, context, and guidelines.
+
+❓ How to interact with Clara
+
+Ask your question: You can ask me anything you want to know. I'll provide an answer based on the extracted passages and other relevant sources.
+Response structure: I aim to provide clear and structured answers using the given data.
+Guidelines: I follow specific guidelines to ensure that my responses are accurate and useful.
+⚠️ Limitations
+Though I do my best to help, there might be times when my responses are incorrect or incomplete. If that happens, please feel free to ask for more information or provide feedback to help improve my performance.
+
+What would you like to know today?
+"""
+
+
+with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:
+
+    gr.HTML("""
+    <img style="width:100px" src="file/assets/axionable.svg"/>
+    """, elem_classes="logo-axio ")
+
+    # TAB Clara
+    with gr.Tab("CLARA"):
+
+        with gr.Row(elem_id="chatbot-row"):
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    value=[(None, init_prompt)],
+                    show_copy_button=True, show_label=False, elem_id="chatbot", layout="panel",
+                    avatar_images=(None, "assets/logo4.png"))
+
+                with gr.Row(elem_id="input-message"):
+                    textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
+                                         scale=7, lines=1, interactive=True, elem_id="input-textbox")
+
+
+            with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
+
+                # with gr.Column(scale=1, elem_id="tab-citations"):
+
+                #     gr.HTML("<p>Sources</p>")
+
+                #     slider = gr.Slider(1, 10, value=src_nb_max, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
+                #     slider_p = gr.Slider(0.0, 1.0, value=src_pertinence, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
+
+                #     sources_textbox = gr.HTML(
+                #         show_label=False, elem_id="sources-textbox")
+                #     docs_textbox = gr.State("")
+
+
+
+                # The tabs object is currently required: it seems to be used
+                # to freeze the tab contents while the AI generates an answer.
+                with gr.Tabs() as tabs:
+                    # None
+
+                    with gr.Tab("sources"):
+                        sources_textbox = gr.HTML(
+                            show_label=False, elem_id="sources-textbox")
+                        docs_textbox = gr.State("")
+
+                    with gr.Tab("filtres"):
+
+                        cat_sel = gr.CheckboxGroup(categories, label="Catégories")
+
+                        slider = gr.Slider(1, 10, value=7, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
+                        slider_p = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
+
+    # TAB A propos
+    with gr.Tab("À propos", elem_classes="max-height other-tabs"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown(
+                    ("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
+                     "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")
+
+
+    # # TAB Configuration
+    # with gr.Tab("Configuration"):
+    #
+    #     with gr.Row(elem_id="config-row"):
+    #         with gr.Column(scale=1):
+    #
+    #             for pdfName in get_PDF_Names_from_GCP():
+    #                 gr.Markdown( pdfName, elem_classes="a-propos")
+
+    def start_chat(query, history):
+
+        history = history + [(query, None)]
+        history = [tuple(x) for x in history]
+        return (gr.update(interactive=False), gr.update(selected=1), history)
+
+    def finish_chat():
+        return (gr.update(interactive=True, value=""))
+
+    (textbox
+     .submit(start_chat, [textbox, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
+     .then(chat, [textbox, chatbot, cat_sel, slider, slider_p], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
+     .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
+     )
+
+
+
+demo.queue()
+
+
+demo.launch(allowed_paths=["assets/download.png",
+                           "assets/logo4.png",
+                           "assets/axionable.svg"], favicon_path="assets/logo4.png")
assets/Logo.png ADDED
assets/axionable.svg ADDED
assets/download.png ADDED
assets/logo4.png ADDED
climateqa/__init__.py ADDED
File without changes
climateqa/engine/__init__.py ADDED
File without changes
climateqa/engine/embeddings.py ADDED
@@ -0,0 +1,29 @@
+
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
+
+def get_embeddings_function(version="v1.2"):
+
+    if version == "v1.2":
+
+        # https://huggingface.co/BAAI/bge-base-en-v1.5
+        # Best embedding model at a reasonable size at the moment (2023-11-22)
+        # model_name = "BAAI/bge-base-en-v1.5"
+
+        # https://huggingface.co/BAAI/bge-m3
+        # A better one from 2024-04
+        model_name = "BAAI/bge-m3"
+
+        encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
+        print("Loading embeddings model: ", model_name)
+        embeddings_function = HuggingFaceBgeEmbeddings(
+            model_name=model_name,
+            encode_kwargs=encode_kwargs,
+            query_instruction="Represent this sentence for searching relevant passages: "
+        )
+
+    else:
+
+        embeddings_function = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
+
+    return embeddings_function
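+
+# Minimal usage sketch (illustrative; assumes the model weights download):
+# emb = get_embeddings_function()
+# vec = emb.embed_query("climate adaptation costs")
+# len(vec)  # 1024 dimensions for BAAI/bge-m3 dense embeddings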
climateqa/engine/keywords.py ADDED
@@ -0,0 +1,30 @@
+
+from typing import List
+from typing import Literal
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+class KeywordsOutput(BaseModel):
+    """Analyzing the user query to get keywords for a search engine"""
+
+    keywords: list = Field(
+        description="""
+        Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers.
+
+        Example:
+        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
+        - "How will El Nino be impacted by climate change" -> ["el nino"]
+        - "Is climate change a hoax" -> ["climate change", "hoax"]
+        """
+    )
+
+
+def make_keywords_chain(llm):
+
+    functions = [convert_to_openai_function(KeywordsOutput)]
+    llm_functions = llm.bind(functions=functions, function_call={"name": "KeywordsOutput"})
+
+    chain = llm_functions | JsonOutputFunctionsParser()
+    return chain
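+
+# Expected behaviour (illustrative, assuming a function-calling OpenAI model):
+# chain = make_keywords_chain(llm)
+# chain.invoke("How will El Nino be impacted by climate change?")
+# -> {"keywords": ["el nino", "climate change"]}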
climateqa/engine/llm/__init__.py ADDED
@@ -0,0 +1,8 @@
+from climateqa.engine.llm.openai import get_llm as get_openai_llm
+
+
+def get_llm(provider="openai", **kwargs):
+    if provider == "openai":
+        return get_openai_llm(**kwargs)
+    else:
+        raise ValueError(f"Unknown provider: {provider}")
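+
+# Dispatch example (illustrative): get_llm(provider="openai", max_tokens=1024,
+# temperature=0.0) simply forwards the kwargs to climateqa.engine.llm.openai.get_llm.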
climateqa/engine/llm/openai.py ADDED
@@ -0,0 +1,25 @@
+from langchain_openai import ChatOpenAI
+import os
+
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except Exception:
+    pass
+# gpt-3.5-turbo-0125
+
+
+def get_llm(model="gpt-3.5-turbo", max_tokens=1024, temperature=0.0,
+            streaming=True, timeout=30, **kwargs):
+
+    llm = ChatOpenAI(
+        model=model,
+        api_key=os.environ.get("OPENAI_API_KEY", None),
+        max_tokens=max_tokens,
+        streaming=streaming,
+        temperature=temperature,
+        timeout=timeout,
+        **kwargs,
+    )
+
+    return llm
climateqa/engine/old/chains.py ADDED
@@ -0,0 +1,83 @@
+# https://python.langchain.com/docs/modules/chains/how_to/custom_chain
+# Including reformulation of the question in the chain
+import json
+
+from langchain import PromptTemplate, LLMChain
+from langchain.chains import RetrievalQAWithSourcesChain, QAWithSourcesChain
+from langchain.chains import TransformChain, SequentialChain
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+
+from climateqa.prompts import answer_prompt, reformulation_prompt, audience_prompts
+from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
+
+
+def load_combine_documents_chain(llm):
+    prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question", "audience", "language"])
+    qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=prompt)
+    return qa_chain
+
+def load_qa_chain_with_docs(llm):
+    """Load a QA chain with documents.
+    Useful when you already have retrieved docs
+
+    To be called with this input
+
+    ```
+    output = chain({
+        "question": query,
+        "audience": "experts climate scientists",
+        "docs": docs,
+        "language": "English",
+    })
+    ```
+    """
+
+    qa_chain = load_combine_documents_chain(llm)
+    chain = QAWithSourcesChain(
+        input_docs_key="docs",
+        combine_documents_chain=qa_chain,
+        return_source_documents=True,
+    )
+    return chain
+
+
+def load_qa_chain_with_text(llm):
+
+    prompt = PromptTemplate(
+        template=answer_prompt,
+        input_variables=["question", "audience", "language", "summaries"],
+    )
+    qa_chain = LLMChain(llm=llm, prompt=prompt)
+    return qa_chain
+
+
+def load_qa_chain_with_retriever(retriever, llm):
+    qa_chain = load_combine_documents_chain(llm)
+
+    # This could be improved by providing a document prompt to avoid modifying page_content in the docs
+    # See here https://github.com/langchain-ai/langchain/issues/3523
+
+    answer_chain = CustomRetrievalQAWithSourcesChain(
+        combine_documents_chain=qa_chain,
+        retriever=retriever,
+        return_source_documents=True,
+        verbose=True,
+        fallback_answer="**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**",
+    )
+    return answer_chain
+
+
+def load_climateqa_chain(retriever, llm_reformulation, llm_answer):
+
+    reformulation_chain = load_reformulation_chain(llm_reformulation)
+    answer_chain = load_qa_chain_with_retriever(retriever, llm_answer)
+
+    climateqa_chain = SequentialChain(
+        chains=[reformulation_chain, answer_chain],
+        input_variables=["query", "audience"],
+        output_variables=["answer", "question", "language", "source_documents"],
+        return_all=True,
+        verbose=True,
+    )
+    return climateqa_chain
+
climateqa/engine/old/chat.py ADDED
@@ -0,0 +1,39 @@
+# LANGCHAIN IMPORTS
+from langchain import PromptTemplate, LLMChain
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.chains import RetrievalQAWithSourcesChain
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+
+
+# CLIMATEQA
+from climateqa.retriever import ClimateQARetriever
+from climateqa.vectorstore import get_pinecone_vectorstore
+from climateqa.chains import load_climateqa_chain
+
+
+class ClimateQA:
+    def __init__(self, hf_embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+                 show_progress_bar=False, batch_size=1, max_tokens=1024, **kwargs):
+
+        self.llm = self.get_llm(max_tokens=max_tokens, **kwargs)
+        self.embeddings_function = HuggingFaceEmbeddings(
+            model_name=hf_embedding_model,
+            encode_kwargs={"show_progress_bar": show_progress_bar, "batch_size": batch_size}
+        )
+
+
+
+    def get_vectorstore(self):
+        pass
+
+
+    def reformulate(self):
+        pass
+
+
+    def retrieve(self):
+        pass
+
+
+    def ask(self):
+        pass
climateqa/engine/old/custom_retrieval_chain.py ADDED
@@ -0,0 +1,63 @@
+from __future__ import annotations
+import inspect
+from typing import Any, Dict, List, Optional
+
+from pydantic import Extra
+
+from langchain.schema.language_model import BaseLanguageModel
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain.chains.base import Chain
+from langchain.prompts.base import BasePromptTemplate
+
+from typing import Any, Dict, List
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain
+from langchain.docstore.document import Document
+from langchain.pydantic_v1 import Field
+from langchain.schema import BaseRetriever
+
+from langchain.chains import RetrievalQAWithSourcesChain
+
+
+from langchain.chains.router.llm_router import LLMRouterChain
+
+class CustomRetrievalQAWithSourcesChain(RetrievalQAWithSourcesChain):
+
+    fallback_answer: str = "No sources available to answer this question."
+
+    def _call(self, inputs, run_manager=None):
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        accepts_run_manager = (
+            "run_manager" in inspect.signature(self._get_docs).parameters
+        )
+        if accepts_run_manager:
+            docs = self._get_docs(inputs, run_manager=_run_manager)
+        else:
+            docs = self._get_docs(inputs)  # type: ignore[call-arg]
+
+
+        if len(docs) == 0:
+            answer = self.fallback_answer
+            sources = []
+        else:
+
+            answer = self.combine_documents_chain.run(
+                input_documents=docs, callbacks=_run_manager.get_child(), **inputs
+            )
+            answer, sources = self._split_sources(answer)
+
+        result: Dict[str, Any] = {
+            self.answer_key: answer,
+            self.sources_answer_key: sources,
+        }
+        if self.return_source_documents:
+            result["source_documents"] = docs
+        return result
climateqa/engine/prompts.py ADDED
@@ -0,0 +1,80 @@
+
+# If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A
+
+reformulation_prompt_template = """
+Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
+---
+query: La technologie nous sauvera-t-elle ?
+->
+'question': 'Can technology help humanity mitigate the effects of climate change?',
+'language': 'French',
+---
+query: what are our reserves in fossil fuel?
+->
+'question': 'What are the current reserves of fossil fuels and how long will they last?',
+'language': 'English',
+---
+query: what are the main causes of climate change?
+->
+'question': 'What are the main causes of climate change in the last century?',
+'language': 'English'
+---
+
+{format_instructions}
+
+Reformulate the question in English and detect the language of the original message
+Output the result as json with two keys "question" and "language"
+query: {query}
+->
+```json
+"""
+
+answer_prompt_without_docs_template = """
+You are Clara, an AI Assistant created by Axionable. Your role is to explain climate-related questions using info from the documents.
+Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.
+
+Guidelines:
+- If it's a conversational question such as "hello", "who are you", ..., you can answer directly
+- Start by explaining clearly that you could not find any passages to answer in the documents, but it can be because of the search engine, and not because it's not there. So your answer is based on your own knowledge and must be taken with great caution because it's AI generated and you prefer to use sources to answer.
+- Also explain that the user should reformulate the question to be more specific to the documents
+- Because your role is to explain climate science, you can never be skeptical about climate change.
+- If the question is not related to environmental issues, never never answer it. Say it's not your role.
+- Make paragraphs by starting new lines to make your answers more readable.
+
+Question: {question}
+Answer in {language}:
+"""
+
+
+
+audience_prompts = {
+    "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
+    "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
+    "experts": "expert and climate scientists that are not afraid of technical terms",
+}
+
+
+
+answer_prompt_template_custom = """
+You are Clara, an AI Assistant created by Axionable. You are given a question and extracted passages. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
+
+Guidelines:
+- If the passages have useful facts or numbers, use them in your answer.
+- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the name of the document and page if you know it.
+- Do not use the sentence 'Doc i says ...' to say where information came from.
+- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
+- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
+- If it makes sense, use bullet points and lists to make your answers easier to understand.
+- You do not need to use every passage. Only use the ones that help answer the question.
+- If the documents do not have the information needed to answer the question, just say you do not have enough information.
+- Consider by default that the question is about the past century unless it is specified otherwise.
+- If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
+
+-----------------------
+Passages:
+{context}
+
+-----------------------
+Question: {question} - Explained to {audience}
+Answer in {language} with the passages citations:
+"""
climateqa/engine/rag.py ADDED
@@ -0,0 +1,121 @@
+from operator import itemgetter
+
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
+from langchain_core.prompts.prompt import PromptTemplate
+from langchain_core.prompts.base import format_document
+
+from climateqa.engine.reformulation import make_reformulation_chain
+from climateqa.engine.prompts import answer_prompt_template_custom, answer_prompt_without_docs_template
+from climateqa.engine.utils import pass_values, flatten_dict, prepare_chain, rename_chain
+from climateqa.engine.keywords import make_keywords_chain
+
+DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
+    template="{page_content}")
+
+
+def _combine_documents(
+    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
+):
+
+    doc_strings = []
+
+    for i, doc in enumerate(docs):
+        chunk_type = "Doc"
+        if isinstance(doc, str):
+            doc_formatted = doc
+        else:
+            doc_formatted = format_document(doc, document_prompt)
+        doc_string = f"{chunk_type} {i+1}: " + doc_formatted
+        doc_string = doc_string.replace("\n", " ")
+        doc_strings.append(doc_string)
+
+    return sep.join(doc_strings)
+
+
+def make_rag_chain(retriever, llm):
+
+    # Construct the prompt
+    prompt = ChatPromptTemplate.from_template(answer_prompt_template_custom)
+    prompt_without_docs = ChatPromptTemplate.from_template(
+        answer_prompt_without_docs_template)
+
+    # ------- CHAIN 0 - Reformulation
+    reformulation = make_reformulation_chain(llm)
+    reformulation = prepare_chain(reformulation, "reformulation")
+
+    # ------- Find all keywords from the reformulated query
+    keywords = make_keywords_chain(llm)
+    keywords = {"keywords": itemgetter("question") | keywords}
+    keywords = prepare_chain(keywords, "keywords")
+
+    # ------- CHAIN 1
+    # Retrieved documents
+    find_documents = {"docs": itemgetter(
+        "question") | retriever} | RunnablePassthrough()
+    find_documents = prepare_chain(find_documents, "find_documents")
+
+    # ------- CHAIN 2
+    # Construct inputs for the llm
+    input_documents = {
+        "context": lambda x: _combine_documents(x["docs"]),
+        **pass_values(["question", "audience", "language", "keywords"])
+    }
+
+    # ------- CHAIN 3
+    # Bot answer
+    llm_final = rename_chain(llm, "answer")
+
+    answer_with_docs = {
+        "answer": input_documents | prompt | llm_final | StrOutputParser(),
+        **pass_values(["question", "audience", "language", "query", "docs", "keywords"]),
+    }
+
+    answer_without_docs = {
+        "answer": prompt_without_docs | llm_final | StrOutputParser(),
+        **pass_values(["question", "audience", "language", "query", "docs", "keywords"]),
+    }
+
+    answer = RunnableBranch(
+        (lambda x: len(x["docs"]) > 0, answer_with_docs),
+        answer_without_docs,
+    )
+
+    # ------- FINAL CHAIN
+    # Build the final chain
+    rag_chain = reformulation | keywords | find_documents | answer
+
+    return rag_chain
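+
+# End-to-end sketch (illustrative; retriever and llm as built in app.py):
+# rag_chain = make_rag_chain(retriever, llm)
+# out = rag_chain.invoke({"query": "Quels risques physiques pour le bâtiment ?",
+#                         "audience": None})
+# out carries "answer", "question", "language", "docs" and "keywords".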
+
+
+def make_rag_papers_chain(llm):
+
+    # prompt = ChatPromptTemplate.from_template(papers_prompt_template)
+
+    input_documents = {
+        "context": lambda x: _combine_documents(x["docs"]),
+        **pass_values(["question", "language"])
+    }
+
+    chain = input_documents | llm | StrOutputParser()
+    chain = rename_chain(chain, "answer")
+
+    return chain
+
+
+
+
+
+
+def make_illustration_chain(llm):
+
+    # prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
+
+    input_description_images = {
+        "images": lambda x: _combine_documents(get_image_docs(x["docs"])),
+        **pass_values(["question", "audience", "language", "answer"]),
+    }
+
+    illustration_chain = input_description_images | llm | StrOutputParser()
+    return illustration_chain
climateqa/engine/reformulation.py ADDED
@@ -0,0 +1,42 @@
+
+from langchain.output_parsers.structured import StructuredOutputParser, ResponseSchema
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
+
+from climateqa.engine.prompts import reformulation_prompt_template
+from climateqa.engine.utils import pass_values, flatten_dict
+
+
+response_schemas = [
+    ResponseSchema(name="language", description="The detected language of the input message"),
+    ResponseSchema(name="question", description="The reformulated question always in English")
+]
+output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
+format_instructions = output_parser.get_format_instructions()
+
+def fallback_default_values(x):
+    if x["question"] is None:
+        x["question"] = x["query"]
+        x["language"] = "english"
+
+    return x
+
+def make_reformulation_chain(llm):
+
+    prompt = PromptTemplate(
+        template=reformulation_prompt_template,
+        input_variables=["query"],
+        partial_variables={"format_instructions": format_instructions}
+    )
+
+    chain = (prompt | llm.bind(stop=["```"]) | output_parser)
+
+    reformulation_chain = (
+        {"reformulation": chain, **pass_values(["query"])}
+        | RunnablePassthrough()
+        | flatten_dict
+        | fallback_default_values
+    )
+
+
+    return reformulation_chain
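+
+# Expected output shape (illustrative):
+# make_reformulation_chain(llm).invoke({"query": "La technologie nous sauvera-t-elle ?"})
+# -> {"query": "...", "question": "Can technology help humanity mitigate the
+#     effects of climate change?", "language": "French"}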
climateqa/engine/retriever.py ADDED
@@ -0,0 +1,166 @@
+# https://github.com/langchain-ai/langchain/issues/8623
+
+import pandas as pd
+
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+
+from typing import List
+from pydantic import Field
+
+class ClimateQARetriever(BaseRetriever):
+    vectorstore: VectorStore
+    sources: list = ["IPCC", "IPBES", "IPOS"]
+    reports: list = []
+    threshold: float = 0.6
+    k_summary: int = 3
+    k_total: int = 10
+    namespace: str = "vectors"
+    min_size: int = 200
+
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        # Check if all elements in the list are either IPCC or IPBES
+        assert isinstance(self.sources, list)
+        assert all([x in ["IPCC", "IPBES", "IPOS"] for x in self.sources])
+        assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
+
+        # Prepare base search kwargs
+        filters = {}
+
+        if len(self.reports) > 0:
+            filters["short_name"] = {"$in": self.reports}
+        else:
+            filters["source"] = {"$in": self.sources}
+
+        # Search for k_summary documents in the summaries dataset
+        filters_summaries = {
+            **filters,
+            "report_type": {"$in": ["SPM"]},
+        }
+
+        # built with pinecone
+        # docs_summaries = self.vectorstore.similarity_search_with_score(query=query, filter=filters_summaries, k=self.k_summary)
+        docs_summaries = self.vectorstore.similarity_search_with_score(query=query, k=self.k_summary)
+        docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+
+        # Search for k_total - k_summary documents in the full reports dataset
+        filters_full = {
+            **filters,
+            "report_type": {"$nin": ["SPM"]},
+        }
+        k_full = self.k_total - len(docs_summaries)
+        # docs_full = self.vectorstore.similarity_search_with_score(query=query, filter=filters_full, k=k_full)
+        docs_full = self.vectorstore.similarity_search_with_score(query=query, k=k_full)
+
+        # Concatenate documents
+        docs = docs_summaries + docs_full
+
+        # Filter out documents that are too short
+        docs = [x for x in docs if len(x[0].page_content) > self.min_size]
+        # docs = [x for x in docs if x[1] > self.threshold]
+
+        # Add score to metadata
+        results = []
+        for i, (doc, score) in enumerate(docs):
+            doc.metadata["similarity_score"] = score
+            doc.metadata["content"] = doc.page_content
+            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+            # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+            results.append(doc)
+
+        # Sort by score
+        # results = sorted(results, key=lambda x: x.metadata["similarity_score"], reverse=True)
+
+        return results
+
+
+
+
+# def filter_summaries(df, k_summary=3, k_total=10):
+#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
+
+#     # # Filter by source
+#     # if source == "IPCC":
+#     #     df = df.loc[df["source"]=="IPCC"]
+#     # elif source == "IPBES":
+#     #     df = df.loc[df["source"]=="IPBES"]
+#     # else:
+#     #     pass
+
+#     # Separate summaries and full reports
+#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
+#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
+
+#     # Find passages from summaries dataset
+#     passages_summaries = df_summaries.head(k_summary)
+
+#     # Find passages from full reports dataset
+#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
+
+#     # Concatenate passages
+#     passages = pd.concat([passages_summaries, passages_fullreports], axis=0, ignore_index=True)
+#     return passages
+
+
+
+
+# def retrieve_with_summaries(query, retriever, k_summary=3, k_total=10, sources=["IPCC","IPBES"], max_k=100, threshold=0.555, as_dict=True, min_length=300):
+#     assert max_k > k_total
+
+#     validated_sources = ["IPCC","IPBES"]
+#     sources = [x for x in sources if x in validated_sources]
+#     filters = {
+#         "source": { "$in": sources },
+#     }
+#     print(filters)
+
+#     # Retrieve documents
+#     docs = retriever.retrieve(query, top_k=max_k, filters=filters)
+
+#     # Filter by score
+#     docs = [{**x.meta, "score": x.score, "content": x.content} for x in docs if x.score > threshold]
+
+#     if len(docs) == 0:
+#         return []
+#     res = pd.DataFrame(docs)
+#     passages_df = filter_summaries(res, k_summary, k_total)
+#     if as_dict:
+#         contents = passages_df["content"].tolist()
+#         meta = passages_df.drop(columns=["content"]).to_dict(orient="records")
+#         passages = []
+#         for i in range(len(contents)):
+#             passages.append({"content": contents[i], "meta": meta[i]})
+#         return passages
+#     else:
+#         return passages_df
+
+
+
+# def retrieve(query, sources=["IPCC"], threshold=0.555, k=10):
+
+
+#     print("hellooooo")
+
+#     # Reformulate queries
+#     reformulated_query, language = reformulate(query)
+
+#     print(reformulated_query)
+
+#     # Retrieve documents
+#     passages = retrieve_with_summaries(reformulated_query, retriever, k_total=k, k_summary=3, as_dict=True, sources=sources, threshold=threshold)
+#     response = {
+#         "query": query,
+#         "reformulated_query": reformulated_query,
+#         "language": language,
+#         "sources": passages,
+#         "prompts": {"init_prompt": init_prompt, "sources_prompt": sources_prompt},
+#     }
+#     return response
+
climateqa/engine/text_retriever.py ADDED
@@ -0,0 +1,44 @@
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from typing import List
+
+class ClimateQARetriever(BaseRetriever):
+    vectorstore: VectorStore
+    sources: list = []
+    reports: list = []
+    threshold: float = 0.01
+    k_summary: int = 3
+    k_total: int = 7
+    min_size: int = 200
+    filter: dict = None
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        # Check if all elements in the list are either IPCC or IPBES
+        assert isinstance(self.sources, list)
+        # assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
+
+        # Prepare base search kwargs
+        filters = {}
+
+        filters["source"] = {"$in": self.sources}
+
+        docs = self.vectorstore.similarity_search_with_score(query=query, k=self.k_total, filter=self.filter)
+
+        # Add score to metadata
+        results = []
+        for i, (doc, score) in enumerate(docs):
+            # drop sources below the relevance threshold
+            if score < self.threshold:
+                continue
+            doc.metadata["similarity_score"] = score
+            doc.metadata["content"] = doc.page_content
+            doc.metadata["chunk_type"] = "text"
+            doc.metadata["page_number"] = 1
+            results.append(doc)
+        return results
+
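+# Note (illustrative): similarity_search_with_score returns (Document, score)
+# pairs; hits scoring below `threshold` (the "pertinence" slider in app.py)
+# are dropped, and each kept document carries its score in
+# metadata["similarity_score"] for display in the source cards.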
climateqa/engine/utils.py ADDED
@@ -0,0 +1,69 @@
+from operator import itemgetter
+from typing import Any, Dict, Iterable, Tuple
+from langchain_core.runnables import RunnablePassthrough
+
+
+def pass_values(x):
+    if not isinstance(x, list):
+        x = [x]
+    return {k: itemgetter(k) for k in x}
+
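+# e.g. pass_values(["question", "language"]) returns
+# {"question": itemgetter("question"), "language": itemgetter("language")},
+# which LCEL coerces into a parallel map that copies those keys through.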
+
+def prepare_chain(chain, name):
+    chain = propagate_inputs(chain)
+    chain = rename_chain(chain, name)
+    return chain
+
+
+def propagate_inputs(chain):
+    chain_with_values = {
+        "outputs": chain,
+        "inputs": RunnablePassthrough()
+    } | RunnablePassthrough() | flatten_dict
+    return chain_with_values
+
+def rename_chain(chain, name):
+    return chain.with_config({"run_name": name})
+
+
+# Drawn from langchain utils and modified to remove the parent key
+def _flatten_dict(
+    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Iterable[Tuple[str, Any]]:
+    """
+    Generator that yields flattened items from a nested dictionary for a flat dict.
+
+    Parameters:
+        nested_dict (dict): The nested dictionary to flatten.
+        parent_key (str): The prefix to prepend to the keys of the flattened dict.
+        sep (str): The separator to use between the parent key and the key of the
+            flattened dictionary.
+
+    Yields:
+        (str, any): A key-value pair from the flattened dictionary.
+    """
+    for key, value in nested_dict.items():
+        new_key = key
+        if isinstance(value, dict):
+            yield from _flatten_dict(value, new_key, sep)
+        else:
+            yield new_key, value
+
+
+def flatten_dict(
+    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Dict[str, Any]:
+    """Flattens a nested dictionary into a flat dictionary.
+
+    Parameters:
+        nested_dict (dict): The nested dictionary to flatten.
+        parent_key (str): The prefix to prepend to the keys of the flattened dict.
+        sep (str): The separator to use between the parent key and the key of the
+            flattened dictionary.
+
+    Returns:
+        (dict): A flat dictionary.
+
+    """
+    flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
+    return flat_dict
climateqa/engine/vectorstore.py ADDED
@@ -0,0 +1,171 @@
+
+from google.cloud import storage
+import os
+
+with open("./cred.json", "w") as fj:
+    fj.write(os.environ["CRED_JSON"])
+
+storage_client = storage.Client()
+
+bucket_name = "docs-axio-clara"
+
+from langchain_pinecone import PineconeVectorStore
+
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+from climateqa.engine.embeddings import get_embeddings_function
+embeddings_function = get_embeddings_function()
+
+
+
+index_name = "clara-index"
+namespace = "my-namespace"
+
+
+import os
+import pdfplumber
+
+
+def get_categories_files():
+
+    finale = {}
+    listCat = []
+
+    CAT_DIR = "config_categorie/"
+    FOLDER_PATH = "."
+
+    bucket = storage_client.get_bucket(bucket_name)
+
+    blob = bucket.blob(CAT_DIR + "categories.csv")
+    lines = blob.download_as_text().split("\n")
+
+    blob_label = bucket.blob(CAT_DIR + "libelle.csv")
+    lines_label = blob_label.download_as_text().split("\n")
+
+    labels = {}
+    # collect the labels
+    first = True
+    for line in lines_label:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        lab = line.split(";")[-1].replace("\n", "").replace("\r", "").replace("\t", "")
+        labels[line.split(";")[0]] = lab
+        print("label :" + lab)
+
+    # first pass: collect the existing categories
+    first = True
+    for line in lines:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        categories = line.split(";")[-1].split(" ")
+
+        for cat in categories:
+            categ = cat.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")
+
+            # if the category has no label, fall back to the technical field
+            try:
+                test = labels[categ]  # raises if the key does not exist
+            except:
+                labels[categ] = categ
+
+            # add the category (its label) to the list if not seen yet
+            if not labels[categ] in listCat:
+                print(" - [" + categ + "] > " + labels[categ])
+                listCat.append(labels[categ])
+
+    # initialise the final structure
+    for cat in listCat:
+        finale[cat] = []
+    finale["AllCat"] = listCat
+
+    # second pass: associate each file with its categories
+    first = True
+    for line in lines:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        fichier = line.split(";")[0]
+        categories = line.split(";")[-1].split(" ")
+        listCat = []
+
+        # put the file into its associated categories
+        for cat in categories:
+            categ = cat.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")
+            print(fichier + " dans " + labels[categ] + "(" + categ + ")")
+            finale[labels[categ]].append(fichier)
+
+    return finale
+
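+# Assumed CSV layout (illustrative) for the two files read above:
+#   config_categorie/categories.csv : "<file>;<CAT1> <CAT2> ..."
+#   config_categorie/libelle.csv    : "<CAT>;<human-readable label>"
+# The returned dict maps each label to the files tagged with it, plus an
+# "AllCat" key listing every label (used for the Gradio category checkboxes).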
104
+ def get_PDF_Names_from_GCP():
105
+
106
+ listName = []
107
+ # Récupération des fichier depuis GCP storage
108
+ blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
109
+ for blob in blobs:
110
+ listName.append(blob.name)
111
+ return listName
112
+
113
+ def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
114
+
115
+ # Récupération des fichier depuis GCP storage
116
+ #blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
117
+ #for blob in blobs:
118
+
119
+ # print( "\n"+blob.name+":")
120
+ # print( " <- Téléchargement Depuis GCP")
121
+ # blob.download_to_filename(pdf_folder+"/"+blob.name)
122
+
123
+ # Extraction des textes dpuis les fichiers PDF
124
+ print(" >>> Extraction PDF")
125
+ for pdf_file in os.listdir(pdf_folder):
126
+ if pdf_file.startswith("."):
127
+ continue
128
+ print(" > "+pdf_folder+"/"+pdf_file)
129
+ pdf_total_pages = 0
130
+ with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
131
+ pdf_total_pages = len(pdf.pages)
132
+
133
+ # Fuite mémoire pour les gros fichiers
134
+ # Reouvrir le fichier à chaque N page semble rélgler le problème
135
+ N_page = 300
136
+ page_number = 0
137
+ while page_number < pdf_total_pages:
138
+
139
+ print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
140
+ with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
141
+
142
+ npage = 0
143
+ while (npage < N_page and page_number < pdf_total_pages) :
144
+
145
+ print(" >>> "+str(page_number+1))
146
+ f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
147
+ for char_pdf in pdf.pages[page_number].chars:
148
+ f.write(char_pdf["text"])
149
+ f.close()
150
+
151
+ npage = npage + 1
152
+ page_number = page_number + 1
153
+
154
+
155
+ print(" X removing: " + blob.name )
156
+ os.remove(pdf_folder+"/"+blob.name)
157
+
158
+
159
+ def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
160
+
161
+ vectorstore = PineconeVectorStore(
162
+ index_name=index_name,
163
+ embedding=embeddings_function,
164
+ #namespace=namespace
165
+ )
166
+ print(" Vectorisation ...")
167
+ return vectorstore
168
+
169
+
170
+ print("MISSING VECTORS")
171
+ exit(0)
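
For reference, a minimal sketch of the inputs the category parser above appears to expect. The exact CSV layout is an assumption inferred from the splitting logic: semicolon-separated rows whose first column is the file name and whose last column is a space-separated list of technical category names, with `labels` mapping technical names to display labels. All names below are invented for illustration.

# Hypothetical example data (not part of the repo):
lines = [
    "fichier;categories\n",              # header row, skipped by both passes
    "deu-2023.pdf;climat batiment\n",
    "memo_risques.pdf;climat\n",
]
labels = {"climat": "Climat", "batiment": "Batiment"}

# After the two passes, `finale` would hold:
# {
#     "Climat":   ["deu-2023.pdf", "memo_risques.pdf"],
#     "Batiment": ["deu-2023.pdf"],
#     "AllCat":   ["Climat", "Batiment"],
# }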
climateqa/engine/vectorstore_annoy.py ADDED
@@ -0,0 +1,187 @@
+
+ import os
+
+ import pdfplumber
+ from google.cloud import storage
+
+ from langchain_community.vectorstores import Annoy
+ from langchain_community.document_loaders import TextLoader
+ from langchain_text_splitters import CharacterTextSplitter
+
+ from climateqa.engine.embeddings import get_embeddings_function
+
+ #storage_client = storage.Client()
+ storage_client = storage.Client.create_anonymous_client()
+ bucket_name = "docs-axio-clara"
+
+ embeddings_function = get_embeddings_function()
+
+
+ def get_PDF_Names_from_GCP():
+
+     listName = []
+     # list the source files held in GCP storage
+     blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+     for blob in blobs:
+         listName.append(blob.name)
+     return listName
+
+ def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+     # fetch the files from GCP storage
+     blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+     for blob in blobs:
+
+         print("\n" + blob.name + ":")
+         print("  <- downloading from GCP")
+         # blob names carry the 'sources/' prefix; keep only the base name locally
+         blob.download_to_filename(pdf_folder + "/" + blob.name.split("/")[-1])
+
+     # extract the text from the PDF files
+     print(" >>> Extraction PDF")
+     for pdf_file in os.listdir(pdf_folder):
+         if pdf_file.startswith("."):
+             continue
+         print(" > " + pdf_folder + "/" + pdf_file)
+         pdf_total_pages = 0
+         with pdfplumber.open(pdf_folder + "/" + pdf_file) as pdf:
+             pdf_total_pages = len(pdf.pages)
+
+         # pdfplumber leaks memory on large files;
+         # re-opening the file every N pages seems to fix the problem
+         N_page = 300
+         page_number = 0
+         while page_number < pdf_total_pages:
+
+             print(" -- reopening the file for " + str(N_page) + " pages --")
+             with pdfplumber.open(pdf_folder + "/" + pdf_file) as pdf:
+
+                 npage = 0
+                 while npage < N_page and page_number < pdf_total_pages:
+
+                     print(" >>> " + str(page_number + 1))
+                     with open(folder_path + "/" + pdf_file + "..:page:.." + str(page_number + 1), "w") as f:
+                         for char_pdf in pdf.pages[page_number].chars:
+                             f.write(char_pdf["text"])
+
+                     npage += 1
+                     page_number += 1
+
+         # remove the processed PDF once its pages have been extracted
+         print(" X removing: " + pdf_file)
+         os.remove(pdf_folder + "/" + pdf_file)
+
+
+ def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
+
+     if os.path.isfile(vectors_path + "/index.annoy"):
+         return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+     os.makedirs(vectors_path, exist_ok=True)
+
+     try:
+         # fetch the prebuilt vectors from GCP storage
+         blobs = storage_client.list_blobs(bucket_name, prefix='testvectors/')
+         for blob in blobs:
+
+             print("\n" + blob.name.split("/")[-1] + ":")
+             print("  <- downloading from GCP")
+             blob.download_to_filename(vectors_path + "/" + blob.name.split("/")[-1])
+     except Exception:
+         # download errors are ignored; we fall back to the local check below
+         pass
+
+     # TODO: a function for that, to avoid code duplication
+     if os.path.isfile(vectors_path + "/index.annoy"):
+         return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+     print("MISSING VECTORS")
+     exit(1)  # abort: no prebuilt vectors available locally or remotely
+
+     # get_PDF_from_GCP(folder_path, pdf_folder)
+
+     # print(" Vectorisation ...")
+
+     # docs = []
+     # vector_store_from_docs = ()  # create a new Annoy object, or reuse the one already initialised in your existing code
+     # for filename in os.listdir(folder_path):
+     #     if filename.startswith("."):
+     #         continue
+     #     file_path = os.path.join(folder_path, filename)
+     #     if os.path.isfile(file_path):
+     #         loader = TextLoader(file_path)
+     #         documents = loader.load()
+     #
+     #         for doc in documents:
+     #             if (doc.metadata):
+     #                 doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
+     #                 doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
+     #                 doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/" + doc.metadata["ax_name"]
+     #
+     #         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+     #         docs += text_splitter.split_documents(documents)
+     # vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
+     # vector_store_from_docs.save_local(vectors_path)
+     # return vector_store_from_docs
+
+
+ # Pinecone (legacy, kept for reference)
+ # More info at https://docs.pinecone.io/docs/langchain
+ # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
+ #import os
+ #from pinecone import Pinecone
+ #from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+
+ # LOAD ENVIRONMENT VARIABLES
+ #try:
+ #    from dotenv import load_dotenv
+ #    load_dotenv()
+ #except:
+ #    pass
+
+
+ #def get_pinecone_vectorstore(embeddings, text_key="content"):
+
+ #    # initialize pinecone
+ #    pinecone.init(
+ #        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
+ #        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
+ #    )
+
+ #    index_name = os.getenv("PINECONE_API_INDEX")
+ #    vectorstore = Pinecone.from_existing_index(index_name, embeddings, text_key=text_key)
+
+ #    return vectorstore
+
+ #    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+ #    index = pc.Index(os.getenv("PINECONE_API_INDEX"))
+
+ #    vectorstore = PineconeVectorstore(
+ #        index, embeddings, text_key,
+ #    )
+ #    return vectorstore
+
+
+ # def get_pinecone_retriever(vectorstore, k=10, namespace="vectors", sources=["IPBES", "IPCC"]):
+
+ #     assert isinstance(sources, list)
+
+ #     # Check if all elements in the list are either IPCC or IPBES
+ #     filter = {
+ #         "source": {"$in": sources},
+ #     }
+
+ #     retriever = vectorstore.as_retriever(search_kwargs={
+ #         "k": k,
+ #         "namespace": "vectors",
+ #         "filter": filter,
+ #     })
+
+ #     return retriever
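
A hedged usage sketch of this module: it assumes the prebuilt Annoy index is present under ./vectors (committed via LFS, as the files below suggest) or downloadable from the bucket, and that the query string is purely illustrative. similarity_search is the standard LangChain vector-store call; the ax_name/ax_page metadata keys only exist for indexes built via the commented-out path above.

from climateqa.engine.vectorstore_annoy import build_vectores_stores

# Loads ./vectors/index.annoy if present, otherwise tries the GCP bucket.
vectorstore = build_vectores_stores("./sources")

# Retrieve the 4 chunks closest to the query.
docs = vectorstore.similarity_search("quels couts de l'adaptation ?", k=4)
for doc in docs:
    print(doc.metadata.get("ax_name"), doc.metadata.get("ax_page"))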
logs/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ google-cloud-storage==2.16.0
+ gradio==4.19.1
+ python-dotenv==1.0.0
+ langchain==0.1.10
+ langchain_openai==0.0.6
+ pinecone-client==3.0.2
+ sentence-transformers==2.6.0
+ huggingface-hub
+ msal
+ pyalex==0.13
+ networkx==3.2.1
+ pyvis==0.3.2
+ annoy==1.17.3
+ langchain_pinecone
+ pdfplumber==0.11.0
setup.py ADDED
@@ -0,0 +1 @@
+ print("yoooooo")
style.css ADDED
@@ -0,0 +1,462 @@
+
+ /* :root {
+     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
+ } */
+
+ .fordataonly {
+     display: none !important;
+ }
+
+ label {
+     color: #000000 !important;
+ }
+
+ strong {
+     color: #888888 !important;
+ }
+
+ .logo-axio {
+     float: right;
+     position: absolute;
+     right: 0px;
+ }
+
+ /* text color */
+ p {
+     color: black !important;
+ }
+ li {
+     color: black !important;
+ }
+
+ button.selected {
+     border-radius: 20px !important;
+ }
+ button:hover {
+     color: #ffc000 !important;
+ }
+
+ /* panel/block background */
+ .panel {
+     background-color: #eeeeee !important;
+     border: 0px;
+ }
+ .block {
+     background-color: #eeeeee !important;
+ }
+
+ /* bot message background */
+ .bot {
+     background-color: #eeeeee !important;
+ }
+
+ /* avatar at the start of a reply */
+ .avatar-container {
+     align-self: baseline !important;
+     margin-top: 35px;
+ }
+
+ /* user message background */
+ .user {
+     background-color: #d2d2d2 !important;
+ }
+ textarea {
+     background-color: #d2d2d2 !important;
+     color: black !important;
+ }
+
+ /* app background */
+ gradio-app {
+     background-color: #ffffff !important;
+ }
+ .gradio-container {
+     background-color: #ffffff !important;
+     max-width: 100% !important;
+     width: 100% !important;
+ }
+
+ .a-propos {
+     margin: 20px !important;
+ }
+
+ .telecharger {
+     border: 1px solid;
+     padding: 5px;
+     border-radius: 5px;
+     background-color: #ffc000;
+     color: #fff;
+     margin-left: 5px;
+ }
+
+ .warning-box {
+     background-color: #fff3cd;
+     border: 1px solid #ffeeba;
+     border-radius: 4px;
+     padding: 15px 20px;
+     font-size: 14px;
+     color: #856404;
+     display: inline-block;
+     margin-bottom: 15px;
+ }
+
+ .tip-box {
+     background-color: #f7dd8f;
+     border: 1px solid #FFC000;
+     border-radius: 4px;
+     margin-top: 20px;
+     padding: 15px 20px;
+     font-size: 14px;
+     display: inline-block;
+     margin-bottom: 15px;
+     width: auto;
+     color: black !important;
+ }
+
+ body.dark .warning-box * {
+     color: black !important;
+ }
+
+ body.dark .tip-box * {
+     color: rgb(216, 216, 216) !important;
+ }
+
+ .tip-box-title {
+     font-weight: bold;
+     font-size: 14px;
+     margin-bottom: 5px;
+ }
+
+ .light-bulb {
+     display: inline;
+     margin-right: 5px;
+ }
+
+ .gr-box {
+     border-color: #d6c37c;
+ }
+
+ #hidden-message {
+     display: none;
+ }
+
+ .message {
+     font-size: 14px !important;
+ }
+
+ a {
+     text-decoration: none;
+     color: inherit;
+ }
+
+ .card {
+     background-color: white;
+     border-radius: 10px;
+     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+     overflow: hidden;
+     display: flex;
+     flex-direction: column;
+     margin: 20px;
+ }
+
+ .card-content {
+     padding: 20px;
+ }
+
+ .card-content h2 {
+     font-size: 14px !important;
+     font-weight: bold;
+     margin-bottom: 10px;
+     margin-top: 0px !important;
+     color: #FFC000 !important;
+ }
+
+ .card-content p {
+     font-size: 12px;
+     margin-bottom: 0;
+     color: black;
+ }
+
+ .card-footer {
+     background-color: #f4f4f4;
+     font-size: 10px;
+     padding: 10px;
+     display: flex;
+     justify-content: space-between;
+     align-items: center;
+ }
+
+ .card-footer span {
+     flex-grow: 1;
+     text-align: left;
+     color: #999 !important;
+ }
+
+ .pdf-link {
+     display: inline-flex;
+     align-items: center;
+     margin-left: auto;
+     text-decoration: none !important;
+     font-size: 14px;
+ }
+
+ .message.user {
+     /* background-color: #7494b0 !important; */
+     border: none;
+     /* color: white !important; */
+ }
+
+ .message.bot {
+     /* background-color: #f2f2f7 !important; */
+     border: none;
+ }
+
+ /* .gallery-item > div:hover {
+     background-color: #7494b0 !important;
+     color: white !important;
+ }
+
+ .gallery-item:hover {
+     border: #7494b0 !important;
+ }
+
+ .gallery-item > div {
+     background-color: white !important;
+     color: #577b9b !important;
+ }
+
+ .label {
+     color: #577b9b !important;
+ } */
+
+ /* .paginate {
+     color: #577b9b !important;
+ } */
+
+ /* span[data-testid="block-info"] {
+     background: none !important;
+     color: #577b9b;
+ } */
+
+ /* Pseudo-element for the circularly cropped picture */
+ /* .message.bot::before {
+     content: '';
+     position: absolute;
+     top: -10px;
+     left: -10px;
+     width: 30px;
+     height: 30px;
+     background-image: var(--user-image);
+     background-size: cover;
+     background-position: center;
+     border-radius: 50%;
+     z-index: 10;
+ }
+ */
+
+ label.selected {
+     background: none !important;
+ }
+
+ #submit-button {
+     padding: 0px !important;
+ }
+
+ @media screen and (min-width: 1024px) {
+     div#tab-examples {
+         height: calc(100vh - 190px) !important;
+         overflow-y: auto;
+     }
+
+     div#sources-textbox {
+         height: calc(100vh - 190px) !important;
+         overflow-y: auto !important;
+     }
+
+     div#tab-config {
+         height: calc(100vh - 190px) !important;
+         overflow-y: auto !important;
+     }
+
+     div#chatbot-row {
+         height: calc(100vh - 90px) !important;
+     }
+
+     div#chatbot {
+         height: calc(100vh - 170px) !important;
+     }
+
+     .max-height {
+         height: calc(100vh - 90px) !important;
+         overflow-y: auto;
+     }
+
+     /* .tabitem:nth-child(n+3) {
+         padding-top: 30px;
+         padding-left: 40px;
+         padding-right: 40px;
+     } */
+ }
+
+ footer {
+     visibility: hidden;
+     display: none !important;
+ }
+
+ @media screen and (max-width: 767px) {
+     /* Your mobile-specific styles go here */
+
+     div#chatbot {
+         height: 500px !important;
+     }
+
+     #submit-button {
+         padding: 0px !important;
+         min-width: 80px;
+     }
+
+     /* This will hide all list items */
+     div.tab-nav button {
+         display: none !important;
+         color: #ffc000;
+     }
+
+     /* This will show only the first list item */
+     div.tab-nav button:first-child {
+         display: block !important;
+     }
+
+     /* This will also show the second list item */
+     div.tab-nav button:nth-child(2) {
+         display: block !important;
+     }
+
+     #right-panel button {
+         display: block !important;
+     }
+
+     /* ... add other mobile-specific styles ... */
+ }
+
+ body.dark .card {
+     background-color: #c7c7c7;
+ }
+
+ body.dark .card-content h2 {
+     color: #f4dbd3 !important;
+ }
+
+ body.dark .card-footer {
+     background-color: #404652;
+ }
+
+ body.dark .card-footer span {
+     color: white !important;
+ }
+
+ .doc-ref {
+     color: #ffc000 !important;
+     margin-right: 1px;
+ }
+
+ .tabitem {
+     border: none !important;
+ }
+
+ .other-tabs > div {
+     padding-left: 40px;
+     padding-right: 40px;
+     padding-top: 10px;
+ }
+
+ .gallery-item > div {
+     white-space: normal !important;       /* Allow the text to wrap */
+     word-break: break-word !important;    /* Break words to prevent overflow */
+     overflow-wrap: break-word !important; /* Break long words if necessary */
+ }
+
+ span.chatbot > p > img {
+     margin-top: 40px !important;
+     max-height: none !important;
+     max-width: 80% !important;
+     border-radius: 0px !important;
+ }
+
+ .chatbot-caption {
+     font-size: 11px;
+     font-style: italic;
+     color: #ffc000;
+ }
+
+ .ai-generated {
+     font-size: 11px !important;
+     font-style: italic;
+     color: #ffc000 !important;
+ }
+
+ .card-image > .card-content {
+     background-color: #f1f7fa !important;
+ }
+
+ .tab-nav > button.selected {
+     color: #ffc000;
+     font-weight: bold;
+     border: none;
+ }
+
+ .tab-nav {
+     border: none !important;
+ }
+
+ #input-textbox > label > textarea {
+     border-radius: 40px;
+     padding-left: 30px;
+     resize: none;
+ }
+
+ #input-message > div {
+     border: none;
+ }
+
+ #dropdown-samples {
+     /*! border: none !important; */
+     /*! border-width: 0px !important; */
+     background: none !important;
+ }
+
+ #dropdown-samples > .container > .wrap {
+     background-color: white;
+ }
+
+ #tab-examples > div > .form {
+     border: none;
+     background: none !important;
+ }
+
+ .a-doc-ref {
+     text-decoration: none !important;
+     color: #FFC000;
+ }
test ADDED
@@ -0,0 +1,32 @@
+ # Dockerfile (committed under the file name "test")
+ FROM python:3.10
+
+ WORKDIR /src
+
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ # NOTE: Docker honours only the last CMD in a file, so the setup.py
+ # command below is inert and the container actually runs app.py.
+ CMD ["python", "setup.py"]
+
+ CMD ["python", "app.py"]
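
For context, a hedged sketch of how app.py might consume the Gradio settings exported above. The real app.py is not part of this commit, so everything below is an assumption; note that Gradio also picks up GRADIO_SERVER_NAME and GRADIO_THEME from the environment on its own, without explicit code.

import os
import gradio as gr

def echo(message):
    # placeholder handler; the real app would wire the chatbot here
    return message

demo = gr.Interface(fn=echo, inputs="text", outputs="text")
# server_name made explicit for clarity; it defaults to the env var set in the Dockerfile
demo.launch(server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"))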
utils.py ADDED
@@ -0,0 +1,12 @@
+ import uuid
+
+
+ def create_user_id():
+     """Create a random user id.
+
+     Returns:
+         str: string identifying the user.
+     """
+     user_id = str(uuid.uuid4())
+     return user_id
vectors/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
vectors/index.annoy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b94e9d486dbe3a9e2397672bda1d1c17198cca42a53afaa16ef8ecfcebd22fc9
+ size 2238984
vectors/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4eb3d63539603642200f07f8fac2e290e94104fbbe4f4471dc663eff850263f6
+ size 3223915