Ilyas KHIAT committed · Commit ef73c14 · Parent(s): f1342ba
multipage et ux ++

Files changed:
- agents_page/recommended_agent.py  +68 -2
- audit_page/audit.py               +123 -120
- requirements.txt                  +1 -0
- utils/audit/audit_audio.py        +21 -3
- utils/audit/audit_doc.py          +13 -39
- utils/audit/response_llm.py       +35 -0
agents_page/recommended_agent.py
CHANGED
@@ -1,5 +1,71 @@
 import streamlit as st
-
+from utils.audit.response_llm import generate_response_via_langchain
+from textwrap import dedent
 #st.set_page_config(page_title="Agents recommandés", page_icon="", layout="wide")
+def remove_images_from_content(content):
+    filtered_content = {}
+
+    for page, data in content.items():
+        # Create a new dictionary excluding the "images" key
+        filtered_data = {key: value for key, value in data.items() if key != "images"}
+        filtered_content[page] = filtered_data
+
+    return filtered_content
+
+def recommended_agent_main():
+    st.title("Agents recommandés")
+
+    if "audit" not in st.session_state or "audit" is None:
+        st.error("Veuillez d'abord effectuer un audit pour obtenir des recommandations d'agents.")
+        return
+
+    audit = st.session_state.audit_simplified
+    content = st.session_state.audit["content"]
+
+    if "response_llm" not in st.session_state:
+        st.session_state.response_llm = ""
+
+
+    #filter content, delete images if type is pdf
+    if audit["type de fichier"] == "pdf":
+        content = remove_images_from_content(content)
+    #delete audio if type is audio and keep transcript
+    elif audit["type de fichier"] == "audio":
+        content = content["transcription"]
+
+    ressources = content
+
+    prompt = '''Tu es designer en intelligence artificielle (IA) spécialisé dans la création d'agents IA autonomes et performants.
+    A partir de ressources fournies par l'utilisateur (texte, documents, images, audio), tu es chargé de suggérer la création d'agents autonomes pour mettre en pratique les informations contenues dans les ressources fournies.
+
+    Tu proposes deux solutions :
+
+    Sol. A : 1 seul agent IA dont tu suggéreras :
+    * Nom
+    * Rôle
+    * Objectifs
+    * Outils utilisés par l'agent
+    * Tâches réalisées par l'agents
+    * Compétences de l'agent (backstory)
+
+    Sol. B : 1 équipe d'agents tu suggéreras :
+    * Le nombre d'agents
+    * Pour chacune d'eux [Nom, Rôle, Objectifs, Outils utilisés par l'agent, Tâches réalisées par l'agents, Compétences de l'agent (backstory)]
+
+    Une fois ce travail réalisé, tu proposes une série de 3 missions avec objectifs SMART pour chacun des agents Sol. A et Sol. B en présentation les résultats dans un tableau contenant :
+    Nom de l’agent
+    Objectifs à atteindre
+    '''
+
+    #display prompt and modify it
+    prompt_modified = st.text_area("Prompt", prompt, height=300)
+    prompt_modified = dedent(prompt_modified)
+    if st.button("Générer les recommandations"):
+        resource_prompt = f'''Ressources fournies par l'utilisateur :{ressources}'''
+        prompt_modified = f"{prompt_modified}\n{resource_prompt}"
+        st.session_state.response_llm = st.write_stream(generate_response_via_langchain(query=prompt_modified,stream=True))
+    elif st.session_state.response_llm:
+        st.info("la dernière réponse générée est affichée ci-dessous")
+        st.write(st.session_state.response_llm)
 
-
+recommended_agent_main()
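For context, a minimal self-contained sketch of the image-stripping step used in recommended_agent_main(); the per-page keys ("texte", "images", "liens", "tableaux") are assumed from the content shape audit_descriptif_pdf builds in this commit, and the sample values are invented:

# Hypothetical content dict mirroring what st.session_state.audit["content"] holds for a PDF.
content = {
    "page_0": {"texte": "Intro", "images": [b"raw-bytes"], "liens": [], "tableaux": []},
    "page_1": {"texte": "Suite", "images": [], "liens": [], "tableaux": []},
}

# Same comprehension as remove_images_from_content(): drop only the "images" key on each page.
filtered = {page: {k: v for k, v in data.items() if k != "images"}
            for page, data in content.items()}

print(list(filtered["page_0"].keys()))  # ['texte', 'liens', 'tableaux']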
audit_page/audit.py
CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pymupdf as fitz
 import pyperclip
-from utils.audit.audit_doc import audit_descriptif_pdf,audit_text
+from utils.audit.audit_doc import audit_descriptif_pdf,audit_text
 import dotenv
 from utils.audit.audit_audio import evaluate_audio_quality
 from PIL import Image
@@ -26,15 +26,16 @@ def classify_file(file):
 
 #display content
 def display_content_doc(content:dict,col:st):
-
     number_of_pages = len(content)
-    col.info("
+    col.info("Note : Si vous choisissez 0, vous verrez le contenu de toutes les pages")
 
     number = col.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")
     #0 means all pages
     if number > 0:
         page : dict = content[f"page_{number-1}"]
+
     option = col.radio("Type de contenu",list(content[f"page_0"].keys()), index=0,horizontal=True)
+
     if option == "images":
         if number == 0:
             images = [img for page in content.values() for img in page["images"]]
@@ -62,55 +63,132 @@ def display_content_doc(content:dict,col:st):
             links = [link for page in content.values() for link in page["liens"]]
         else:
             links = page["liens"]
+
         for i, link in enumerate(links):
-            col.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
+            col.markdown(f"- {i+1}: [{link['uri']}]({link["uri"]}) (page {link['page']})")
+
+    elif option == "tableaux":
+        if number == 0:
+            tables = [table for page in content.values() for table in page["tableaux"]]
+        else:
+            tables = page["tableaux"]
 
+        for i, table in enumerate(tables):
+            col.write(f"Tableau {i+1}")
+            col.write(table)
+
+def display_content_audio(content:dict,col:st):
+    st.write("##### Transcription")
+    st.write(content["transcription"])
+    if st.button("📋",key="copy_transcription"):
+        pyperclip.copy(content["transcription"])
+        st.success("Transcription copiée dans le presse-papier")
+
+    st.audio(content["audio_data"],sample_rate=content["frame_rate"]*2)
 
+def display_content_text(content,col:st):
+    st.text_area("Texte",content,height=200)
+
+def handle_display_content(col:st):
+    audit = st.session_state.audit
+    type = st.session_state.audit_simplified["type de fichier"]
+    if type == "pdf":
+        with col.expander("Contenu"):
+            display_content_doc(audit["content"],st)
+    elif type == "audio":
+        with col.expander("Contenu"):
+            display_content_audio(audit["content"],col)
+    elif type == "text":
+        with col.expander("Contenu"):
+            display_content_text(audit["content"],col)
+
+
+
+def handle_audit(uploaded_file,type:str):
+    if type == "pdf":
+        if st.session_state.name_file != uploaded_file.name:
+            st.session_state.name_file = uploaded_file.name
+            with st.spinner("Analyse du document..."):
+                st.session_state.audit = audit_descriptif_pdf(uploaded_file,100)
+        audit = st.session_state.audit["audit"]
+        #global audit
+        audit_simplified = {
+            "type de fichier": type,
+            "Nombre de pages": audit["number_of_pages"],
+            "Nombre d'images": audit["number_of_images"],
+            "Nombre de liens": audit["number_of_links"],
+            "Nombre de tableaux": audit["number_of_tables"],
+            "Nombre de tokens": audit["number_of_tokens"],
+            "Nombre de mots": audit["number_of_words"],
+            "Mots clés": audit["key_words"]
+        }
+        st.session_state.audit_simplified = audit_simplified
 
+    elif type == "audio":
+        if st.session_state.name_file != uploaded_file.name:
+            st.session_state.name_file = uploaded_file.name
+            with st.spinner("Analyse de l'audio..."):
+                st.session_state.audit = evaluate_audio_quality(uploaded_file)
+        audit = st.session_state.audit["audit"]
+        #audit global simplifié
+        audit_simplified = {
+            "type de fichier": type,
+            "Durée": f"{audit['duration']:0.2f} minutes",
+            "Nombre de mots": audit["number_of_words"],
+            "Nombre de tokens": audit["number_of_tokens"],
+            "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
+            "SNR": f"{max(audit['SNR'],0):0.2f} dB (Ratio Signal / Bruit)",
+        }
+        st.session_state.audit_simplified = audit_simplified
+
+    elif type == "text":
+        text = uploaded_file.read().decode("utf-8")
+        if st.session_state.name_file != uploaded_file.name:
+            st.session_state.name_file = uploaded_file.name
+            with st.spinner("Analyse du texte..."):
+                st.session_state.audit = audit_text(text)
+        audit = st.session_state.audit["audit"]
+        #audit global simplifié
+        audit_simplified = {
+            "type de fichier": type,
+            "Nombre de tokens": audit["number_of_tokens"],
+            "Nombre de mots": audit["number_of_words"]
+        }
+        st.session_state.audit_simplified = audit_simplified
+
+
 
-
-
-
-
-    st.session_state.audit = audit_descriptif_pdf(uploaded_file,200)
+
+def display_audit(col:st):
+    #audit global simplifié
+    audit_simplified = st.session_state.audit_simplified
     audit = st.session_state.audit["audit"]
-    content = st.session_state.audit["content"]
-    #global audit
-    audit_simplified = {
-        "Nombre de pages": audit["number_of_pages"],
-        "Nombre d'images": audit["number_of_images"],
-        "Nombre de liens": audit["number_of_links"],
-        "Nombre de tableaux": audit["number_of_tables"],
-        "Nombre de tokens": audit["number_of_tokens"],
-        "Nombre de mots": audit["number_of_words"],
-        "Mots clés": audit["key_words"]
-    }
 
     well_formatted_audit = "Contenus audités\n"
     for key, value in audit_simplified.items():
         well_formatted_audit += f"- {key}: {value}\n"
 
-
     col.code(well_formatted_audit)
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-    well_formatted_audit_page
-
-
+    if audit_simplified["type de fichier"] == "pdf": #cad un type qui contient des pages
+        #audit par page
+        with col.expander("Audit par page"):
+            number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
+            audit_page = audit[f"page_{number-1}"]
+            audit_page = {
+
+                "Nombre d'images": audit_page["number_of_images"],
+                "Nombre de liens": audit_page["number_of_links"],
+                "Nombre de tableaux": audit_page["number_of_tables"],
+                "Nombre de tokens": audit_page["number_of_tokens"],
+                "Nombre de mots": audit_page["number_of_words"],
+            }
+            well_formatted_audit_page = "Audit descriptif\n"
+            for key, value in audit_page.items():
+                well_formatted_audit_page += f"- {key}: {value}\n"
+
+            st.code(well_formatted_audit_page)
 
-    return content
 
 
 def audit_main():
@@ -128,97 +206,22 @@ def audit_main():
         st.session_state.audit = {}
     if "name_file" not in st.session_state:
         st.session_state.name_file = ""
+    if "audit_simplified" not in st.session_state:
+        st.session_state.audit_simplified = {}
 
     # File uploader
     uploaded_file = col1.file_uploader("Télécharger un ou plusieurs documents")
 
     if uploaded_file is not None:
         type = classify_file(uploaded_file)
+        handle_audit(uploaded_file,type)
 
         col1.write(f"Type de fichier: {type}")
 
-        col1.write("### Synthèse audit
-
-
-
-        if type == "pdf":
-            content = display_audit_pdf(uploaded_file,col1)
-            with col2.expander("Contenu"):
-                display_content_doc(content,st)
-
-        elif type == "audio":
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse de l'audio..."):
-                    st.session_state.audit = evaluate_audio_quality(uploaded_file)
-            audit = st.session_state.audit
-
-            #audit global simplifié
-            audit_simplified = {
-                "Durée": f"{audit['duration']:0.2f} minutes",
-                "Nombre de mots": audit["number_of_words"],
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
-                "SNR": f"{max(audit['SNR'],0):0.2f} dB (Ratio Signal / Bruit)",
-            }
-
-            well_formatted_audit = "Contenus audités\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            col1.code(well_formatted_audit)
-
-            with col2.expander("Transcription"):
-                st.write(audit["transcription"])
-                if st.button("📋",key="copy_transcription"):
-                    pyperclip.copy(audit["transcription"])
-                    st.success("Transcription copiée dans le presse-papier")
-
-        elif type == "text":
-            text = uploaded_file.read().decode("utf-8")
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse du texte..."):
-                    st.session_state.audit = audit_text(text)
-            audit = st.session_state.audit
-
-            #audit global simplifié
-            audit_simplified = {
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Nombre de mots": audit["number_of_words"]
-            }
-
-            well_formatted_audit = "Audit descriptif\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            col1.code(well_formatted_audit)
-
-            with col2.expander("Texte"):
-                st.text_area("Texte",text,height=200)
-
-        elif type == "word":
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse du document..."):
-                    st.session_state.audit = audit_descriptif_word(uploaded_file)
-            audit = st.session_state.audit
-
-            #global audit
-            audit_simplified = {
-                "Nombre de pages": audit["number_of_paragraphs"],
-                "Nombre d'images": audit["number_of_images"],
-                "Nombre de liens": audit["number_of_links"],
-                "Nombre de tableaux": audit["number_of_tables"],
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Nombre de mots": audit["number_of_words"]
-            }
-
-            well_formatted_audit = "Contenus audités\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            st.code(well_formatted_audit)
+        col1.write("### Synthèse audit de(s) document(s) téléchargé(s)")
 
+    if "audit" in st.session_state and st.session_state.audit != {}:
+        display_audit(col1)
+        handle_display_content(col2)
 
 audit_main()
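As a quick reference, a hedged sketch of the session_state contract the new handle_audit / display_audit / handle_display_content split relies on; every field value below is an illustrative placeholder, not output from a real run:

# handle_audit() stores two entries; the display helpers only read them back.
audit_state = {                          # what goes into st.session_state.audit
    "audit": {"number_of_pages": 2, "number_of_images": 1, "number_of_links": 0,
              "number_of_tables": 1, "number_of_tokens": 512, "number_of_words": 400},
    "content": {"page_0": {"texte": "...", "images": [], "liens": [], "tableaux": []}},
}
audit_simplified = {                     # what goes into st.session_state.audit_simplified
    "type de fichier": "pdf",
    "Nombre de pages": audit_state["audit"]["number_of_pages"],
}

# display_audit() renders audit_simplified as a code block, and
# handle_display_content() dispatches on audit_simplified["type de fichier"].
for key, value in audit_simplified.items():
    print(f"- {key}: {value}")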
requirements.txt
CHANGED
@@ -11,3 +11,4 @@ nltk
 rake_nltk
 python-docx
 pillow
+pandas
utils/audit/audit_audio.py
CHANGED
@@ -39,7 +39,7 @@ def evaluate_audio_quality(file) -> dict:
     audio_data = np.array(audio.get_array_of_samples())
 
     #number of minutes
-    duration = len(audio_data) / audio.frame_rate / 60
+    duration = len(audio_data) / audio.frame_rate*2 / 60
 
     # Calculate volume
     volume = audio.dBFS
@@ -49,6 +49,24 @@ def evaluate_audio_quality(file) -> dict:
 
     #get the transcription of the audio
     transcription = transcript_audio_func(file)
-
-    return {"volume": volume, "SNR": snr,"transcription": transcription,"number_of_tokens": count_tokens(transcription),"duration": duration, "number_of_words": len(transcription.split())}
 
+    audit = {
+        "volume": volume,
+        "SNR": snr,
+        "duration": duration,
+        "number_of_tokens": count_tokens(transcription),
+        "number_of_words": len(transcription.split())
+    }
+
+    content = {
+        "transcription": transcription,
+        "audio_data": audio_data,
+        "frame_rate": audio.frame_rate
+    }
+
+    audit_global = {
+        "audit": audit,
+        "content": content
+    }
+
+    return audit_global
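For orientation, a toy sketch of the audit/content split that evaluate_audio_quality now returns; the signal, rates, and metric values are synthetic stand-ins, not real measurements:

import numpy as np

frame_rate = 16000
audio_data = np.zeros(frame_rate * 3, dtype=np.int16)   # ~3 seconds of silence as a stand-in

audit_global = {
    "audit":   {"volume": -20.0, "SNR": 12.5,
                "duration": len(audio_data) / frame_rate / 60,
                "number_of_tokens": 0, "number_of_words": 0},
    "content": {"transcription": "", "audio_data": audio_data, "frame_rate": frame_rate},
}

# audit.py consumes the two halves separately: "audit" feeds the simplified summary,
# while "content" feeds display_content_audio() / st.audio().
print(f"{audit_global['audit']['duration']:.2f} minutes")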
utils/audit/audit_doc.py
CHANGED
@@ -98,7 +98,8 @@ def audit_descriptif_pdf(file,max_img_width) -> dict:
         page_content = {
             "images": [],
             "texte": "",
-            "liens": []
+            "liens": [],
+            "tableaux": []
         }
 
         #number of images
@@ -140,7 +141,10 @@ def audit_descriptif_pdf(file,max_img_width) -> dict:
         audit_dict_doc["number_of_links"] += number_links
 
         #number of tables
-
+        tables = page.find_tables().tables
+        number_tables = len(tables)
+        for tab in tables:
+            page_content["tableaux"].append(tab.to_pandas())
         audit_dict_page["number_of_tables"] = number_tables
         audit_dict_doc["number_of_tables"] += number_tables
 
@@ -170,7 +174,8 @@ def audit_descriptif_pdf(file,max_img_width) -> dict:
     - {list_key_words_text}
     Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
 
-    REPONSE:
+    TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
+    key_word1, key_word2, key_word3, key_word4, key_word5
     '''
     key_words_extracted = extract_relevant_keywords(prompt)
     audit_dict_doc["key_words"] = "\n" + key_words_extracted
@@ -186,45 +191,14 @@
 def audit_text(text: str) -> dict:
     audit_dict = {
         "number_of_tokens": count_tokens(text),
-        "number_of_words": len(text.split())
+        "number_of_words": len(text.split()),
     }
 
-
-
-
-    document = Document(io.BytesIO(file.read()))
-
-    audit_dict_doc = {
-        "number_of_paragraphs": 0,
-        "number_of_images": 0,
-        "number_of_links": 0,
-        "number_of_tables": 0,
-        "number_of_tokens": 0,
-        "number_of_words": 0
+    global_audit = {
+        "audit": audit_dict,
+        "content": text
     }
 
-
-        audit_dict_doc["number_of_paragraphs"] += 1
-        text = para.text
-
-        # Count tokens and words in the paragraph
-        number_tokens = count_tokens(text)
-        number_words = len(text.split())
-
-        audit_dict_doc["number_of_tokens"] += number_tokens
-        audit_dict_doc["number_of_words"] += number_words
-
-        # Count links (assuming they are hyperlinks)
-        for run in para.runs:
-            if run.link:
-                audit_dict_doc["number_of_links"] += 1
-
-    for table in document.tables:
-        audit_dict_doc["number_of_tables"] += 1
-
-    # Counting images (inline shapes and pictures)
-    for shape in document.inline_shapes:
-        audit_dict_doc["number_of_images"] += 1
-
-    return audit_dict_doc
+    return global_audit
 
 
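A standalone sketch of the table-extraction step added to audit_descriptif_pdf; "sample.pdf" is a placeholder path, and find_tables() / Table.to_pandas() assume a recent PyMuPDF, which is also why pandas joins requirements.txt in this commit:

import pymupdf as fitz  # PyMuPDF, imported the same way elsewhere in this repo

doc = fitz.open("sample.pdf")            # placeholder document
page_content = {"tableaux": []}

page = doc[0]
tables = page.find_tables().tables       # tables detected on the first page
for tab in tables:
    page_content["tableaux"].append(tab.to_pandas())  # one pandas DataFrame per table

print(f"{len(tables)} table(s) found on page 1")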
utils/audit/response_llm.py
ADDED
@@ -0,0 +1,35 @@
+from openai import OpenAI
+from langchain_openai import ChatOpenAI
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+
+def generate_response_openai(prompt: str,stream:bool = False) -> str:
+    client = OpenAI()
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "user", "content": prompt}
+        ],
+        stream=stream
+    )
+
+    return response.choices[0].message.content
+
+
+def generate_response_via_langchain(query: str, stream: bool = False, model: str = "gpt-4o-mini") :
+    # Define the prompt template
+    template = "You are an agent designer expert: {query}"
+    prompt = PromptTemplate.from_template(template)
+
+    # Initialize the OpenAI LLM with the specified model
+    llm = ChatOpenAI(model=model)
+
+    # Create an LLM chain with the prompt and the LLM
+    llm_chain = prompt | llm | StrOutputParser()
+
+    if stream:
+        # Return a generator that yields streamed responses
+        return llm_chain.stream({"query": query})
+
+    # Invoke the LLM chain and return the result
+    return llm_chain.invoke({"query": query})
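A short usage sketch of the new helper, mirroring how recommended_agent.py consumes it; it assumes this repo's package layout and an OPENAI_API_KEY set in the environment:

from utils.audit.response_llm import generate_response_via_langchain

# Non-streaming call: returns the full completion as a single string.
answer = generate_response_via_langchain("Suggère un agent pour exploiter un audit de PDF.")
print(answer)

# Streaming call: returns a generator of text chunks, which Streamlit renders
# incrementally via st.write_stream(generate_response_via_langchain(query=..., stream=True)).
for chunk in generate_response_via_langchain("Même demande, en flux.", stream=True):
    print(chunk, end="", flush=True)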