Spaces:
Running
Running
Ilyas KHIAT
committed on
Commit
·
bc557f4
1
Parent(s):
62a5ad1
more details content
Browse files
- app.py +26 -23
- utils/audit/audit_doc.py +3 -7
- utils/audit/transcript_audio.py +0 -3
app.py
CHANGED
@@ -116,25 +116,28 @@ def main():
|
|
116 |
# Streamlit app
|
117 |
st.title("AUDIT DES DOCUMENTS")
|
118 |
|
|
|
|
|
|
|
|
|
119 |
if "audit" not in st.session_state:
|
120 |
st.session_state.audit = {}
|
121 |
if "name_file" not in st.session_state:
|
122 |
st.session_state.name_file = ""
|
123 |
|
124 |
# File uploader
|
125 |
-
uploaded_file = st.file_uploader("Télécharger un ou plusieurs
|
126 |
|
127 |
if uploaded_file is not None:
|
128 |
type = classify_file(uploaded_file)
|
129 |
|
130 |
st.write(f"Type de fichier: {type}")
|
131 |
|
132 |
-
st.write("###
|
133 |
|
134 |
if type == "pdf":
|
135 |
display_audit_pdf(uploaded_file)
|
136 |
|
137 |
-
|
138 |
elif type == "audio":
|
139 |
if st.session_state.name_file != uploaded_file.name:
|
140 |
st.session_state.name_file = uploaded_file.name
|
@@ -179,28 +182,28 @@ def main():
|
|
179 |
|
180 |
st.code(well_formatted_audit)
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
|
203 |
-
|
204 |
|
205 |
|
206 |
if __name__ == "__main__":
|
|
|
116 |
# Streamlit app
|
117 |
st.title("AUDIT DES DOCUMENTS")
|
118 |
|
119 |
+
notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
|
120 |
+
|
121 |
+
st.markdown(notice)
|
122 |
+
|
123 |
if "audit" not in st.session_state:
|
124 |
st.session_state.audit = {}
|
125 |
if "name_file" not in st.session_state:
|
126 |
st.session_state.name_file = ""
|
127 |
|
128 |
# File uploader
|
129 |
+
uploaded_file = st.file_uploader("Télécharger un ou plusieurs documents")
|
130 |
|
131 |
if uploaded_file is not None:
|
132 |
type = classify_file(uploaded_file)
|
133 |
|
134 |
st.write(f"Type de fichier: {type}")
|
135 |
|
136 |
+
st.write("### Synthèse audit du ou des document(s) téléchargé(s)")
|
137 |
|
138 |
if type == "pdf":
|
139 |
display_audit_pdf(uploaded_file)
|
140 |
|
|
|
141 |
elif type == "audio":
|
142 |
if st.session_state.name_file != uploaded_file.name:
|
143 |
st.session_state.name_file = uploaded_file.name
|
|
|
182 |
|
183 |
st.code(well_formatted_audit)
|
184 |
|
185 |
+
elif type == "word":
|
186 |
+
if st.session_state.name_file != uploaded_file.name:
|
187 |
+
st.session_state.name_file = uploaded_file.name
|
188 |
+
with st.spinner("Analyse du document..."):
|
189 |
+
st.session_state.audit = audit_descriptif_word(uploaded_file)
|
190 |
+
audit = st.session_state.audit
|
191 |
+
|
192 |
+
#global audit
|
193 |
+
audit_simplified = {
|
194 |
+
"Nombre de pages": audit["number_of_paragraphs"],
|
195 |
+
"Nombre d'images": audit["number_of_images"],
|
196 |
+
"Nombre de liens": audit["number_of_links"],
|
197 |
+
"Nombre de tableaux": audit["number_of_tables"],
|
198 |
+
"Nombre de tokens": audit["number_of_tokens"],
|
199 |
+
"Nombre de mots": audit["number_of_words"]
|
200 |
+
}
|
201 |
|
202 |
+
well_formatted_audit = "Contenus audités\n"
|
203 |
+
for key, value in audit_simplified.items():
|
204 |
+
well_formatted_audit += f"- {key}: {value}\n"
|
205 |
|
206 |
+
st.code(well_formatted_audit)
|
207 |
|
208 |
|
209 |
if __name__ == "__main__":
|
utils/audit/audit_doc.py
CHANGED
@@ -54,10 +54,12 @@ def evaluate_text_quality(text: str) -> dict:
|
|
54 |
global_score_0_5 = global_score * 5
|
55 |
|
56 |
def extract_keywords(text):
|
57 |
-
rake = Rake(stopwords.words('
|
58 |
rake.extract_keywords_from_text(text)
|
59 |
return rake.get_ranked_phrases()
|
60 |
|
|
|
|
|
61 |
def count_tokens(input_string: str) -> int:
|
62 |
tokenizer = tiktoken.get_encoding("cl100k_base")
|
63 |
tokens = tokenizer.encode(input_string)
|
@@ -169,12 +171,6 @@ def audit_text(text: str) -> dict:
|
|
169 |
|
170 |
return audit_dict
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
def count_tokens(text):
|
175 |
-
# Implement a token counting method. Here, we assume tokens are words.
|
176 |
-
return len(text.split())
|
177 |
-
|
178 |
def audit_descriptif_word(file) -> dict:
|
179 |
document = Document(io.BytesIO(file.read()))
|
180 |
|
|
|
54 |
global_score_0_5 = global_score * 5
|
55 |
|
56 |
def extract_keywords(text):
|
57 |
+
rake = Rake(stopwords.words('french'))
|
58 |
rake.extract_keywords_from_text(text)
|
59 |
return rake.get_ranked_phrases()
|
60 |
|
61 |
+
|
62 |
+
|
63 |
def count_tokens(input_string: str) -> int:
|
64 |
tokenizer = tiktoken.get_encoding("cl100k_base")
|
65 |
tokens = tokenizer.encode(input_string)
|
|
|
171 |
|
172 |
return audit_dict
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
def audit_descriptif_word(file) -> dict:
|
175 |
document = Document(io.BytesIO(file.read()))
|
176 |
|
utils/audit/transcript_audio.py
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
from openai import OpenAI
|
2 |
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
def transcript_audio_func(audio_file):
|
7 |
client = OpenAI()
|
8 |
transcription = client.audio.transcriptions.create(
|
|
|
1 |
from openai import OpenAI
|
2 |
|
|
|
|
|
|
|
3 |
def transcript_audio_func(audio_file):
|
4 |
client = OpenAI()
|
5 |
transcription = client.audio.transcriptions.create(
|