Ilyas KHIAT committed on
Commit
bc557f4
·
1 Parent(s): 62a5ad1

more details content

Browse files
app.py CHANGED
@@ -116,25 +116,28 @@ def main():
116
  # Streamlit app
117
  st.title("AUDIT DES DOCUMENTS")
118
 
 
 
 
 
119
  if "audit" not in st.session_state:
120
  st.session_state.audit = {}
121
  if "name_file" not in st.session_state:
122
  st.session_state.name_file = ""
123
 
124
  # File uploader
125
- uploaded_file = st.file_uploader("Télécharger un ou plusieurs des documents")
126
 
127
  if uploaded_file is not None:
128
  type = classify_file(uploaded_file)
129
 
130
  st.write(f"Type de fichier: {type}")
131
 
132
- st.write("### Audit de tout le document")
133
 
134
  if type == "pdf":
135
  display_audit_pdf(uploaded_file)
136
 
137
-
138
  elif type == "audio":
139
  if st.session_state.name_file != uploaded_file.name:
140
  st.session_state.name_file = uploaded_file.name
@@ -179,28 +182,28 @@ def main():
179
 
180
  st.code(well_formatted_audit)
181
 
182
- elif type == "word":
183
- if st.session_state.name_file != uploaded_file.name:
184
- st.session_state.name_file = uploaded_file.name
185
- with st.spinner("Analyse du document..."):
186
- st.session_state.audit = audit_descriptif_word(uploaded_file)
187
- audit = st.session_state.audit
188
-
189
- #global audit
190
- audit_simplified = {
191
- "Nombre de pages": audit["number_of_paragraphs"],
192
- "Nombre d'images": audit["number_of_images"],
193
- "Nombre de liens": audit["number_of_links"],
194
- "Nombre de tableaux": audit["number_of_tables"],
195
- "Nombre de tokens": audit["number_of_tokens"],
196
- "Nombre de mots": audit["number_of_words"]
197
- }
198
 
199
- well_formatted_audit = "Contenus audités\n"
200
- for key, value in audit_simplified.items():
201
- well_formatted_audit += f"- {key}: {value}\n"
202
 
203
- st.code(well_formatted_audit)
204
 
205
 
206
  if __name__ == "__main__":
 
116
  # Streamlit app
117
  st.title("AUDIT DES DOCUMENTS")
118
 
119
+ notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
120
+
121
+ st.markdown(notice)
122
+
123
  if "audit" not in st.session_state:
124
  st.session_state.audit = {}
125
  if "name_file" not in st.session_state:
126
  st.session_state.name_file = ""
127
 
128
  # File uploader
129
+ uploaded_file = st.file_uploader("Télécharger un ou plusieurs documents")
130
 
131
  if uploaded_file is not None:
132
  type = classify_file(uploaded_file)
133
 
134
  st.write(f"Type de fichier: {type}")
135
 
136
+ st.write("### Synthèse audit du ou des document(s) téléchargé(s)")
137
 
138
  if type == "pdf":
139
  display_audit_pdf(uploaded_file)
140
 
 
141
  elif type == "audio":
142
  if st.session_state.name_file != uploaded_file.name:
143
  st.session_state.name_file = uploaded_file.name
 
182
 
183
  st.code(well_formatted_audit)
184
 
185
+ elif type == "word":
186
+ if st.session_state.name_file != uploaded_file.name:
187
+ st.session_state.name_file = uploaded_file.name
188
+ with st.spinner("Analyse du document..."):
189
+ st.session_state.audit = audit_descriptif_word(uploaded_file)
190
+ audit = st.session_state.audit
191
+
192
+ #global audit
193
+ audit_simplified = {
194
+ "Nombre de pages": audit["number_of_paragraphs"],
195
+ "Nombre d'images": audit["number_of_images"],
196
+ "Nombre de liens": audit["number_of_links"],
197
+ "Nombre de tableaux": audit["number_of_tables"],
198
+ "Nombre de tokens": audit["number_of_tokens"],
199
+ "Nombre de mots": audit["number_of_words"]
200
+ }
201
 
202
+ well_formatted_audit = "Contenus audités\n"
203
+ for key, value in audit_simplified.items():
204
+ well_formatted_audit += f"- {key}: {value}\n"
205
 
206
+ st.code(well_formatted_audit)
207
 
208
 
209
  if __name__ == "__main__":
utils/audit/audit_doc.py CHANGED
@@ -54,10 +54,12 @@ def evaluate_text_quality(text: str) -> dict:
54
  global_score_0_5 = global_score * 5
55
 
56
  def extract_keywords(text):
57
- rake = Rake(stopwords.words('english'))
58
  rake.extract_keywords_from_text(text)
59
  return rake.get_ranked_phrases()
60
 
 
 
61
  def count_tokens(input_string: str) -> int:
62
  tokenizer = tiktoken.get_encoding("cl100k_base")
63
  tokens = tokenizer.encode(input_string)
@@ -169,12 +171,6 @@ def audit_text(text: str) -> dict:
169
 
170
  return audit_dict
171
 
172
-
173
-
174
- def count_tokens(text):
175
- # Implement a token counting method. Here, we assume tokens are words.
176
- return len(text.split())
177
-
178
  def audit_descriptif_word(file) -> dict:
179
  document = Document(io.BytesIO(file.read()))
180
 
 
54
  global_score_0_5 = global_score * 5
55
 
56
  def extract_keywords(text):
57
+ rake = Rake(stopwords.words('french'))
58
  rake.extract_keywords_from_text(text)
59
  return rake.get_ranked_phrases()
60
 
61
+
62
+
63
  def count_tokens(input_string: str) -> int:
64
  tokenizer = tiktoken.get_encoding("cl100k_base")
65
  tokens = tokenizer.encode(input_string)
 
171
 
172
  return audit_dict
173
 
 
 
 
 
 
 
174
  def audit_descriptif_word(file) -> dict:
175
  document = Document(io.BytesIO(file.read()))
176
 
utils/audit/transcript_audio.py CHANGED
@@ -1,8 +1,5 @@
1
  from openai import OpenAI
2
 
3
-
4
-
5
-
6
  def transcript_audio_func(audio_file):
7
  client = OpenAI()
8
  transcription = client.audio.transcriptions.create(
 
1
  from openai import OpenAI
2
 
 
 
 
3
  def transcript_audio_func(audio_file):
4
  client = OpenAI()
5
  transcription = client.audio.transcriptions.create(