Salif SAWADOGO committed on
Commit
7204409
·
1 Parent(s): f27ce21

⚡️ improve code quality and use dcc instead global variable

Browse files
.gitignore CHANGED
@@ -1,7 +1,7 @@
1
  .venv
2
  **__pycache__**
3
  **.pyc**
4
- **/env/**
5
  **/.venv/**
6
  **/*.egg-info
7
  **/*parquet
 
1
  .venv
2
  **__pycache__**
3
  **.pyc**
4
+ **/*env/**
5
  **/.venv/**
6
  **/*.egg-info
7
  **/*parquet
app/global_vars.py CHANGED
@@ -4,7 +4,6 @@ from clients import s3_loader
4
  from utils import extract_audio_identifier
5
 
6
  DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"
7
- audio_paths, possible_values = [], []
8
  data = load_dataset(DATA_FILE, split="train").to_pandas()
9
  data[["chapter", "page"]] = data["moore_source_url"].apply(
10
  lambda x: pd.Series(extract_audio_identifier(x))
 
4
  from utils import extract_audio_identifier
5
 
6
  DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"
 
7
  data = load_dataset(DATA_FILE, split="train").to_pandas()
8
  data[["chapter", "page"]] = data["moore_source_url"].apply(
9
  lambda x: pd.Series(extract_audio_identifier(x))
app/helpers/__init__.py ADDED
File without changes
app/helpers/abstracts.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import List, Optional

import pandas as pd
4
+
5
class AbstractS3Client(ABC):
    """Interface for the S3 operations used by the application."""

    @abstractmethod
    def upload_file(self, local_path: str, s3_key: str) -> None:
        """Upload the local file at *local_path* to S3 under *s3_key*."""

    @abstractmethod
    def download_file(self, local_path: str, s3_key: str) -> None:
        """Download the S3 object *s3_key* to the local file *local_path*."""

    @abstractmethod
    def list_files(self, prefix: str = "") -> List[str]:
        """Return the keys of the S3 objects found under *prefix*."""

    @abstractmethod
    def load_json_files(
        self,
        files: List[str],
        unique_columns: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Load the given JSON files and combine them into one DataFrame.

        *unique_columns* is optional; it is annotated ``Optional`` so the
        ``None`` default is explicit in the signature.
        """
25
+
26
class AbstractProcessor(ABC):
    """Interface for the audio-path and text helpers used by the application."""

    @abstractmethod
    def get_audio_paths(self, folder: str) -> list[str]:
        """Return the sorted audio paths found in *folder*."""

    @abstractmethod
    def process_text(self, text: str) -> str:
        """Clean and process *text*."""

    @abstractmethod
    def splitter(self, text: str) -> list[str]:
        """Split *text* into segments."""
app/helpers/models.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # helpers/models.py
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
@dataclass
class S3Config:
    """Configuration for S3 connection."""

    # Name of the bucket every operation targets.
    bucket_name: str
    # Endpoint URL of the S3-compatible service.
    endpoint_url: str
    # Credentials; deliberately masked in __repr__ (see below).
    access_key: str
    secret_key: str
    # Optional region; None means "do not pass a region".
    region_name: Optional[str] = None

    def __repr__(self) -> str:
        """Return a representation that never exposes the credentials.

        The dataclass-generated repr would print ``access_key`` and
        ``secret_key`` verbatim, which leaks them as soon as a config object
        is logged or shown in a traceback. ``@dataclass`` keeps a
        user-defined ``__repr__``, so only the masking differs.
        """
        return (
            f"{type(self).__name__}("
            f"bucket_name={self.bucket_name!r}, "
            f"endpoint_url={self.endpoint_url!r}, "
            f"access_key='***', "
            f"secret_key='***', "
            f"region_name={self.region_name!r})"
        )
app/helpers/processor.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import os
4
+ import urllib.parse
5
+ from pathlib import Path
6
+ import pandas as pd
7
+ from loguru import logger
8
+ from .abstracts import AbstractProcessor
9
+ from .s3 import S3Client # Si besoin d'utiliser des fonctions S3
10
+
11
class Processor(AbstractProcessor):
    """Text, audio-path and persistence helpers used by the Dash callbacks."""

    # Local file that receives the transcription history downloaded from S3.
    # It carries the same name as the S3 object ("results.json") so the history
    # is found again by code that reads the persisted results under that name.
    _HISTORY_FILE = "results.json"

    def get_audio_paths(self, folder: str) -> list[str]:
        """Return the ``*.mp3`` paths of *folder*, ordered by segment number.

        Paths are returned in POSIX form. Files whose path contains no
        ``segment_<n>`` marker sort last; ties are broken by path so the order
        is deterministic. The first three entries of the sorted list are
        dropped.
        NOTE(review): presumably the first three segments are not meant to be
        transcribed - confirm.
        """

        def sort_key(file_path: str) -> tuple:
            match = re.search(r"segment_(\d+)", file_path)
            number = int(match.group(1)) if match else float("inf")
            return number, file_path

        audio_paths = sorted(
            (path.as_posix() for path in Path(folder).glob("*.mp3")),
            key=sort_key,
        )
        return audio_paths[3:]

    def process_text(self, text: str) -> str:
        """Strip ``+`` / ``*`` markers and double quotes from *text*."""
        text = re.sub(r"\+\s*\.", ".", text)
        text = re.sub(r"\*\s*\+\s*;", ";", text)
        text = re.sub(r"\*\s*\+", "", text)
        text = text.replace(" + ", " ").replace(" * ", " ").replace("+", " ")
        text = re.sub(r'["“”]', "", text)
        return text.strip()

    def splitter(self, text: str) -> list[str]:
        """Clean *text* and split it on ``,``, ``:``, ``;`` and ``.``."""
        return re.split(r"[,:;.]", self.process_text(text))

    def flatten_nested_values(self, nested_values: pd.Series) -> list[str]:
        """Flatten groups of strings, dropping leading digits and empty items."""
        cleaned_items = (
            re.sub(r"^\d+\s*", "", item).strip()
            for group in nested_values
            for item in group
        )
        return [item for item in cleaned_items if item]

    def load_persistent_data(self, file: str) -> list:
        """Return the JSON content of *file*, or ``[]`` if it does not exist."""
        if not os.path.exists(file):
            return []
        with open(file, "r", encoding="utf-8") as f:
            return json.load(f)

    def save_persistent_data(self, data: list, file: str) -> None:
        """Write *data* to *file* as indented, non-ASCII-escaped JSON."""
        with open(file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def extract_audio_identifier(self, url: str):
        """Return ``(unquoted second-to-last segment, int(last segment))`` of *url*."""
        parts = url.strip("/").split("/")
        return urllib.parse.unquote(parts[-2]), int(parts[-1])

    def find_and_return_after_last(self, long_list: list, short_list: list) -> list:
        """Return the items of *long_list* after the last one found in *short_list*.

        When no item of *long_list* appears in *short_list*, *long_list* itself
        is returned unchanged.
        """
        for index in range(len(long_list) - 1, -1, -1):
            if long_list[index] in short_list:
                return long_list[index + 1:]
        return long_list

    def load_page_verses_and_audios(
        self, s3_client, page: str, df_verses: pd.DataFrame
    ) -> tuple[list[str], list[str]]:
        """Return the suggestions and audio paths still to be processed for *page*.

        *page* is a path whose last two components are the chapter folder and
        a ``page_<n>`` folder; both ``/`` and ``\\`` separators are accepted.
        The transcription history of the page is downloaded through
        *s3_client*; everything up to and including the latest recorded
        segment / transcription is skipped. If the history cannot be
        retrieved or used, the full lists are returned.

        Returns:
            ``(possible_values, audio_paths)`` - in that order.
        """
        audio_paths = self.get_audio_paths(page)
        # Only the last two components matter, so deeper paths and trailing
        # separators are tolerated.
        *_, chapter, page_str = page.replace("\\", "/").strip("/").split("/")
        s3_key = f"labelling/{chapter}/{page_str}/results.json"
        page_int = int(page_str.replace("page_", ""))
        page_rows = df_verses[
            (df_verses.chapter == chapter) & (df_verses.page == page_int)
        ]
        possible_values = self.flatten_nested_values(
            page_rows["moore_verse_text"].apply(self.splitter)
        )
        try:
            s3_client.download_file(self._HISTORY_FILE, s3_key)
            transcriptions = self.load_persistent_data(self._HISTORY_FILE)
            if transcriptions:
                latest = transcriptions[-1]
                latest_transcription = latest.get("transcriptions") or []
                latest_audio = [latest.get("segment_path")]
                # Compute both results before returning so a failure cannot
                # leave one list trimmed and the other untouched.
                remaining_audio = self.find_and_return_after_last(
                    audio_paths, latest_audio
                )
                remaining_values = self.find_and_return_after_last(
                    possible_values, latest_transcription
                )
                logger.info(f"Latest transcription: {latest_audio} / {latest_transcription}")
                return remaining_values, remaining_audio
        except Exception as e:
            # Best effort: a missing or unreadable history must not block the page.
            logger.error(f"An error occurred: {e}")
        return possible_values, audio_paths

    def get_contribution_data(self, s3_client) -> pd.DataFrame:
        """Return all labelling results found on S3, sorted by chapter and page.

        ``segment_path`` is split on ``/`` into ``chapter``, ``page`` and
        ``segment`` columns (the first component is discarded). An empty
        DataFrame is returned when there is nothing to load or when loading
        or splitting fails.
        """
        files = [
            key for key in s3_client.list_files("labelling") if key.endswith("json")
        ]
        try:
            df = s3_client.load_json_files(
                files=files, unique_columns=["segment_path", "user_id"]
            )
            # Check emptiness before touching columns: an empty frame has no
            # "segment_path" column to split.
            if df.empty:
                return pd.DataFrame()
            df[["tmp1", "chapter", "page", "segment"]] = df.segment_path.str.split(
                "/", expand=True
            )
            return df.sort_values(["chapter", "page"]).drop(
                columns=["tmp1", "segment_path"]
            )
        except Exception as e:
            logger.error(f"Error in get_contribution_data: {e}")
            return pd.DataFrame()
app/helpers/s3.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import boto3
4
+ import s3fs
5
+ from typing import List, Optional
6
+ from loguru import logger
7
+ from dotenv import load_dotenv
8
+ from .models import S3Config
9
+ from .abstracts import AbstractS3Client
10
+ import pandas as pd
11
+
12
+ load_dotenv()
13
+
14
class S3Client(AbstractS3Client):
    """S3 client backed by boto3 (transfers, listing) and s3fs (file reads)."""

    def __init__(self, config: S3Config):
        """Create the boto3 client and the s3fs filesystem from *config*.

        Both are built from the same endpoint / region settings so they always
        target the same service; the region is only passed when configured.
        """
        self.config = config
        client_kwargs = {"endpoint_url": config.endpoint_url}
        if config.region_name:
            client_kwargs["region_name"] = config.region_name
        self._fs = s3fs.S3FileSystem(
            key=config.access_key,
            secret=config.secret_key,
            client_kwargs=dict(client_kwargs),
        )
        self._client = boto3.client(
            "s3",
            aws_access_key_id=config.access_key,
            aws_secret_access_key=config.secret_key,
            **client_kwargs,
        )

    def upload_file(self, local_path: str, s3_key: str) -> None:
        """Upload *local_path* to the bucket under *s3_key*.

        A missing local file is logged as an error; no exception is raised.
        """
        if not os.path.exists(local_path):
            logger.error(f"File {local_path} does not exist.")
            return
        self._client.upload_file(local_path, self.config.bucket_name, s3_key)
        logger.info(f"Uploaded {local_path} to s3://{self.config.bucket_name}/{s3_key}")

    def download_file(self, local_path: str, s3_key: str) -> None:
        """Download the object *s3_key* from the bucket to *local_path*."""
        self._client.download_file(self.config.bucket_name, s3_key, local_path)
        logger.info(f"Downloaded {s3_key} to {local_path}")

    def list_files(self, prefix: str = "") -> List[str]:
        """Return every object key of the bucket that starts with *prefix*.

        Raises:
            Exception: if listing fails; the original error is chained.
        """
        paginator = self._client.get_paginator("list_objects_v2")
        files = []
        try:
            for page in paginator.paginate(Bucket=self.config.bucket_name, Prefix=prefix):
                files.extend(obj["Key"] for obj in page.get("Contents", []))
        except Exception as e:
            raise Exception(
                f"Error listing files in bucket {self.config.bucket_name}: {str(e)}"
            ) from e
        return files

    def load_json_files(self, files: List[str], unique_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Load the JSON objects *files* and concatenate them into one DataFrame.

        Each file is normalised with ``pandas.json_normalize``. When
        *unique_columns* is given, rows duplicated on those columns are
        dropped. An empty DataFrame is returned when *files* is empty.

        Raises:
            Exception: if any file cannot be read or parsed; the original
                error is chained.
        """
        frames = []
        for file in files:
            try:
                # Text mode is requested explicitly so the encoding applies.
                with self._fs.open(
                    f"s3://{self.config.bucket_name}/{file}", mode="r", encoding="utf-8"
                ) as f:
                    frames.append(pd.json_normalize(json.load(f)))
            except Exception as e:
                raise Exception(f"Error processing file {file}: {str(e)}") from e
        if not frames:
            return pd.DataFrame()
        result = pd.concat(frames, ignore_index=True)
        if unique_columns:
            result = result.drop_duplicates(subset=unique_columns)
        return result
app/helpers/utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import urllib
2
+
3
def extract_audio_identifier(url: str) -> tuple:
    """Return ``(chapter, page)`` taken from the last two path segments of *url*.

    The second-to-last segment is percent-decoded; the last one is converted
    to ``int``. Leading and trailing slashes are ignored.

    Raises:
        IndexError: if *url* has fewer than two segments.
        ValueError: if the last segment is not an integer.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.parse``
    # submodule is loaded, so import the function explicitly.
    from urllib.parse import unquote

    parts = url.strip("/").split("/")
    return unquote(parts[-2]), int(parts[-1])
app/pages/Annotations/callbacks.py CHANGED
@@ -1,24 +1,43 @@
1
- from app import app
 
 
 
 
 
2
  from dash import Input, Output, State, no_update
3
  import datetime
4
  import os
5
  from pathlib import Path
6
- from utils import load_page_verses_and_audios, load_persistent_data, save_persistent_data
7
- from clients import upload_file_to_s3
8
- from global_vars import audio_paths , data, possible_values, BUCKET_NAME
 
 
 
 
 
9
 
 
 
 
10
  PERSIST_FILE = "results.json"
11
 
 
 
12
 
 
 
 
 
 
 
 
 
13
 
14
- def get_page_paths(chapter_path):
15
- """Retrieve all page folders in the chapter folder."""
16
- pages_dir = chapter_path
17
- if pages_dir.exists():
18
- return [d for d in pages_dir.iterdir() if d.is_dir()]
19
- return []
20
 
21
- # Step 1: Show chapter section after entering user info
 
 
22
  @app.callback(
23
  Output("chapter-section", "style"),
24
  Output("pseudo-continue-button", "style"),
@@ -26,11 +45,12 @@ def get_page_paths(chapter_path):
26
  State("user-info", "value")
27
  )
28
  def show_chapter_section(n_clicks, user_info):
 
29
  if n_clicks and user_info:
30
  return {"display": "block"}, {"display": "none"}
31
  return {"display": "none"}, {"display": "block"}
32
 
33
- # Step 2: Show page section after selecting a chapter
34
  @app.callback(
35
  Output("page-section", "style"),
36
  Output("chapter-continue-button", "style"),
@@ -38,11 +58,12 @@ def show_chapter_section(n_clicks, user_info):
38
  State("chapter-dropdown", "value")
39
  )
40
  def show_page_section(n_clicks, chapter_value):
 
41
  if n_clicks and chapter_value:
42
  return {"display": "block"}, {"display": "none"}
43
  return {"display": "none"}, {"display": "block"}
44
 
45
- # Step 3: Show transcription section after selecting a page
46
  @app.callback(
47
  Output("transcription-section", "style"),
48
  Output("start-button", "style"),
@@ -50,56 +71,67 @@ def show_page_section(n_clicks, chapter_value):
50
  State("page-dropdown", "value")
51
  )
52
  def show_transcription_section(n_clicks, page_value):
 
53
  if n_clicks and page_value:
54
  return {"display": "block"}, {"display": "none"}
55
  return {"display": "none"}, {"display": "block"}
56
 
57
- # Update page dropdown options based on selected chapter
 
 
 
58
  @app.callback(
59
  Output("page-dropdown", "options"),
60
  Input("chapter-dropdown", "value")
61
  )
62
  def update_pages(chapter_value):
 
63
  if chapter_value:
64
  chapter_path = Path(chapter_value)
65
- pages = get_page_paths(chapter_path)
66
- options = [{"label": d.name, "value": str(d)} for d in pages]
67
- return options
68
  return []
69
 
70
- # Update audio player and suggestions when a page is selected
 
 
 
71
  @app.callback(
 
 
72
  Output("audio-player", "src"),
73
  Output("suggestion-checklist", "options"),
74
- Output("hidden-message", "style"), # New output to control the hidden message
75
-
76
  Input("page-dropdown", "value"),
77
  State("chapter-dropdown", "value")
78
  )
79
-
80
  def update_audio_and_suggestions(page_value, chapter_value):
81
- global audio_paths, possible_values
82
-
83
- hidden_message_style = {"display": "none"} # Hide the hidden message
84
-
 
85
  if page_value and chapter_value:
86
- possible_values, audio_paths = load_page_verses_and_audios(page_value, data)
87
- next_options = [{"label": t, "value": t} for t in possible_values[:6]]
88
- if len(audio_paths)>0: #control end of page
89
- return audio_paths[0], next_options, hidden_message_style
90
- hidden_message_style = {"display": "block"} # Show the hidden message
91
- try:
 
92
  os.remove(PERSIST_FILE)
93
- except:
94
- pass
95
- return no_update, no_update, hidden_message_style
96
 
97
- return no_update, no_update, hidden_message_style
98
 
 
 
 
99
  @app.callback(
 
 
100
  Output("audio-player", "src", allow_duplicate=True),
101
  Output("suggestion-checklist", "options", allow_duplicate=True),
102
- Output("suggestion-checklist", "value", allow_duplicate=True), # Reset the checklist
103
  Output("confirmation-message", "children"),
104
  Output("transcription-store", "data"),
105
  Input("submit-button", "n_clicks"),
@@ -107,54 +139,51 @@ def update_audio_and_suggestions(page_value, chapter_value):
107
  State("user-info", "value"),
108
  State("page-dropdown", "value"),
109
  State("audio-player", "src"),
 
 
110
  State("transcription-store", "data"),
111
  prevent_initial_call=True
112
  )
113
- def update_transcription(n_clicks, selected_transcriptions, user_info, page_value, current_audio, stored_transcriptions):
 
 
 
 
 
 
 
114
  if n_clicks > 0 and page_value and current_audio:
115
- # Use the audio path as the unique segment identifier
116
- segment_path = current_audio
117
-
118
- # Create a new transcription entry with a timestamp
119
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
120
- entry = {
121
- "segment_path": segment_path,
 
122
  "transcriptions": selected_transcriptions,
123
  "timestamp": timestamp,
124
  "user_id": user_info
125
- }
126
- stored_transcriptions.append(entry)
127
-
128
- # Remove the current audio from the list and load the next segment
129
- if audio_paths:
130
- audio_paths.pop(0) # Remove the current audio
131
- next_audio = audio_paths[0] if audio_paths else ""
132
  else:
133
  next_audio = ""
134
-
135
- # Remove the used transcription suggestions so they don't reappear
136
- for value in selected_transcriptions:
137
- if value in possible_values:
138
- possible_values.remove(value)
139
-
140
- # Prepare the next set of suggestions
141
- next_options = [{"label": t, "value": t} for t in possible_values[:6] + ["autre transcription"]]
142
-
143
- # Build a confirmation message
144
- confirmation_message = (
145
- f"Transcriptions sélectionnées : {', '.join(selected_transcriptions)}"
146
- if selected_transcriptions else "Aucune transcription sélectionnée."
147
- )
148
-
149
-
150
- # IMPORTANT: Return an empty list for the checklist's value so that it is reset
151
- return next_audio, next_options, [], confirmation_message, stored_transcriptions
152
-
153
-
154
- return no_update, no_update, no_update, no_update, no_update
155
-
156
-
157
-
158
  @app.callback(
159
  Output("confirmation-message", "children", allow_duplicate=True),
160
  Input("save-results-button", "n_clicks"),
@@ -163,21 +192,21 @@ def update_transcription(n_clicks, selected_transcriptions, user_info, page_valu
163
  prevent_initial_call=True
164
  )
165
  def save_results(n_clicks, page_value, stored_transcriptions):
 
 
 
 
166
  if n_clicks > 0 and page_value:
167
-
168
  try:
169
- initial_transcriptions = load_persistent_data(PERSIST_FILE)
170
  except Exception as e:
171
- print(f"No existing transcription found or error downloading: {e}")
172
  initial_transcriptions = []
173
- combined_transcriptions = initial_transcriptions + stored_transcriptions
174
- save_persistent_data(combined_transcriptions, PERSIST_FILE)
175
- page_value=page_value.replace('\\', '/').replace("assets/", "")
176
- s3_key = f"labelling/{page_value}/{PERSIST_FILE}"
177
- upload_file_to_s3(PERSIST_FILE, BUCKET_NAME, s3_key)
178
-
179
- # Confirmation message
180
- confirmation_message = "Les résultats ont été sauvegardés avec succès."
181
- return confirmation_message
182
-
183
- return no_update
 
1
+ """
2
+ callbacks.py
3
+ -------------
4
+ Ce module définit les callbacks de l'application Dash en utilisant les abstractions
5
+ définies dans le sous-package helpers."""
6
+
7
  from dash import Input, Output, State, no_update
8
  import datetime
9
  import os
10
  from pathlib import Path
11
+ from loguru import logger
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
+
16
+ from helpers.processor import Processor
17
+ from helpers.s3 import S3Client
18
+ from helpers.models import S3Config
19
 
20
+ from app import app
21
+
22
+ from global_vars import data, BUCKET_NAME
23
  PERSIST_FILE = "results.json"
24
 
25
+ # Instanciation du Processor
26
+ processor = Processor()
27
 
28
+ # Instanciation du client S3 à partir de la configuration
29
+ s3_config = S3Config(
30
+ bucket_name=BUCKET_NAME,
31
+ endpoint_url=os.getenv("AWS_ENDPOINT_URL_S3"),
32
+ access_key=os.getenv("AWS_ACCESS_KEY_ID"),
33
+ secret_key=os.getenv("AWS_SECRET_ACCESS_KEY")
34
+ )
35
+ s3_client = S3Client(s3_config)
36
 
 
 
 
 
 
 
37
 
38
+ # -----------------------------------------------------------------------------
39
+ # CALLBACKS D'AFFICHAGE DES SECTIONS
40
+ # -----------------------------------------------------------------------------
41
  @app.callback(
42
  Output("chapter-section", "style"),
43
  Output("pseudo-continue-button", "style"),
 
45
  State("user-info", "value")
46
  )
47
  def show_chapter_section(n_clicks, user_info):
48
+ """Affiche la section chapitre après saisie d'une information utilisateur."""
49
  if n_clicks and user_info:
50
  return {"display": "block"}, {"display": "none"}
51
  return {"display": "none"}, {"display": "block"}
52
 
53
+
54
  @app.callback(
55
  Output("page-section", "style"),
56
  Output("chapter-continue-button", "style"),
 
58
  State("chapter-dropdown", "value")
59
  )
60
  def show_page_section(n_clicks, chapter_value):
61
+ """Affiche la section page après sélection d'un chapitre."""
62
  if n_clicks and chapter_value:
63
  return {"display": "block"}, {"display": "none"}
64
  return {"display": "none"}, {"display": "block"}
65
 
66
+
67
  @app.callback(
68
  Output("transcription-section", "style"),
69
  Output("start-button", "style"),
 
71
  State("page-dropdown", "value")
72
  )
73
  def show_transcription_section(n_clicks, page_value):
74
+ """Affiche la section transcription après sélection d'une page."""
75
  if n_clicks and page_value:
76
  return {"display": "block"}, {"display": "none"}
77
  return {"display": "none"}, {"display": "block"}
78
 
79
+
80
+ # -----------------------------------------------------------------------------
81
+ # CALLBACK DE MISE À JOUR DU DROPDOWN DES PAGES
82
+ # -----------------------------------------------------------------------------
83
  @app.callback(
84
  Output("page-dropdown", "options"),
85
  Input("chapter-dropdown", "value")
86
  )
87
  def update_pages(chapter_value):
88
+ """Met à jour les options du dropdown de pages selon le chapitre sélectionné."""
89
  if chapter_value:
90
  chapter_path = Path(chapter_value)
91
+ pages = [d for d in chapter_path.iterdir() if d.is_dir()] if chapter_path.exists() else []
92
+ return [{"label": d.name, "value": str(d)} for d in pages]
 
93
  return []
94
 
95
+
96
+ # -----------------------------------------------------------------------------
97
+ # CALLBACK DE MISE À JOUR AUDIO ET DES SUGGESTIONS (via dcc.Store)
98
+ # -----------------------------------------------------------------------------
99
  @app.callback(
100
+ Output("audio-store", "data"),
101
+ Output("values-store", "data"),
102
  Output("audio-player", "src"),
103
  Output("suggestion-checklist", "options"),
104
+ Output("hidden-message", "style"),
 
105
  Input("page-dropdown", "value"),
106
  State("chapter-dropdown", "value")
107
  )
 
108
  def update_audio_and_suggestions(page_value, chapter_value):
109
+ """
110
+ Met à jour les stores pour les chemins audio et les suggestions.
111
+ Affiche le premier segment audio et les 6 premières suggestions.
112
+ """
113
+ hidden_style = {"display": "none"}
114
  if page_value and chapter_value:
115
+ # Utilise la méthode abstraite pour charger les transcriptions et extraire l'état
116
+ possible_values, audio_paths = processor.load_page_verses_and_audios(s3_client, page_value, data)
117
+ options = [{"label": t, "value": t} for t in possible_values[:6]]
118
+ audio_src = audio_paths[0] if audio_paths else no_update
119
+ return audio_paths, possible_values, audio_src, options, hidden_style
120
+ else:
121
+ if os.path.exists(PERSIST_FILE):
122
  os.remove(PERSIST_FILE)
123
+ return no_update, no_update, no_update, no_update, {"display": "block"}
 
 
124
 
 
125
 
126
+ # -----------------------------------------------------------------------------
127
+ # CALLBACK POUR LE TRAITEMENT DE LA TRANSCRIPTION
128
+ # -----------------------------------------------------------------------------
129
  @app.callback(
130
+ Output("audio-store", "data", allow_duplicate=True),
131
+ Output("values-store", "data", allow_duplicate=True),
132
  Output("audio-player", "src", allow_duplicate=True),
133
  Output("suggestion-checklist", "options", allow_duplicate=True),
134
+ Output("suggestion-checklist", "value", allow_duplicate=True),
135
  Output("confirmation-message", "children"),
136
  Output("transcription-store", "data"),
137
  Input("submit-button", "n_clicks"),
 
139
  State("user-info", "value"),
140
  State("page-dropdown", "value"),
141
  State("audio-player", "src"),
142
+ State("audio-store", "data"),
143
+ State("values-store", "data"),
144
  State("transcription-store", "data"),
145
  prevent_initial_call=True
146
  )
147
+ def update_transcription(n_clicks, selected_transcriptions, user_info, page_value,
148
+ current_audio, audio_store, values_store, stored_transcriptions):
149
+ """
150
+ Traite la soumission d'une transcription :
151
+ - Ajoute l'entrée avec timestamp dans le store de transcription.
152
+ - Retire le segment audio traité et les suggestions utilisées.
153
+ - Met à jour l'audio et les options de la checklist.
154
+ """
155
  if n_clicks > 0 and page_value and current_audio:
 
 
 
 
156
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
157
+ stored_transcriptions = stored_transcriptions if stored_transcriptions is not None else []
158
+ stored_transcriptions.append({
159
+ "segment_path": current_audio,
160
  "transcriptions": selected_transcriptions,
161
  "timestamp": timestamp,
162
  "user_id": user_info
163
+ })
164
+ # Mise à jour du store audio
165
+ if audio_store and isinstance(audio_store, list):
166
+ audio_store.pop(0)
167
+ next_audio = audio_store[0] if audio_store else ""
 
 
168
  else:
169
  next_audio = ""
170
+ # Mise à jour du store de suggestions
171
+ if values_store and isinstance(values_store, list):
172
+ for val in selected_transcriptions:
173
+ if val in values_store:
174
+ values_store.remove(val)
175
+ next_options = [{"label": t, "value": t} for t in (values_store[:6] + ["autre transcription"])] if values_store else []
176
+ confirmation_message = (f"Transcriptions sélectionnées : {', '.join(selected_transcriptions)}"
177
+ if selected_transcriptions else "Aucune transcription sélectionnée.")
178
+ # Réinitialisation de la checklist
179
+ if len(next_options)>1:
180
+ return audio_store, values_store, next_audio, next_options, [], confirmation_message, stored_transcriptions
181
+ return no_update, no_update, no_update, no_update, no_update, no_update, no_update
182
+
183
+
184
+ # -----------------------------------------------------------------------------
185
+ # CALLBACK POUR LA SAUVEGARDE DES RÉSULTATS
186
+ # -----------------------------------------------------------------------------
 
 
 
 
 
 
 
187
  @app.callback(
188
  Output("confirmation-message", "children", allow_duplicate=True),
189
  Input("save-results-button", "n_clicks"),
 
192
  prevent_initial_call=True
193
  )
194
  def save_results(n_clicks, page_value, stored_transcriptions):
195
+ """
196
+ Sauvegarde les transcriptions en combinant les données persistantes existantes
197
+ avec les nouvelles et en les uploadant sur S3.
198
+ """
199
  if n_clicks > 0 and page_value:
 
200
  try:
201
+ initial_transcriptions = processor.load_persistent_data(PERSIST_FILE)
202
  except Exception as e:
203
+ logger.error(f"Erreur lors du chargement des données persistantes : {e}")
204
  initial_transcriptions = []
205
+ combined_transcriptions = initial_transcriptions + (stored_transcriptions if stored_transcriptions else [])
206
+ processor.save_persistent_data(combined_transcriptions, PERSIST_FILE)
207
+ cleaned_page = page_value.replace("\\", "/").replace("assets/", "")
208
+ s3_key = f"labelling/{cleaned_page}/{PERSIST_FILE}"
209
+ s3_client.upload_file(PERSIST_FILE, s3_key)
210
+
211
+ return "Les résultats ont été sauvegardés avec succès."
212
+ return no_update
 
 
 
app/pages/Annotations/layout.py CHANGED
@@ -1,237 +1,224 @@
1
- from pathlib import Path
2
  import dash_bootstrap_components as dbc
3
  from dash import dcc, html
4
- from global_vars import audio_paths, possible_values
5
-
6
  from dotenv import load_dotenv
7
 
8
  load_dotenv()
9
  BUCKET_NAME = "moore-collection"
10
 
11
- # Configuration
12
- AUDIO_FOLDER = Path("./assets/assets")
13
- PERSIST_FILE = "results.json"
14
-
15
-
16
- # Helper functions
17
  def get_chapter_paths(base_folder):
18
- """Retrieve all chapter folders in the base folder."""
19
  base = Path(base_folder)
20
  return [d for d in base.iterdir() if d.is_dir()]
21
 
22
-
23
  chapters = get_chapter_paths("assets")
24
  chapter_options = [{"label": d.name, "value": str(d)} for d in chapters]
25
 
 
26
 
27
- # Layout
28
- def create_layout(audio_paths, possible_values):
29
- """Create and return the main layout of the application."""
30
- return dbc.Container(
31
- [
32
- dbc.Row(
33
- [
34
- dbc.Col(
35
- html.H1(
36
- "Outil de transcription audio",
37
- className="text-center my-4 text-primary",
38
- ),
39
- width=12,
40
- )
41
- ]
42
  ),
43
- # Step 1: Enter user info
44
- dbc.Row(
45
- [
46
- dbc.Col(
47
- [
48
- dbc.Input(
49
- id="user-info",
50
- placeholder="Entrez votre email, pseudonyme ou nom pour qu'on vous crédite",
51
- type="text",
52
- className="mb-3",
53
- ),
54
- dbc.Button(
55
- "Continuer",
56
- id="pseudo-continue-button",
57
- color="primary",
58
- className="w-100",
59
- ),
60
- ],
61
- width=12,
62
- )
63
- ]
64
- ),
65
- # Step 2: Chapter selection (initially hidden)
66
- dbc.Row(
67
- id="chapter-section",
68
- style={"display": "none"},
69
- children=[
70
- dbc.Col(
71
- [
72
- dbc.Card(
73
- [
74
- dbc.CardHeader("Sélectionnez un chapitre"),
75
- dbc.CardBody(
76
- [
77
- dcc.Dropdown(
78
- id="chapter-dropdown",
79
- options=chapter_options,
80
- placeholder="Choisissez un chapitre",
81
- ),
82
- dbc.Button(
83
- "Continuer",
84
- id="chapter-continue-button",
85
- color="primary",
86
- className="w-100 mt-2",
87
- ),
88
- ]
89
- ),
90
- ]
91
- )
92
- ],
93
- width=12,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  )
95
- ],
96
  ),
97
- # Step 3: Page selection (initially hidden)
98
- dbc.Row(
99
- id="page-section",
100
- style={"display": "none"},
101
- children=[
102
- dbc.Col(
103
- [
104
- dbc.Card(
105
- [
106
- dbc.CardHeader("Sélectionnez une page"),
107
- dbc.CardBody(
108
- [
109
- dcc.Dropdown(
110
- id="page-dropdown",
111
- placeholder="Choisissez une page",
112
- ),
113
- dbc.Button(
114
- "Démarrer la transcription",
115
- id="start-button",
116
- color="primary",
117
- className="w-100 mt-2",
118
- ),
119
- ]
120
- ),
121
- ]
122
- )
123
- ],
124
- width=12,
125
- )
126
- ],
127
  ),
128
- # Step 4: Transcription section (initially hidden)
129
- dbc.Row(
130
- id="transcription-section",
131
- style={"display": "none"},
132
- children=[
133
- dbc.Col(
134
- [
135
- dbc.Card(
136
- [
137
- dbc.CardHeader("Lecture audio"),
138
- dbc.CardBody(
139
- [
140
- dcc.Loading(
141
- html.Audio(
142
- id="audio-player",
143
- controls=True,
144
- autoPlay=False,
145
- className="w-100",
146
- )
147
- )
148
- ]
149
- ),
150
- ],
151
- className="mb-4 shadow",
152
- )
153
- ],
154
- width=12,
155
- ),
156
- dbc.Col(
157
- [
158
- dbc.Card(
159
- [
160
- dbc.CardHeader("Suggestions de transcriptions"),
161
- dbc.CardBody(
162
- [
163
- dcc.Checklist(
164
- id="suggestion-checklist",
165
- options=[
166
- {"label": t, "value": t}
167
- for t in possible_values[:6]
168
- ],
169
- value=[],
170
- style={
171
- "columns": "3",
172
- "column-gap": "1rem",
173
- },
174
- )
175
- ]
176
- ),
177
- ],
178
- className="mb-4 shadow",
179
- )
180
- ],
181
- width=12,
182
- ),
183
- dbc.Col([
184
- html.Div(
185
- id="hidden-message",
186
- style={"display": "none"}, # Initially hidden
187
- children=[
188
- html.P("Traitement de la page actuelle terminée, vous devez changer de page pour continuer. N'oubliez pas de sauvegarder.", style={"color": "red"})
189
- ]
190
  )
191
- ], width=12),
192
- dbc.Col(
193
- [
194
- dbc.Button(
195
- "Soumettre",
196
- id="submit-button",
197
- n_clicks=0,
198
- color="secondary",
199
- className="w-100",
200
- style={"marginTop": "20px"},
201
- ),
202
- dbc.Button(
203
- "Sauvegarder résultats",
204
- id="save-results-button",
205
- n_clicks=0,
206
- color="success",
207
- className="w-100",
208
- style={"marginTop": "20px"},
209
- ),
210
- ],
211
- width=12,
212
- ),
213
- dbc.Col(
214
- [
215
- html.Div(
216
- id="confirmation-message",
217
- className="text-success text-center mt-3",
218
- )
219
- ],
220
- width=12,
221
- ),
222
- dbc.Row(
223
- [
224
- dcc.Store(
225
- id="transcription-store", data=[]
226
- ), # Store for temporary transcriptions
227
- ]
228
- ),
229
- ],
230
  ),
231
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  )
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
- layout = create_layout(audio_paths, possible_values)
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- from .callbacks import *
 
 
 
 
1
  import dash_bootstrap_components as dbc
2
  from dash import dcc, html
3
+ from pathlib import Path
 
4
  from dotenv import load_dotenv
5
 
6
  load_dotenv()
7
  BUCKET_NAME = "moore-collection"
8
 
9
+ # --- Helper pour récupérer les chapitres ---
 
 
 
 
 
10
  def get_chapter_paths(base_folder):
 
11
  base = Path(base_folder)
12
  return [d for d in base.iterdir() if d.is_dir()]
13
 
 
14
# Chapter folders found under "assets" when this module is imported, and the
# label/value pairs built from them for the chapter dropdown.
chapters = get_chapter_paths("assets")
chapter_options = [
    dict(label=folder.name, value=str(folder)) for folder in chapters
]
16
 
17
+ # --- Fonctions de création des différentes cards ---
18
 
19
def header_card():
    """Build the header row that displays the application title.

    Returns:
        A ``dbc.Row`` wrapping a full-width (12) column with the ``H1`` title.
    """
    title = html.H1(
        "Outil de transcription audio",
        className="text-center my-4 text-primary",
    )
    return dbc.Row(dbc.Col(title, width=12))
30
+
31
def user_info_card():
    """Build the row where the user enters the name to be credited.

    Returns:
        A ``dbc.Row`` with one full-width column holding the ``user-info``
        text input followed by the ``pseudo-continue-button`` button.
    """
    name_input = dbc.Input(
        id="user-info",
        placeholder="Entrez votre email, pseudonyme ou nom pour qu'on vous crédite",
        type="text",
        className="mb-3",
    )
    continue_button = dbc.Button(
        "Continuer",
        id="pseudo-continue-button",
        color="primary",
        className="w-100",
    )
    return dbc.Row(dbc.Col([name_input, continue_button], width=12))
52
+
53
def chapter_card():
    """Build the chapter-selection row.

    The row (id ``chapter-section``) is created with ``display: none``. It
    contains a card with the ``chapter-dropdown`` dropdown, populated from the
    module-level ``chapter_options``, and the ``chapter-continue-button``.

    Returns:
        The ``dbc.Row`` described above.
    """
    dropdown = dcc.Dropdown(
        id="chapter-dropdown",
        options=chapter_options,
        placeholder="Choisissez un chapitre",
    )
    continue_button = dbc.Button(
        "Continuer",
        id="chapter-continue-button",
        color="primary",
        className="w-100 mt-2",
    )
    card = dbc.Card(
        [
            dbc.CardHeader("Sélectionnez un chapitre"),
            dbc.CardBody([dropdown, continue_button]),
        ]
    )
    return dbc.Row(
        id="chapter-section",
        style={"display": "none"},
        children=[dbc.Col(card, width=12)],
    )
84
+
85
def page_card():
    """Build the page-selection row.

    The row (id ``page-section``) is created with ``display: none``. It
    contains a card with the ``page-dropdown`` dropdown, created without
    options, and the ``start-button`` button.

    Returns:
        The ``dbc.Row`` described above.
    """
    dropdown = dcc.Dropdown(
        id="page-dropdown",
        placeholder="Choisissez une page",
    )
    start_button = dbc.Button(
        "Démarrer la transcription",
        id="start-button",
        color="primary",
        className="w-100 mt-2",
    )
    card = dbc.Card(
        [
            dbc.CardHeader("Sélectionnez une page"),
            dbc.CardBody([dropdown, start_button]),
        ]
    )
    return dbc.Row(
        id="page-section",
        style={"display": "none"},
        children=[dbc.Col(card, width=12)],
    )
115
+
116
def transcription_card():
    """Build the transcription row: audio player, suggestions and actions.

    The row (id ``transcription-section``) is created with ``display: none``
    and contains, in order: the audio player card, the suggestion checklist
    card, the hidden end-of-page message, the submit/save buttons, the
    confirmation message and three ``dcc.Store`` components
    (``transcription-store``, ``audio-store``, ``values-store``), each
    initialised with an empty list.

    Returns:
        The ``dbc.Row`` described above.
    """

    def make_action_button(label, component_id, color):
        # Both action buttons share everything except label, id and colour.
        return dbc.Button(
            label,
            id=component_id,
            n_clicks=0,
            color=color,
            className="w-100",
            style={"marginTop": "20px"},
        )

    player = html.Audio(
        id="audio-player",
        controls=True,
        autoPlay=False,
        className="w-100",
    )
    audio_card = dbc.Card(
        [
            dbc.CardHeader("Lecture audio"),
            dbc.CardBody(dcc.Loading(player)),
        ],
        className="mb-4 shadow",
    )

    checklist = dcc.Checklist(
        id="suggestion-checklist",
        options=[],  # starts empty; meant to be filled in by a callback
        value=[],
        style={"columns": "3", "column-gap": "1rem"},
    )
    suggestion_card = dbc.Card(
        [
            dbc.CardHeader("Suggestions de transcriptions"),
            dbc.CardBody(checklist),
        ],
        className="mb-4 shadow",
    )

    end_of_page_notice = html.P(
        "Traitement de la page actuelle terminé, vous devez changer de page pour continuer. N'oubliez pas de sauvegarder.",
        style={"color": "red"},
    )
    hidden_message = html.Div(
        id="hidden-message",
        style={"display": "none"},
        children=[end_of_page_notice],
    )

    action_buttons = dbc.Col(
        [
            make_action_button("Soumettre", "submit-button", "secondary"),
            make_action_button(
                "Sauvegarder résultats", "save-results-button", "success"
            ),
        ],
        width=12,
    )

    confirmation_message = dbc.Col(
        html.Div(
            id="confirmation-message",
            className="text-success text-center mt-3",
        ),
        width=12,
    )

    # The three cards/messages get their own full-width column here; the
    # buttons and the confirmation message are already wrapped in columns.
    row_children = [
        dbc.Col(audio_card, width=12),
        dbc.Col(suggestion_card, width=12),
        dbc.Col(hidden_message, width=12),
        action_buttons,
        confirmation_message,
        # Stores holding the application state.
        dcc.Store(id="transcription-store", data=[]),
        dcc.Store(id="audio-store", data=[]),
        dcc.Store(id="values-store", data=[]),
    ]
    return dbc.Row(
        id="transcription-section",
        style={"display": "none"},
        children=row_children,
    )
207
 
208
def create_layout():
    """Assemble the main page layout from the individual section builders.

    Returns:
        A fluid ``dbc.Container`` (class ``p-4``) holding, in order, the
        header, user-info, chapter, page and transcription sections.
    """
    section_builders = (
        header_card,
        user_info_card,
        chapter_card,
        page_card,
        transcription_card,
    )
    sections = [build() for build in section_builders]
    return dbc.Container(sections, fluid=True, className="p-4")
221
 
222
+ # Initialisation du layout
223
+ layout = create_layout()
224
+ from .callbacks import *
app/pages/Annotations/state.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+ import pandas as pd
4
+
5
@dataclass
class AppState:
    """State container for the annotation page.

    Every field defaults to ``None``, meaning "not set yet".
    """

    audio_paths: Optional[List[str]] = None
    possible_values: Optional[List[str]] = None
    current_chapter: Optional[str] = None
    current_page: Optional[str] = None
    verses_data: Optional[pd.DataFrame] = None

    @classmethod
    def initialize(cls, dataset_name: str) -> "AppState":
        """Create a state whose ``verses_data`` is loaded from a dataset.

        Loads the ``train`` split of ``dataset_name``, converts it to a
        DataFrame and adds ``chapter`` and ``page`` columns computed from
        each row's ``moore_source_url`` by ``extract_audio_identifier``.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.

        Returns:
            A new ``AppState`` with only ``verses_data`` set.
        """
        # Imported inside the function, as in the original code, so importing
        # this module does not require the dataset library.
        from datasets import load_dataset

        # This name was previously used without being imported anywhere in
        # this module, which raised NameError on the first call. The import
        # path mirrors the one used by app/global_vars.py.
        from utils import extract_audio_identifier

        data = load_dataset(dataset_name, split="train").to_pandas()
        data[["chapter", "page"]] = data["moore_source_url"].apply(
            lambda x: pd.Series(extract_audio_identifier(x))
        )
        return cls(verses_data=data)
app/pages/base_page.py CHANGED
@@ -17,7 +17,7 @@ class PageMeta:
17
  parts = self.module_name.split(".")
18
  if len(parts) < 2:
19
  raise ValueError(f"Invalid module name: {self.module_name}")
20
- self._id = parts[-1] # Use the last part as the ID
21
  return self._id
22
 
23
  @property
@@ -29,7 +29,7 @@ class PageMeta:
29
  @property
30
  def path(self) -> str:
31
  if not self._path:
32
- self._path = route_prefix + "/" + self.id_
33
  return self._path
34
 
35
  @classmethod
 
17
  parts = self.module_name.split(".")
18
  if len(parts) < 2:
19
  raise ValueError(f"Invalid module name: {self.module_name}")
20
+ self._id = parts[1]
21
  return self._id
22
 
23
  @property
 
29
  @property
30
  def path(self) -> str:
31
  if not self._path:
32
+ self._path = route_prefix + self.id_
33
  return self._path
34
 
35
  @classmethod