Spaces:

sawadogosalif
/

MooreFRCollections-Annotations

Runtime error

App Files Files Community

Salif SAWADOGO committed on Feb 14

Commit

7204409

1 Parent(s): f27ce21

⚡️ improve code quality and use dcc instead global variable

Browse files

Files changed (12) hide show

.gitignore +1 -1
app/global_vars.py +0 -1
app/helpers/__init__.py +0 -0
app/helpers/abstracts.py +40 -0
app/helpers/models.py +12 -0
app/helpers/processor.py +99 -0
app/helpers/s3.py +72 -0
app/helpers/utils.py +5 -0
app/pages/Annotations/callbacks.py +117 -88
app/pages/Annotations/layout.py +199 -212
app/pages/Annotations/state.py +20 -0
app/pages/base_page.py +2 -2

.gitignore CHANGED Viewed

@@ -1,7 +1,7 @@
 .venv
 **__pycache__**
 **.pyc**
-**/env/**
 **/.venv/**
 **/*.egg-info
 **/*parquet

 .venv
 **__pycache__**
 **.pyc**
+**/*env/**
 **/.venv/**
 **/*.egg-info
 **/*parquet

app/global_vars.py CHANGED Viewed

@@ -4,7 +4,6 @@ from clients import s3_loader
 from utils import extract_audio_identifier
 DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"
-audio_paths, possible_values = [], []
 data = load_dataset(DATA_FILE, split="train").to_pandas()
 data[["chapter", "page"]] = data["moore_source_url"].apply(
     lambda x: pd.Series(extract_audio_identifier(x))

 from utils import extract_audio_identifier
 DATA_FILE = "sawadogosalif/MooreFRCollections_BibleOnlyText"
 data = load_dataset(DATA_FILE, split="train").to_pandas()
 data[["chapter", "page"]] = data["moore_source_url"].apply(
     lambda x: pd.Series(extract_audio_identifier(x))

app/helpers/__init__.py ADDED Viewed

File without changes

app/helpers/abstracts.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from abc import ABC, abstractmethod
+from typing import List
+import pandas as pd
+class AbstractS3Client(ABC):
+    @abstractmethod
+    def upload_file(self, local_path: str, s3_key: str) -> None:
+        """Upload un fichier vers S3."""
+        pass
+    @abstractmethod
+    def download_file(self, local_path: str, s3_key: str) -> None:
+        """Télécharge un fichier depuis S3."""
+        pass
+    @abstractmethod
+    def list_files(self, prefix: str = "") -> List[str]:
+        """Liste les fichiers dans S3 sous un préfixe donné."""
+        pass
+    @abstractmethod
+    def load_json_files(self, files: List[str], unique_columns: List[str] = None) -> pd.DataFrame:
+        """Charge et combine des fichiers JSON en un DataFrame."""
+        pass
+class AbstractProcessor(ABC):
+    @abstractmethod
+    def get_audio_paths(self, folder: str) -> list[str]:
+        """Retourne et trie les chemins audio d'un dossier."""
+        pass
+    @abstractmethod
+    def process_text(self, text: str) -> str:
+        """Nettoie et traite un texte."""
+        pass
+    @abstractmethod
+    def splitter(self, text: str) -> list[str]:
+        """Divise un texte en segments."""
+        pass

app/helpers/models.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# helpers/models.py
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class S3Config:
+    """Configuration for S3 connection."""
+    # Name of the bucket the client operates on.
+    bucket_name: str
+    # URL of the S3 endpoint to connect to.
+    endpoint_url: str
+    # Access key used to authenticate.
+    access_key: str
+    # Secret key paired with the access key.
+    secret_key: str
+    # Optional region name; defaults to None.
+    region_name: Optional[str] = None

app/helpers/processor.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import re
+import json
+import os
+import urllib.parse
+from pathlib import Path
+import pandas as pd
+from loguru import logger
+from .abstracts import AbstractProcessor
+from .s3 import S3Client  # Si besoin d'utiliser des fonctions S3
+class Processor(AbstractProcessor):
+    def get_audio_paths(self, folder: str) -> list[str]:
+        def extract_number(file_path: str) -> int:
+            match = re.search(r"segment_(\d+)", file_path)
+            return int(match.group(1)) if match else float("inf")
+        audio_paths = list(Path(folder).glob("*.mp3"))
+        audio_paths = [audio_path.as_posix() for audio_path in audio_paths]
+        audio_paths = sorted(audio_paths, key=extract_number)
+        return audio_paths[3:]
+    def process_text(self, text: str) -> str:
+        text = re.sub(r"\+\s*\.", ".", text)
+        text = re.sub(r"\*\s*\+\s*;", ";", text)
+        text = re.sub(r"\*\s*\+", "", text)
+        text = text.replace(" + ", " ").replace(" * ", " ").replace("+", " ")
+        text = re.sub(r'["“”]', "", text)
+        return text.strip()
+    def splitter(self, text: str) -> list[str]:
+        return re.split(r"[,:;.]", self.process_text(text))
+    # Ajoutez d'autres méthodes de traitement si nécessaire
+    def flatten_nested_values(self, nested_values: pd.Series) -> list[str]:
+        flattened = []
+        for group in nested_values:
+            for item in group:
+                cleaned_item = re.sub(r"^\d+\s*", "", item).strip()
+                if cleaned_item:
+                    flattened.append(cleaned_item)
+        return flattened
+    def load_persistent_data(self, file: str) -> list:
+        if os.path.exists(file):
+            with open(file, "r", encoding="utf-8") as f:
+                return json.load(f)
+        return []
+    def save_persistent_data(self, data: list, file: str) -> None:
+        with open(file, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+    def extract_audio_identifier(self, url: str):
+        parts = url.strip("/").split("/")
+        return urllib.parse.unquote(parts[-2]), int(parts[-1])
+    def find_and_return_after_last(self, long_list: list, short_list: list) -> list:
+        last_index = -1
+        for i, item in enumerate(long_list):
+            if item in short_list:
+                last_index = i
+        return long_list[last_index+1:] if last_index != -1 else long_list
+    def load_page_verses_and_audios(self, s3_client, page: str, df_verses: pd.DataFrame) -> tuple[list[str], list[str]]:
+        audio_paths = self.get_audio_paths(page)
+        page_fixed = page.replace("/", "\\")
+        _, chapter, page_str = page_fixed.split("\\")
+        s3_key = f"labelling/{chapter}/{page_str}/results.json"
+        page_int = int(page_str.replace("page_", ""))
+        tmp = df_verses[(df_verses.chapter == chapter) & (df_verses.page == page_int)]
+        possible_values = tmp["moore_verse_text"].apply(self.splitter)
+        possible_values = self.flatten_nested_values(possible_values)
+        try:
+            # Vous pouvez injecter ici un client S3 si besoin, ou utiliser directement une fonction d'un module dédié
+            s3_client.download_file("result.json",s3_key)
+            transcriptions = self.load_persistent_data("result.json")
+            latest_transcription = transcriptions[-1].get("transcriptions")
+            latest_audio = [transcriptions[-1].get("segment_path")]
+            audio_paths = self.find_and_return_after_last(audio_paths, latest_audio)
+            possible_values = self.find_and_return_after_last(possible_values, latest_transcription)
+            logger.info(f"Latest transcription: {latest_audio} / {latest_transcription}")
+            return possible_values, audio_paths
+        except Exception as e:
+            logger.error(f"An error occurred: {e}")
+            return possible_values, audio_paths
+    def get_contribution_data(self, s3_client) -> pd.DataFrame:
+        files = s3_client.list_files("labelling")
+        files = [file for file in files if file.endswith("json")]
+        try:
+            df = s3_client.load_json_files(files=files, unique_columns=["segment_path", "user_id"])
+            df[["tmp1", "chapter", "page", "segment"]] = df.segment_path.str.split("/", expand=True)
+            return (
+                df.sort_values(["chapter", "page"]).drop(columns=["tmp1", "segment_path"])
+                if not df.empty
+                else pd.DataFrame()
+            )
+        except Exception as e:
+            logger.error(f"Error in get_contribution_data: {e}")
+            return pd.DataFrame()

app/helpers/s3.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import json
+import boto3
+import s3fs
+from typing import List, Optional
+from loguru import logger
+from dotenv import load_dotenv
+from .models import S3Config
+from .abstracts import AbstractS3Client
+import pandas as pd
+load_dotenv()
+class S3Client(AbstractS3Client):
+    def __init__(self, config: S3Config):
+        self.config = config
+        self._fs = s3fs.S3FileSystem(
+            key=self.config.access_key,
+            secret=self.config.secret_key,
+            client_kwargs=(
+                {
+                    "endpoint_url": self.config.endpoint_url,
+                    "region_name": self.config.region_name,
+                }
+                if self.config.region_name
+                else {"endpoint_url": self.config.endpoint_url}
+            ),
+        )
+        self._client = boto3.client(
+            "s3",
+            endpoint_url=self.config.endpoint_url,
+            aws_access_key_id=self.config.access_key,
+            aws_secret_access_key=self.config.secret_key,
+        )
+    def upload_file(self, local_path: str, s3_key: str) -> None:
+        if os.path.exists(local_path):
+            self._client.upload_file(local_path, self.config.bucket_name, s3_key)
+            logger.info(f"Uploaded {local_path} to s3://{self.config.bucket_name}/{s3_key}")
+        else:
+            logger.error(f"File {local_path} does not exist.")
+    def download_file(self, local_path: str, s3_key: str) -> None:
+        self._client.download_file(self.config.bucket_name, s3_key, local_path)
+        logger.info(f"Downloaded {s3_key} to {local_path}")
+    def list_files(self, prefix: str = "") -> List[str]:
+        paginator = self._client.get_paginator("list_objects_v2")
+        files = []
+        try:
+            for page in paginator.paginate(Bucket=self.config.bucket_name, Prefix=prefix):
+                for obj in page.get("Contents", []):
+                    files.append(obj["Key"])
+        except Exception as e:
+            raise Exception(f"Error listing files in bucket {self.config.bucket_name}: {str(e)}")
+        return files
+    def load_json_files(self, files: List[str], unique_columns: Optional[List[str]] = None) -> pd.DataFrame:
+        data = []
+        for file in files:
+            try:
+                with self._fs.open(f"s3://{self.config.bucket_name}/{file}", encoding="utf-8") as f:
+                    json_data = json.load(f)
+                    data.append(pd.json_normalize(json_data))
+            except Exception as e:
+                raise Exception(f"Error processing file {file}: {str(e)}")
+        if not data:
+            return pd.DataFrame()
+        result = pd.concat(data, ignore_index=True)
+        if unique_columns:
+            result = result.drop_duplicates(subset=unique_columns)
+        return result

app/helpers/utils.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import urllib
+def extract_audio_identifier(url: str) -> tuple:
+    parts = url.strip("/").split("/")
+    return urllib.parse.unquote(parts[-2]), int(parts[-1])

app/pages/Annotations/callbacks.py CHANGED Viewed

@@ -1,24 +1,43 @@
-from app import app
 from dash import Input, Output, State, no_update
 import datetime
 import os
 from pathlib import Path
-from utils import load_page_verses_and_audios, load_persistent_data, save_persistent_data
-from clients import  upload_file_to_s3
-from global_vars import audio_paths , data, possible_values, BUCKET_NAME
 PERSIST_FILE = "results.json"
-def get_page_paths(chapter_path):
-    """Retrieve all page folders in the chapter folder."""
-    pages_dir = chapter_path
-    if pages_dir.exists():
-        return [d for d in pages_dir.iterdir() if d.is_dir()]
-    return []
-# Step 1: Show chapter section after entering user info
 @app.callback(
     Output("chapter-section", "style"),
     Output("pseudo-continue-button", "style"),
@@ -26,11 +45,12 @@ def get_page_paths(chapter_path):
     State("user-info", "value")
 )
 def show_chapter_section(n_clicks, user_info):
     if n_clicks and user_info:
         return {"display": "block"}, {"display": "none"}
     return {"display": "none"}, {"display": "block"}
-# Step 2: Show page section after selecting a chapter
 @app.callback(
     Output("page-section", "style"),
     Output("chapter-continue-button", "style"),
@@ -38,11 +58,12 @@ def show_chapter_section(n_clicks, user_info):
     State("chapter-dropdown", "value")
 )
 def show_page_section(n_clicks, chapter_value):
     if n_clicks and chapter_value:
         return {"display": "block"}, {"display": "none"}
     return {"display": "none"}, {"display": "block"}
-# Step 3: Show transcription section after selecting a page
 @app.callback(
     Output("transcription-section", "style"),
     Output("start-button", "style"),
@@ -50,56 +71,67 @@ def show_page_section(n_clicks, chapter_value):
     State("page-dropdown", "value")
 )
 def show_transcription_section(n_clicks, page_value):
     if n_clicks and page_value:
         return {"display": "block"}, {"display": "none"}
     return {"display": "none"}, {"display": "block"}
-# Update page dropdown options based on selected chapter
 @app.callback(
     Output("page-dropdown", "options"),
     Input("chapter-dropdown", "value")
 )
 def update_pages(chapter_value):
     if chapter_value:
         chapter_path = Path(chapter_value)
-        pages = get_page_paths(chapter_path)
-        options = [{"label": d.name, "value": str(d)} for d in pages]
-        return options
     return []
-# Update audio player and suggestions when a page is selected
 @app.callback(
     Output("audio-player", "src"),
     Output("suggestion-checklist", "options"),
-    Output("hidden-message", "style"),  # New output to control the hidden message
     Input("page-dropdown", "value"),
     State("chapter-dropdown", "value")
 )
 def update_audio_and_suggestions(page_value, chapter_value):
-    global audio_paths, possible_values
-    hidden_message_style = {"display": "none"}  # Hide the hidden message
     if page_value and chapter_value:
-        possible_values, audio_paths = load_page_verses_and_audios(page_value, data)
-        next_options = [{"label": t, "value": t} for t in possible_values[:6]]
-        if len(audio_paths)>0: #control end of page
-            return audio_paths[0], next_options, hidden_message_style
-        hidden_message_style = {"display": "block"}  # Show the hidden message
-        try:
             os.remove(PERSIST_FILE)
-        except:
-            pass
-        return no_update, no_update, hidden_message_style
-    return no_update, no_update, hidden_message_style
 @app.callback(
     Output("audio-player", "src", allow_duplicate=True),
     Output("suggestion-checklist", "options", allow_duplicate=True),
-    Output("suggestion-checklist", "value", allow_duplicate=True),  # Reset the checklist
     Output("confirmation-message", "children"),
     Output("transcription-store", "data"),
     Input("submit-button", "n_clicks"),
@@ -107,54 +139,51 @@ def update_audio_and_suggestions(page_value, chapter_value):
     State("user-info", "value"),
     State("page-dropdown", "value"),
     State("audio-player", "src"),
     State("transcription-store", "data"),
     prevent_initial_call=True
 )
-def update_transcription(n_clicks, selected_transcriptions, user_info, page_value, current_audio, stored_transcriptions):
     if n_clicks > 0 and page_value and current_audio:
-        # Use the audio path as the unique segment identifier
-        segment_path = current_audio
-        # Create a new transcription entry with a timestamp
         timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        entry = {
-            "segment_path": segment_path,
             "transcriptions": selected_transcriptions,
             "timestamp": timestamp,
             "user_id": user_info
-        }
-        stored_transcriptions.append(entry)
-        # Remove the current audio from the list and load the next segment
-        if audio_paths:
-            audio_paths.pop(0)  # Remove the current audio
-            next_audio = audio_paths[0] if audio_paths else ""
         else:
             next_audio = ""
-        # Remove the used transcription suggestions so they don't reappear
-        for value in selected_transcriptions:
-            if value in possible_values:
-                possible_values.remove(value)
-        # Prepare the next set of suggestions
-        next_options = [{"label": t, "value": t} for t in possible_values[:6] + ["autre transcription"]]
-        # Build a confirmation message
-        confirmation_message = (
-            f"Transcriptions sélectionnées : {', '.join(selected_transcriptions)}"
-            if selected_transcriptions else "Aucune transcription sélectionnée."
-        )
-        # IMPORTANT: Return an empty list for the checklist's value so that it is reset
-        return next_audio, next_options, [], confirmation_message, stored_transcriptions
-    return no_update, no_update, no_update, no_update, no_update
 @app.callback(
     Output("confirmation-message", "children", allow_duplicate=True),
     Input("save-results-button", "n_clicks"),
@@ -163,21 +192,21 @@ def update_transcription(n_clicks, selected_transcriptions, user_info, page_valu
     prevent_initial_call=True
 )
 def save_results(n_clicks, page_value, stored_transcriptions):
     if n_clicks > 0 and page_value:
         try:
-            initial_transcriptions = load_persistent_data(PERSIST_FILE)
         except Exception as e:
-            print(f"No existing transcription found or error downloading: {e}")
             initial_transcriptions = []
-        combined_transcriptions = initial_transcriptions + stored_transcriptions
-        save_persistent_data(combined_transcriptions, PERSIST_FILE)
-        page_value=page_value.replace('\\', '/').replace("assets/", "")
-        s3_key = f"labelling/{page_value}/{PERSIST_FILE}"
-        upload_file_to_s3(PERSIST_FILE, BUCKET_NAME, s3_key)
-        # Confirmation message
-        confirmation_message = "Les résultats ont été sauvegardés avec succès."
-        return confirmation_message
-    return no_update

+"""
+callbacks.py
+-------------
+Ce module définit les callbacks de l'application Dash en utilisant les abstractions
+définies dans le sous-package helpers."""
 from dash import Input, Output, State, no_update
 import datetime
 import os
 from pathlib import Path
+from loguru import logger
+from dotenv import load_dotenv
+load_dotenv()
+from helpers.processor import Processor
+from helpers.s3 import S3Client
+from helpers.models import S3Config
+from app import app
+from global_vars import data, BUCKET_NAME
 PERSIST_FILE = "results.json"
+# Instantiate the Processor
+processor = Processor()
+# Instantiate the S3 client from the configuration (credentials come from the environment)
+s3_config = S3Config(
+    bucket_name=BUCKET_NAME,
+    endpoint_url=os.getenv("AWS_ENDPOINT_URL_S3"),
+    access_key=os.getenv("AWS_ACCESS_KEY_ID"),
+    secret_key=os.getenv("AWS_SECRET_ACCESS_KEY")
+)
+s3_client = S3Client(s3_config)
+# -----------------------------------------------------------------------------
+# CALLBACKS D'AFFICHAGE DES SECTIONS
+# -----------------------------------------------------------------------------
 @app.callback(
     Output("chapter-section", "style"),
     Output("pseudo-continue-button", "style"),
     State("user-info", "value")
 )
 def show_chapter_section(n_clicks, user_info):
+    """Affiche la section chapitre après saisie d'une information utilisateur."""
     if n_clicks and user_info:
         return {"display": "block"}, {"display": "none"}
     return {"display": "none"}, {"display": "block"}
 @app.callback(
     Output("page-section", "style"),
     Output("chapter-continue-button", "style"),
     State("chapter-dropdown", "value")
 )
 def show_page_section(n_clicks, chapter_value):
+    """Affiche la section page après sélection d'un chapitre."""
     if n_clicks and chapter_value:
         return {"display": "block"}, {"display": "none"}
     return {"display": "none"}, {"display": "block"}
 @app.callback(
     Output("transcription-section", "style"),
     Output("start-button", "style"),
     State("page-dropdown", "value")
 )
 def show_transcription_section(n_clicks, page_value):
+    """Affiche la section transcription après sélection d'une page."""
     if n_clicks and page_value:
         return {"display": "block"}, {"display": "none"}
     return {"display": "none"}, {"display": "block"}
+# -----------------------------------------------------------------------------
+# CALLBACK DE MISE À JOUR DU DROPDOWN DES PAGES
+# -----------------------------------------------------------------------------
 @app.callback(
     Output("page-dropdown", "options"),
     Input("chapter-dropdown", "value")
 )
 def update_pages(chapter_value):
+    """Met à jour les options du dropdown de pages selon le chapitre sélectionné."""
     if chapter_value:
         chapter_path = Path(chapter_value)
+        pages = [d for d in chapter_path.iterdir() if d.is_dir()] if chapter_path.exists() else []
+        return [{"label": d.name, "value": str(d)} for d in pages]
     return []
+# -----------------------------------------------------------------------------
+# CALLBACK DE MISE À JOUR AUDIO ET DES SUGGESTIONS (via dcc.Store)
+# -----------------------------------------------------------------------------
 @app.callback(
+    Output("audio-store", "data"),
+    Output("values-store", "data"),
     Output("audio-player", "src"),
     Output("suggestion-checklist", "options"),
+    Output("hidden-message", "style"),
     Input("page-dropdown", "value"),
     State("chapter-dropdown", "value")
 )
 def update_audio_and_suggestions(page_value, chapter_value):
+    """
+    Met à jour les stores pour les chemins audio et les suggestions.
+    Affiche le premier segment audio et les 6 premières suggestions.
+    """
+    hidden_style = {"display": "none"}
     if page_value and chapter_value:
+        # Utilise la méthode abstraite pour charger les transcriptions et extraire l'état
+        possible_values, audio_paths = processor.load_page_verses_and_audios(s3_client, page_value, data)
+        options = [{"label": t, "value": t} for t in possible_values[:6]]
+        audio_src = audio_paths[0] if audio_paths else no_update
+        return audio_paths, possible_values, audio_src, options, hidden_style
+    else:
+        if os.path.exists(PERSIST_FILE):
             os.remove(PERSIST_FILE)
+        return no_update, no_update, no_update, no_update, {"display": "block"}
+# -----------------------------------------------------------------------------
+# CALLBACK POUR LE TRAITEMENT DE LA TRANSCRIPTION
+# -----------------------------------------------------------------------------
 @app.callback(
+    Output("audio-store", "data", allow_duplicate=True),
+    Output("values-store", "data", allow_duplicate=True),
     Output("audio-player", "src", allow_duplicate=True),
     Output("suggestion-checklist", "options", allow_duplicate=True),
+    Output("suggestion-checklist", "value", allow_duplicate=True),
     Output("confirmation-message", "children"),
     Output("transcription-store", "data"),
     Input("submit-button", "n_clicks"),
     State("user-info", "value"),
     State("page-dropdown", "value"),
     State("audio-player", "src"),
+    State("audio-store", "data"),
+    State("values-store", "data"),
     State("transcription-store", "data"),
     prevent_initial_call=True
 )
+def update_transcription(n_clicks, selected_transcriptions, user_info, page_value,
+                         current_audio, audio_store, values_store, stored_transcriptions):
+    """
+    Traite la soumission d'une transcription :
+      - Ajoute l'entrée avec timestamp dans le store de transcription.
+      - Retire le segment audio traité et les suggestions utilisées.
+      - Met à jour l'audio et les options de la checklist.
+    """
     if n_clicks > 0 and page_value and current_audio:
         timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        stored_transcriptions = stored_transcriptions if stored_transcriptions is not None else []
+        stored_transcriptions.append({
+            "segment_path": current_audio,
             "transcriptions": selected_transcriptions,
             "timestamp": timestamp,
             "user_id": user_info
+        })
+        # Mise à jour du store audio
+        if audio_store and isinstance(audio_store, list):
+            audio_store.pop(0)
+            next_audio = audio_store[0] if audio_store else ""
         else:
             next_audio = ""
+        # Mise à jour du store de suggestions
+        if values_store and isinstance(values_store, list):
+            for val in selected_transcriptions:
+                if val in values_store:
+                    values_store.remove(val)
+        next_options = [{"label": t, "value": t} for t in (values_store[:6] + ["autre transcription"])] if values_store else []
+        confirmation_message = (f"Transcriptions sélectionnées : {', '.join(selected_transcriptions)}"
+                                if selected_transcriptions else "Aucune transcription sélectionnée.")
+        # Réinitialisation de la checklist
+        if len(next_options)>1:
+            return audio_store, values_store, next_audio, next_options, [], confirmation_message, stored_transcriptions
+    return no_update, no_update, no_update, no_update, no_update, no_update, no_update
+# -----------------------------------------------------------------------------
+# CALLBACK POUR LA SAUVEGARDE DES RÉSULTATS
+# -----------------------------------------------------------------------------
 @app.callback(
     Output("confirmation-message", "children", allow_duplicate=True),
     Input("save-results-button", "n_clicks"),
     prevent_initial_call=True
 )
 def save_results(n_clicks, page_value, stored_transcriptions):
+    """
+    Sauvegarde les transcriptions en combinant les données persistantes existantes
+    avec les nouvelles et en les uploadant sur S3.
+    """
     if n_clicks > 0 and page_value:
         try:
+            initial_transcriptions = processor.load_persistent_data(PERSIST_FILE)
         except Exception as e:
+            logger.error(f"Erreur lors du chargement des données persistantes : {e}")
             initial_transcriptions = []
+        combined_transcriptions = initial_transcriptions + (stored_transcriptions if stored_transcriptions else [])
+        processor.save_persistent_data(combined_transcriptions, PERSIST_FILE)
+        cleaned_page = page_value.replace("\\", "/").replace("assets/", "")
+        s3_key = f"labelling/{cleaned_page}/{PERSIST_FILE}"
+        s3_client.upload_file(PERSIST_FILE, s3_key)
+        return "Les résultats ont été sauvegardés avec succès."
+    return no_update

app/pages/Annotations/layout.py CHANGED Viewed

@@ -1,237 +1,224 @@
-from pathlib import Path
 import dash_bootstrap_components as dbc
 from dash import dcc, html
-from global_vars import audio_paths, possible_values
 from dotenv import load_dotenv
 load_dotenv()
 BUCKET_NAME = "moore-collection"
-# Configuration
-AUDIO_FOLDER = Path("./assets/assets")
-PERSIST_FILE = "results.json"
-# Helper functions
 def get_chapter_paths(base_folder):
-    """Retrieve all chapter folders in the base folder."""
     base = Path(base_folder)
     return [d for d in base.iterdir() if d.is_dir()]
 chapters = get_chapter_paths("assets")
 chapter_options = [{"label": d.name, "value": str(d)} for d in chapters]
-# Layout
-def create_layout(audio_paths, possible_values):
-    """Create and return the main layout of the application."""
-    return dbc.Container(
-        [
-            dbc.Row(
-                [
-                    dbc.Col(
-                        html.H1(
-                            "Outil de transcription audio",
-                            className="text-center my-4 text-primary",
-                        ),
-                        width=12,
-                    )
-                ]
             ),
-            # Step 1: Enter user info
-            dbc.Row(
-                [
-                    dbc.Col(
-                        [
-                            dbc.Input(
-                                id="user-info",
-                                placeholder="Entrez votre email, pseudonyme ou nom pour qu'on vous crédite",
-                                type="text",
-                                className="mb-3",
-                            ),
-                            dbc.Button(
-                                "Continuer",
-                                id="pseudo-continue-button",
-                                color="primary",
-                                className="w-100",
-                            ),
-                        ],
-                        width=12,
-                    )
-                ]
-            ),
-            # Step 2: Chapter selection (initially hidden)
-            dbc.Row(
-                id="chapter-section",
-                style={"display": "none"},
-                children=[
-                    dbc.Col(
-                        [
-                            dbc.Card(
-                                [
-                                    dbc.CardHeader("Sélectionnez un chapitre"),
-                                    dbc.CardBody(
-                                        [
-                                            dcc.Dropdown(
-                                                id="chapter-dropdown",
-                                                options=chapter_options,
-                                                placeholder="Choisissez un chapitre",
-                                            ),
-                                            dbc.Button(
-                                                "Continuer",
-                                                id="chapter-continue-button",
-                                                color="primary",
-                                                className="w-100 mt-2",
-                                            ),
-                                        ]
-                                    ),
-                                ]
-                            )
-                        ],
-                        width=12,
                     )
-                ],
             ),
-            # Step 3: Page selection (initially hidden)
-            dbc.Row(
-                id="page-section",
-                style={"display": "none"},
-                children=[
-                    dbc.Col(
-                        [
-                            dbc.Card(
-                                [
-                                    dbc.CardHeader("Sélectionnez une page"),
-                                    dbc.CardBody(
-                                        [
-                                            dcc.Dropdown(
-                                                id="page-dropdown",
-                                                placeholder="Choisissez une page",
-                                            ),
-                                            dbc.Button(
-                                                "Démarrer la transcription",
-                                                id="start-button",
-                                                color="primary",
-                                                className="w-100 mt-2",
-                                            ),
-                                        ]
-                                    ),
-                                ]
-                            )
-                        ],
-                        width=12,
-                    )
-                ],
             ),
-            # Step 4: Transcription section (initially hidden)
-            dbc.Row(
-                id="transcription-section",
-                style={"display": "none"},
-                children=[
-                    dbc.Col(
-                        [
-                            dbc.Card(
-                                [
-                                    dbc.CardHeader("Lecture audio"),
-                                    dbc.CardBody(
-                                        [
-                                            dcc.Loading(
-                                                html.Audio(
-                                                    id="audio-player",
-                                                    controls=True,
-                                                    autoPlay=False,
-                                                    className="w-100",
-                                                )
-                                            )
-                                        ]
-                                    ),
-                                ],
-                                className="mb-4 shadow",
-                            )
-                        ],
-                        width=12,
-                    ),
-                    dbc.Col(
-                        [
-                            dbc.Card(
-                                [
-                                    dbc.CardHeader("Suggestions de transcriptions"),
-                                    dbc.CardBody(
-                                        [
-                                            dcc.Checklist(
-                                                id="suggestion-checklist",
-                                                options=[
-                                                    {"label": t, "value": t}
-                                                    for t in possible_values[:6]
-                                                ],
-                                                value=[],
-                                                style={
-                                                    "columns": "3",
-                                                    "column-gap": "1rem",
-                                                },
-                                            )
-                                        ]
-                                    ),
-                                ],
-                                className="mb-4 shadow",
-                            )
-                        ],
-                        width=12,
-                    ),
-                     dbc.Col([
-            html.Div(
-                id="hidden-message",
-                style={"display": "none"},  # Initially hidden
-                children=[
-                    html.P("Traitement de la page actuelle terminée, vous devez changer de page pour continuer. N'oubliez pas de sauvegarder.",  style={"color": "red"})
-                ]
             )
-        ], width=12),
-                    dbc.Col(
-                        [
-                            dbc.Button(
-                                "Soumettre",
-                                id="submit-button",
-                                n_clicks=0,
-                                color="secondary",
-                                className="w-100",
-                                style={"marginTop": "20px"},
-                            ),
-                            dbc.Button(
-                                "Sauvegarder résultats",
-                                id="save-results-button",
-                                n_clicks=0,
-                                color="success",
-                                className="w-100",
-                                style={"marginTop": "20px"},
-                            ),
-                        ],
-                        width=12,
-                    ),
-                    dbc.Col(
-                        [
-                            html.Div(
-                                id="confirmation-message",
-                                className="text-success text-center mt-3",
-                            )
-                        ],
-                        width=12,
-                    ),
-                    dbc.Row(
-                        [
-                            dcc.Store(
-                                id="transcription-store", data=[]
-                            ),  # Store for temporary transcriptions
-                        ]
-                    ),
-                ],
             ),
-        ]
     )
-layout = create_layout(audio_paths, possible_values)
-from .callbacks import *

 import dash_bootstrap_components as dbc
 from dash import dcc, html
+from pathlib import Path
 from dotenv import load_dotenv
 load_dotenv()
 BUCKET_NAME = "moore-collection"
+# --- Helper pour récupérer les chapitres ---
 def get_chapter_paths(base_folder):
     base = Path(base_folder)
     return [d for d in base.iterdir() if d.is_dir()]
 chapters = get_chapter_paths("assets")  # chapter folders under the local "assets" directory, listed at import time
 chapter_options = [{"label": d.name, "value": str(d)} for d in chapters]  # dropdown options: folder name as label, folder path string as value
+# --- Builder functions for the individual cards ---
+def header_card():
+    """Carte d'en-tête avec le titre de l'application."""
+    return dbc.Row(
+        dbc.Col(
+            html.H1(
+                "Outil de transcription audio",
+                className="text-center my-4 text-primary",
             ),
+            width=12,
+        )
+    )
+def user_info_card():
+    """Carte pour la saisie des informations utilisateur."""
+    return dbc.Row(
+        dbc.Col(
+            [
+                dbc.Input(
+                    id="user-info",
+                    placeholder="Entrez votre email, pseudonyme ou nom pour qu'on vous crédite",
+                    type="text",
+                    className="mb-3",
+                ),
+                dbc.Button(
+                    "Continuer",
+                    id="pseudo-continue-button",
+                    color="primary",
+                    className="w-100",
+                ),
+            ],
+            width=12,
+        )
+    )
+def chapter_card():
+    """Carte pour la sélection d'un chapitre."""
+    return dbc.Row(
+        id="chapter-section",
+        style={"display": "none"},
+        children=[
+            dbc.Col(
+                dbc.Card(
+                    [
+                        dbc.CardHeader("Sélectionnez un chapitre"),
+                        dbc.CardBody(
+                            [
+                                dcc.Dropdown(
+                                    id="chapter-dropdown",
+                                    options=chapter_options,
+                                    placeholder="Choisissez un chapitre",
+                                ),
+                                dbc.Button(
+                                    "Continuer",
+                                    id="chapter-continue-button",
+                                    color="primary",
+                                    className="w-100 mt-2",
+                                ),
+                            ]
+                        ),
+                    ]
+                ),
+                width=12,
+            )
+        ],
+    )
+def page_card():
+    """Carte pour la sélection d'une page."""
+    return dbc.Row(
+        id="page-section",
+        style={"display": "none"},
+        children=[
+            dbc.Col(
+                dbc.Card(
+                    [
+                        dbc.CardHeader("Sélectionnez une page"),
+                        dbc.CardBody(
+                            [
+                                dcc.Dropdown(
+                                    id="page-dropdown",
+                                    placeholder="Choisissez une page",
+                                ),
+                                dbc.Button(
+                                    "Démarrer la transcription",
+                                    id="start-button",
+                                    color="primary",
+                                    className="w-100 mt-2",
+                                ),
+                            ]
+                        ),
+                    ]
+                ),
+                width=12,
+            )
+        ],
+    )
+def transcription_card():
+    """Carte regroupant la lecture audio, les suggestions et les actions de transcription."""
+    audio_card = dbc.Card(
+        [
+            dbc.CardHeader("Lecture audio"),
+            dbc.CardBody(
+                dcc.Loading(
+                    html.Audio(
+                        id="audio-player",
+                        controls=True,
+                        autoPlay=False,
+                        className="w-100",
                     )
+                )
             ),
+        ],
+        className="mb-4 shadow",
+    )
+    suggestion_card = dbc.Card(
+        [
+            dbc.CardHeader("Suggestions de transcriptions"),
+            dbc.CardBody(
+                dcc.Checklist(
+                    id="suggestion-checklist",
+                    options=[],  # Initialement vide, sera mis à jour via callback
+                    value=[],
+                    style={"columns": "3", "column-gap": "1rem"},
+                )
             ),
+        ],
+        className="mb-4 shadow",
+    )
+    hidden_message = html.Div(
+        id="hidden-message",
+        style={"display": "none"},
+        children=[
+            html.P(
+                "Traitement de la page actuelle terminé, vous devez changer de page pour continuer. N'oubliez pas de sauvegarder.",
+                style={"color": "red"},
             )
+        ],
+    )
+    action_buttons = dbc.Col(
+        [
+            dbc.Button(
+                "Soumettre",
+                id="submit-button",
+                n_clicks=0,
+                color="secondary",
+                className="w-100",
+                style={"marginTop": "20px"},
             ),
+            dbc.Button(
+                "Sauvegarder résultats",
+                id="save-results-button",
+                n_clicks=0,
+                color="success",
+                className="w-100",
+                style={"marginTop": "20px"},
+            ),
+        ],
+        width=12,
+    )
+    confirmation_message = dbc.Col(
+        html.Div(
+            id="confirmation-message",
+            className="text-success text-center mt-3",
+        ),
+        width=12,
     )
+    # La carte de transcription regroupe plusieurs composants et dcc.Store pour l'état
+    return dbc.Row(
+        id="transcription-section",
+        style={"display": "none"},
+        children=[
+            dbc.Col(audio_card, width=12),
+            dbc.Col(suggestion_card, width=12),
+            dbc.Col(hidden_message, width=12),
+            action_buttons,
+            confirmation_message,
+            # Stores pour l'état de l'application
+            dcc.Store(id="transcription-store", data=[]),
+            dcc.Store(id="audio-store", data=[]),
+            dcc.Store(id="values-store", data=[]),
+        ],
+    )
+def create_layout():
+    """Compose le layout principal à partir des différentes cards."""
+    return dbc.Container(
+        [
+            header_card(),
+            user_info_card(),
+            chapter_card(),
+            page_card(),
+            transcription_card(),
+        ],
+        fluid=True,
+        className="p-4",
+    )
+# Build the page layout once, at import time
+layout = create_layout()
+from .callbacks import *  # NOTE(review): imported after `layout` is defined, presumably so the callbacks module is loaded along with this page — confirm

app/pages/Annotations/state.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from dataclasses import dataclass
+from typing import List, Optional
+import pandas as pd
+@dataclass
+class AppState:
+    audio_paths: List[str] = None
+    possible_values: List[str] = None
+    current_chapter: Optional[str] = None
+    current_page: Optional[str] = None
+    verses_data: Optional[pd.DataFrame] = None
+    @classmethod
+    def initialize(cls, dataset_name: str):
+        from datasets import load_dataset
+        data = load_dataset(dataset_name, split="train").to_pandas()
+        data[["chapter", "page"]] = data["moore_source_url"].apply(
+            lambda x: pd.Series(extract_audio_identifier(x))
+        )
+        return cls(verses_data=data)

app/pages/base_page.py CHANGED Viewed

@@ -17,7 +17,7 @@ class PageMeta:
             parts = self.module_name.split(".")
             if len(parts) < 2:
                 raise ValueError(f"Invalid module name: {self.module_name}")
-            self._id = parts[-1]  # Use the last part as the ID
         return self._id
     @property
@@ -29,7 +29,7 @@ class PageMeta:
     @property
     def path(self) -> str:
         if not self._path:
-            self._path = route_prefix + "/" + self.id_
         return self._path
     @classmethod

             parts = self.module_name.split(".")
             if len(parts) < 2:
                 raise ValueError(f"Invalid module name: {self.module_name}")
+            self._id = parts[1]
         return self._id
     @property
     @property
     def path(self) -> str:
         if not self._path:
+            self._path = route_prefix  + self.id_
         return self._path
     @classmethod