chenhaodev committed on
Commit ca430b9 · 1 Parent(s): 377e4e1
Files changed (5)
  1. Dockerfile +47 -0
  2. Example.jpg +0 -0
  3. README.md +111 -6
  4. app.py +333 -0
  5. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,47 @@
+ # Use Ubuntu as the base image
+ FROM ubuntu:22.04
+
+ # Prevent interactive prompts during package installation
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     python3 \
+     python3-pip \
+     curl \
+     wget \
+     git \
+     net-tools \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Ollama
+ RUN curl -fsSL https://ollama.com/install.sh | sh
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create startup script
+ RUN echo '#!/bin/bash\n\
+ # Start the Ollama server in the background\n\
+ ollama serve &\n\
+ sleep 5\n\
+ \n\
+ # Pull the model if it is not already present\n\
+ ollama pull deepseek-r1:1.5b\n\
+ \n\
+ # Start the Gradio app\n\
+ exec python3 -u app.py\n\
+ ' > start.sh && chmod +x start.sh
+
+ # Expose the port for the Gradio web interface
+ EXPOSE 7860
+
+ # Run the application
+ ENTRYPOINT ["./start.sh"]
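With the Dockerfile above, the image can also be built and run locally; the image name below is illustrative, and the port mapping matches the `EXPOSE 7860` directive:

```shell
# Build the image from the repository root (image name is arbitrary)
docker build -t asr-eval-tool .

# Run it, mapping the Gradio port to the host
docker run --rm -p 7860:7860 asr-eval-tool
```

Note that the first run pulls the deepseek-r1:1.5b weights inside the container, so startup takes noticeably longer than subsequent runs.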
Example.jpg ADDED
README.md CHANGED
@@ -1,11 +1,116 @@
  ---
- title: Evaluate ASR
- emoji: 🏃
  colorFrom: blue
- colorTo: green
- sdk: docker
  pinned: false
- short_description: version 1.1
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: ASR Evaluation Tool
+ emoji: 🎯
  colorFrom: blue
+ colorTo: red
+ sdk: gradio
+ sdk_version: 5.16.0
+ app_file: app.py
  pinned: false
  ---
 
+ # ASR Evaluation Tool (Ver 1.1)
+
+ This Gradio app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It is particularly useful for evaluating speech recognition or machine translation outputs.
+
+ ## Features
+
+ - Calculate WER, MER, WIL, and WIP metrics
+ - Text normalization options
+ - Custom word filtering
+ - Detailed error analysis
+ - Example inputs for testing
+
+ ## How to Use
+
+ 1. Enter or paste your reference text
+ 2. Enter or paste your hypothesis text
+ 3. Configure options (normalization, word filtering)
+ 4. Click "Calculate WER" to see results
+
+ NOTE: Results may take around 30 seconds to appear, because the deepseek-r1:1.5b model is called to compute the medical term recall.
+
+ ![Example](./Example.jpg)
+
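The normalization option roughly amounts to lowercasing, stripping punctuation, and collapsing whitespace. A minimal sketch of the idea (the app itself uses jiwer's transformation pipeline, not this function):

```python
import re

def normalize(text: str) -> str:
    """Rough equivalent of the app's normalization: lowercase,
    drop punctuation (keeping apostrophes), collapse whitespace."""
    text = text.lower()
    text = re.sub(r"[^\w\s']", "", text)
    return " ".join(text.split())

print(normalize("The patient, shows  signs!"))  # the patient shows signs
```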
+ ## Local Development
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/wer-evaluation-tool.git
+ cd wer-evaluation-tool
+ ```
+
+ 2. Create and activate a virtual environment using `uv`:
+ ```bash
+ uv venv
+ source .venv/bin/activate  # On Unix/macOS
+ # or
+ .venv\Scripts\activate     # On Windows
+ ```
+
+ 3. Install dependencies:
+ ```bash
+ uv pip install -r requirements.txt
+ ```
+
+ 4. Run the app locally:
+ ```bash
+ uv run python app.py
+ ```
+
+ ## Installation
+
+ You can install the package directly from PyPI:
+
+ ```bash
+ uv pip install wer-evaluation-tool
+ ```
+
+ ## Testing
+
+ Run the test suite using pytest:
+
+ ```bash
+ uv run pytest tests/
+ ```
+
+ ## Contributing
+
+ 1. Fork the repository
+ 2. Create a new branch (`git checkout -b feature/improvement`)
+ 3. Make your changes
+ 4. Run tests to ensure everything works
+ 5. Commit your changes (`git commit -am 'Add new feature'`)
+ 6. Push to the branch (`git push origin feature/improvement`)
+ 7. Create a Pull Request
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - Thanks to all contributors who have helped with the development
+ - Inspired by the need for better speech recognition evaluation tools
+ - Built with [Gradio](https://gradio.app/)
+
+ ## Contact
+
+ For questions or feedback, please:
+ - Open an issue in the GitHub repository
+ - Contact the maintainers at [email/contact information]
+
+ ## Citation
+
+ If you use this tool in your research, please cite:
+
+ ```bibtex
+ @software{wer_evaluation_tool,
+   title  = {WER Evaluation Tool},
+   author = {Your Name},
+   year   = {2024},
+   url    = {https://github.com/yourusername/wer-evaluation-tool}
+ }
+ ```
app.py ADDED
@@ -0,0 +1,333 @@
+ import gradio as gr
+ import jiwer
+ import pandas as pd
+ import logging
+ from typing import List, Optional, Tuple, Dict
+ from ollama import Client
+
+ # Set up logging configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     force=True,
+     handlers=[
+         logging.StreamHandler(),
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ def calculate_wer_metrics(
+     hypothesis: str,
+     reference: str,
+     normalize: bool = True,
+     words_to_filter: Optional[List[str]] = None
+ ) -> Dict:
+     """
+     Calculate WER metrics between hypothesis and reference texts.
+
+     Args:
+         hypothesis (str): The hypothesis text
+         reference (str): The reference text
+         normalize (bool): Whether to normalize texts before comparison
+         words_to_filter (List[str], optional): Words to filter out before comparison
+
+     Returns:
+         dict: Dictionary containing WER metrics
+
+     Raises:
+         ValueError: If inputs are invalid or result in empty text after processing
+     """
+     logger.info(f"Calculating WER metrics with inputs - Hypothesis: {hypothesis}, Reference: {reference}")
+
+     # Validate inputs
+     if not hypothesis.strip() or not reference.strip():
+         raise ValueError("Both hypothesis and reference texts must contain non-empty strings")
+
+     if normalize:
+         # Define basic transformations
+         basic_transform = jiwer.Compose([
+             jiwer.ExpandCommonEnglishContractions(),
+             jiwer.ToLowerCase(),
+             jiwer.RemoveMultipleSpaces(),
+             jiwer.RemovePunctuation(),
+             jiwer.Strip(),
+             jiwer.ReduceToListOfListOfWords()
+         ])
+
+         if words_to_filter and any(words_to_filter):
+             filter_set = {w.lower() for w in words_to_filter}
+
+             # basic_transform ends with ReduceToListOfListOfWords, so this
+             # transform receives a list of sentences, each a list of words.
+             def filter_words_transform(sentences: List[List[str]]) -> List[List[str]]:
+                 filtered = [[word for word in sentence if word.lower() not in filter_set]
+                             for sentence in sentences]
+                 if not any(filtered):
+                     raise ValueError("Text is empty after filtering words")
+                 return filtered
+
+             transformation = jiwer.Compose([
+                 basic_transform,
+                 filter_words_transform
+             ])
+         else:
+             transformation = basic_transform
+
+         # Pre-check the transformed text
+         try:
+             transformed_ref = transformation(reference)
+             transformed_hyp = transformation(hypothesis)
+             if not transformed_ref or not transformed_hyp:
+                 raise ValueError("Text is empty after normalization")
+             logger.debug(f"Transformed reference: {transformed_ref}")
+             logger.debug(f"Transformed hypothesis: {transformed_hyp}")
+         except Exception as e:
+             logger.error(f"Transformation error: {str(e)}")
+             raise ValueError(f"Error during text transformation: {str(e)}")
+
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis,
+             truth_transform=transformation,
+             hypothesis_transform=transformation
+         )
+     else:
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis
+         )
+
+     return measures
+
+ # Initialize Ollama client
+ client = Client(host='http://localhost:11434')
+
+ def extract_medical_terms(text: str) -> List[str]:
+     """
+     Extract medical terms from text using the deepseek-r1:1.5b model via Ollama.
+
+     Args:
+         text (str): Input text
+
+     Returns:
+         List[str]: List of extracted medical terms
+     """
+     prompt = f"""Extract all medical terms from the following text.
+     Return only the medical terms as a comma-separated list.
+     Text: {text}"""
+
+     try:
+         response = client.generate(
+             model='deepseek-r1:1.5b',
+             prompt=prompt,
+             stream=False
+         )
+
+         # Get the response text
+         response_text = response['response']
+
+         # Remove the model's thinking process, if present
+         if '<think>' in response_text and '</think>' in response_text:
+             # Keep everything after </think>
+             medical_terms_text = response_text.split('</think>')[-1].strip()
+         else:
+             medical_terms_text = response_text
+
+         # Parse the comma-separated response
+         medical_terms = [term.strip() for term in medical_terms_text.split(',')]
+         # Drop empty terms and leftover tags
+         return [term for term in medical_terms if term and not term.startswith('<') and not term.endswith('>')]
+
+     except Exception as e:
+         logger.error(f"Error in medical term extraction: {str(e)}")
+         return []
+
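The `</think>` stripping above is needed because deepseek-r1 emits its chain of thought before the answer. The parsing step in isolation (the sample response text below is made up):

```python
def parse_terms(response_text: str) -> list:
    """Parse a comma-separated model answer, dropping any
    deepseek-style <think>...</think> preamble and leftover tags."""
    if '<think>' in response_text and '</think>' in response_text:
        response_text = response_text.split('</think>')[-1].strip()
    terms = [t.strip() for t in response_text.split(',')]
    return [t for t in terms if t and not t.startswith('<') and not t.endswith('>')]

print(parse_terms("<think>scanning for drug names...</think> aspirin, warfarin"))  # ['aspirin', 'warfarin']
```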
+ def calculate_medical_recall(
+     hypothesis_terms: List[str],
+     reference_terms: List[str]
+ ) -> float:
+     """
+     Calculate medical term recall rate.
+
+     Args:
+         hypothesis_terms (List[str]): Medical terms from hypothesis
+         reference_terms (List[str]): Medical terms from reference
+
+     Returns:
+         float: Recall rate
+     """
+     if not reference_terms:
+         return 1.0 if not hypothesis_terms else 0.0
+
+     correct_terms = set(hypothesis_terms) & set(reference_terms)
+     return len(correct_terms) / len(set(reference_terms))
+
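Recall here is the fraction of unique reference terms that the hypothesis recovers; restated standalone for illustration:

```python
def recall(hyp_terms, ref_terms):
    """Fraction of unique reference terms present in the hypothesis."""
    if not ref_terms:
        return 1.0 if not hyp_terms else 0.0
    ref = set(ref_terms)
    return len(set(hyp_terms) & ref) / len(ref)

print(recall(["hypertension", "aspirin"], ["hypertension", "diabetes"]))  # 0.5
```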
+ def process_inputs(
+     reference: str,
+     hypothesis: str,
+     normalize: bool,
+     words_to_filter: str
+ ) -> Tuple[str, str, str, str]:
+     """
+     Process inputs and calculate both WER and medical term recall metrics.
+
+     Args:
+         reference (str): Reference text
+         hypothesis (str): Hypothesis text
+         normalize (bool): Whether to normalize text
+         words_to_filter (str): Comma-separated words to filter
+
+     Returns:
+         Tuple[str, str, str, str]: HTML-formatted main metrics, error analysis,
+             metric explanations, and an error message (empty on success)
+     """
+     if not reference or not hypothesis:
+         return "Please provide both reference and hypothesis texts.", "", "", ""
+
+     try:
+         # Extract medical terms
+         reference_terms = extract_medical_terms(reference)
+         hypothesis_terms = extract_medical_terms(hypothesis)
+
+         # Calculate medical recall
+         med_recall = calculate_medical_recall(hypothesis_terms, reference_terms)
+
+         # Calculate WER metrics
+         filter_words = [word.strip() for word in words_to_filter.split(",")] if words_to_filter else None
+         measures = calculate_wer_metrics(
+             hypothesis=hypothesis,
+             reference=reference,
+             normalize=normalize,
+             words_to_filter=filter_words
+         )
+
+         # Format metrics
+         metrics_df = pd.DataFrame({
+             'Metric': ['WER', 'MER', 'WIL', 'WIP', 'Medical Term Recall'],
+             'Value': [
+                 f"{measures['wer']:.3f}",
+                 f"{measures['mer']:.3f}",
+                 f"{measures['wil']:.3f}",
+                 f"{measures['wip']:.3f}",
+                 f"{med_recall:.3f}"
+             ]
+         })
+
+         # Format error analysis
+         error_df = pd.DataFrame({
+             'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
+             'Count': [
+                 measures['substitutions'],
+                 measures['deletions'],
+                 measures['insertions'],
+                 measures['hits']
+             ]
+         })
+
+         # Format medical terms comparison
+         med_terms_df = pd.DataFrame({
+             'Source': ['Reference', 'Hypothesis'],
+             'Medical Terms': [
+                 ', '.join(reference_terms),
+                 ', '.join(hypothesis_terms)
+             ]
+         })
+
+         metrics_html = metrics_df.to_html(index=False)
+         error_html = error_df.to_html(index=False)
+         med_terms_html = med_terms_df.to_html(index=False)
+
+         explanation = f"""
+         <h3>Metrics Explanation:</h3>
+         <ul>
+             <li><b>WER (Word Error Rate)</b>: Substitutions, deletions, and insertions divided by the number of reference words</li>
+             <li><b>MER (Match Error Rate)</b>: Errors divided by the total number of word matches and errors</li>
+             <li><b>WIL (Word Information Lost)</b>: The proportion of word information that was lost</li>
+             <li><b>WIP (Word Information Preserved)</b>: The proportion of word information that was preserved</li>
+             <li><b>Medical Term Recall</b>: The proportion of reference medical terms that were correctly identified in the hypothesis</li>
+         </ul>
+         <h3>Extracted Medical Terms:</h3>
+         {med_terms_html}
+         """
+
+         return metrics_html, error_html, explanation, ""
+
+     except Exception as e:
+         error_msg = f"Error in processing: {str(e)}"
+         logger.error(error_msg)
+         return "", "", "", error_msg
+
+ def load_example() -> Tuple[str, str]:
+     """Load example texts for demonstration."""
+     return (
+         "The patient shows signs of heart attack and hypertension.",
+         "The patient shows signs of heart attack and high blood pressure."
+     )
+
+ def create_interface() -> gr.Blocks:
+     """Create the Gradio interface."""
+     with gr.Blocks(title="WER Evaluation Tool") as interface:
+         gr.Markdown("# Word Error Rate (WER) Evaluation Tool")
+         gr.Markdown(
+             "This tool helps you evaluate the Word Error Rate (WER) between a reference "
+             "text and a hypothesis text. WER is commonly used in speech recognition and "
+             "machine translation evaluation."
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 reference = gr.Textbox(
+                     label="Reference Text",
+                     placeholder="Enter the reference text here...",
+                     lines=5
+                 )
+             with gr.Column():
+                 hypothesis = gr.Textbox(
+                     label="Hypothesis Text",
+                     placeholder="Enter the hypothesis text here...",
+                     lines=5
+                 )
+
+         with gr.Row():
+             normalize = gr.Checkbox(
+                 label="Normalize text (lowercase, remove punctuation)",
+                 value=True
+             )
+             words_to_filter = gr.Textbox(
+                 label="Words to filter (comma-separated)",
+                 placeholder="e.g., um, uh, ah"
+             )
+
+         with gr.Row():
+             example_btn = gr.Button("Load Example")
+             calculate_btn = gr.Button("Calculate WER", variant="primary")
+
+         with gr.Row():
+             metrics_output = gr.HTML(label="Main Metrics")
+             error_output = gr.HTML(label="Error Analysis")
+
+         explanation_output = gr.HTML()
+         error_msg_output = gr.HTML()
+
+         # Event handlers
+         example_btn.click(
+             load_example,
+             outputs=[reference, hypothesis]
+         )
+
+         calculate_btn.click(
+             process_inputs,
+             inputs=[reference, hypothesis, normalize, words_to_filter],
+             outputs=[metrics_output, error_output, explanation_output, error_msg_output]
+         )
+
+     return interface
+
+ if __name__ == "__main__":
+     logger.info("Application started")
+     app = create_interface()
+     # Explicitly configure Gradio to be reachable from outside the container
+     app.launch(
+         server_name="0.0.0.0",  # Bind to all interfaces
+         server_port=7860,
+         share=False,  # Don't create a public URL
+         debug=True    # Enable debug mode for more information
+     )
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio==5.16.0
+ jiwer==3.1.0
+ pandas==2.2.0
+ ollama==0.4.5