|
from dataclasses import dataclass
from enum import Enum
|
|
|
@dataclass
class Task:
    """Describe one benchmark entry as a (benchmark, metric, column name) triple.

    Instances are created positionally in that field order.
    """

    # Benchmark identifier string.
    benchmark: str
    # Metric identifier string for that benchmark.
    metric: str
    # Presumably the label shown for this task's column — confirm with the
    # code that consumes this class.
    col_name: str
|
|
|
|
|
|
|
|
|
class Tasks(Enum):
    """Enumerate the tasks defined by this module.

    Each member's value is a ``Task`` built positionally as
    (benchmark, metric, col_name).
    """

    # NOTE(review): these two entries do not match the speech benchmarks
    # described in INTRODUCTION_TEXT; they look like leftover placeholders —
    # confirm against the consumers of this enum before changing them.
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")
|
|
|
# Few-shot count, fixed at 0 here.
# NOTE(review): not read anywhere in the visible part of this file — confirm
# which consumer uses it before changing the value.
NUM_FEWSHOT = 0
|
|
|
|
|
|
|
|
|
|
|
# Page title as a centered HTML <h1> element with id "space-title".
TITLE = """<h1 align="center" id="space-title">Hebrew Speech Recognition Leaderboard</h1>"""
|
|
|
|
|
# Markdown introduction: what the leaderboard is, who maintains it, why it
# exists, and a description (size, domain, source) of each benchmark dataset.
# The Hebrew channel name in the last entry is stored as real UTF-8 text; it
# was previously mis-decoded into unrelated characters.
INTRODUCTION_TEXT = """
Welcome to the Hebrew Speech Recognition Leaderboard! This is a community-driven effort to track and compare the performance
of various speech recognition models on Hebrew language tasks.

This leaderboard is maintained by [ivrit.ai](https://ivrit.ai), a project dedicated to advancing Hebrew language AI technologies.
You can find our work on [GitHub](https://github.com/ivrit-ai) and [Hugging Face](https://huggingface.co/ivrit-ai).

## Motivation
Hebrew presents unique challenges for speech recognition due to its rich morphology, absence of written vowels, and diverse
dialectal variations. This leaderboard aims to:
- Provide standardized benchmarks for Hebrew ASR evaluation
- Track progress in Hebrew speech recognition technology
- Foster collaboration in the Hebrew NLP community
- Make Hebrew speech technology more accessible

## Benchmarks
The following datasets are used in our evaluation:

### [ivrit-ai/eval-d1](https://huggingface.co/datasets/ivrit-ai/eval-d1)
- **Size**: 2 hours
- **Domain**: Manual transcription of a single podcast episode featuring an informal conversation between two speakers (male and female). Audio is segmented into approximately 5-minute chunks.
- **Source**: Part of the ivrit.ai corpus. Selected episode has been manually transcribed to golden standard quality to serve as a high-quality evaluation benchmark.

### [ivrit-ai/saspeech](https://huggingface.co/datasets/ivrit-ai/saspeech)
- **Size**: 4 hours (manually corrected portion of the corpus)
- **Domain**: Economic and political podcast content, containing both read speech and conversational segments. Segments are several seconds in length.
- **Source**: Derived from the [Robo-Shaul project](https://www.roboshaul.com/) and published in the paper
"SASPEECH: A Hebrew Single Speaker Dataset for Text To Speech and Voice Conversion" (Sharoni, O., Shenberg, R., Cooper, E. (2023) SASPEECH: A Hebrew Single Speaker Dataset for Text To Speech and Voice Conversion. Proc. INTERSPEECH 2023,)

### [google/fleurs/he](https://huggingface.co/datasets/google/fleurs)
- **Size**: 2 hours (test set of the corpus)
- **Domain**: Read speech covering common topics and phrases in Hebrew
- **Source**: Created as part of Google's FLEURS project, designed for multilingual speech tasks and evaluation. Data collected through crowdsourcing from Hebrew speakers.

### [mozilla-foundation/common_voice_17_0/he](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0)
- **Size**: 2 hours (validated set of the corpus)
- **Domain**: Read sentences in Hebrew from various texts.
- **Source**: Collected through Mozilla's Common Voice initiative, where volunteers contribute recordings and validate other speakers' contributions

### [imvladikon/hebrew_speech_kan](https://huggingface.co/datasets/imvladikon/hebrew_speech_kan)
- **Size**: 1.7 hours (validation set of the corpus)
- **Domain**: Varied content types from the Kan (Israeli Public Broadcasting Corporation) youtube channel
- **Source**: Published by Vladimir Gurevich. Scraped audio and subtitles data from YouTube channel "כאן" (Kan).
"""
|
|
|
|
|
# Markdown "about" text: how models are scored (WER averaged across the
# benchmarks, computed with jiwer) and how to reproduce an evaluation with the
# evaluate_model.py script, including one worked command-line example.
LLM_BENCHMARKS_TEXT = """
## How it works
Models are evaluated using Word Error Rate (WER) on each benchmark dataset. The final score is an average of WER across all benchmarks,
with lower scores indicating better performance.

Specifically, evaluation is done using the [jiwer](https://github.com/jitsi/jiwer) library.
Source code for the evaluation can be found [here](https://github.com/ivrit-ai/asr-training/blob/master/evaluate_model.py).

## Reproducibility
To evaluate your model on these benchmarks, you can use our evaluation script as follows:

```bash
./evaluate_model.py --engine <engine> --model <model> --dataset <dataset:split:column> [--name <name>] [--workers <num_workers>]
```

For example, here's how to evaluate ivrit-ai/faster-whisper-v2-d4 on the google/fleurs/he dataset:

```bash
./evaluate_model.py --engine faster-whisper --model ivrit-ai/faster-whisper-v2-d4 --name he_il --dataset google/fleurs:test:transcription --workers 1
```

"""
|
|
|
# Markdown submission guide: what a submitter must provide (inference script
# or a patch to evaluate_model.py, a public model, a filled-in model card) and
# what to check when an evaluation fails. The embedded Python example keeps
# its body indentation so the snippet is valid when copied.
EVALUATION_QUEUE_TEXT = """
## Submitting a model for evaluation

### 1) Provide an inference script
To evaluate your model, we need either:

a) A simple inference script that takes audio input and returns transcribed text:
```python
def transcribe(audio_path: str) -> str:
    # Your model loading and inference code here
    return transcribed_text
```

b) Or augment our evaluate_model.py script with your model's implementation.

### 2) Make sure your model is publicly accessible
Your model should be available on the Hugging Face Hub with:
- Public visibility
- Clear licensing information
- Basic model card documentation

### 3) Fill up your model card
Please include in your model card:
- Model architecture
- Training data description
- Licensing information
- Any special preprocessing requirements
- Expected input format (sampling rate, audio format, etc.)

## In case of evaluation failure
If your model evaluation fails, please:
1. Check that your model can be loaded and run locally
2. Verify your inference script works with our benchmark format
3. Ensure all dependencies are clearly specified
4. Contact us through GitHub issues if problems persist
"""
|
|
|
# Caption text that accompanies the citation snippet defined below.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
# BibTeX entry for the ivrit.ai paper (arXiv 2307.08720). Kept as a raw string
# so any backslashes added to the entry later are not treated as escapes.
CITATION_BUTTON_TEXT = r"""
@misc{marmor2023ivritai,
  title={ivrit.ai: A Comprehensive Dataset of Hebrew Speech for AI Research and Development},
  author={Yanir Marmor and Kinneret Misgav and Yair Lifshitz},
  year={2023},
  eprint={2307.08720},
  archivePrefix={arXiv},
  primaryClass={eess.AS}
}
"""
|
|