# Scraped file-view header (not part of the program):
# uploader: speech-test | commit 47e279a "Initial commit" | size 5.21 kB
import json
import re
from pathlib import Path
import requests
import streamlit as st
import yaml
from huggingface_hub import hf_hub_download
from streamlit_tags import st_tags
# Locates a metadata block delimited by `---` lines: an opening `---`, one or
# more line breaks, the lazily-captured body (group 1), one or more line
# breaks, a closing `---` and a trailing line break.
# Exact same regex as in the Hub server. Please keep in sync.
REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
# Lookup table used to turn a language code into a display name. It is loaded
# once at import time from `languages.json`, resolved relative to the current
# working directory. The encoding is given explicitly so the file is decoded
# the same way regardless of the platform's locale default.
with open("languages.json", encoding="utf-8") as f:
    lang2name = json.load(f)
def try_parse_yaml(yaml_block):
    """Parse a YAML metadata block with the safe loader.

    Args:
        yaml_block: The YAML text to parse.

    Returns:
        The parsed object on success, or ``None`` when parsing raised a
        ``yaml.YAMLError``. In the failure case a notice is printed and an
        error message is shown through ``st.error``.
    """
    try:
        return yaml.load(yaml_block, yaml.SafeLoader)
    except yaml.YAMLError as err:
        print("Error while parsing the metadata YAML:")
        if not hasattr(err, "problem_mark"):
            # No position information available: fall back to a generic hint.
            message = (
                "Something went wrong while parsing the metadata. "
                "Make sure it's written according to the YAML spec!"
            )
        else:
            # Position, then the problem, then (only if present) its context.
            pieces = [str(err.problem_mark), "\n ", str(err.problem)]
            if err.context is not None:
                pieces.extend([" ", str(err.context)])
            pieces.append("\nPlease correct the README.md and retry.")
            message = "".join(pieces)
        st.error(message)
        return None
def _as_list(value):
    """Normalise a metadata field: ``None`` -> ``[]``, a list unchanged, else ``[value]``."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def main():
    """Render the metadata editing page.

    Steps, in order:
      1. Ask for a model id, download its ``README.md`` and extract the
         metadata block matched by ``REGEX_YAML_BLOCK``.
      2. Parse the block with ``try_parse_yaml`` and display the result.
      3. Show editable widgets for the languages, the training datasets and
         the model name, pre-filled from the parsed metadata.

    Any unrecoverable problem (empty input, download failure, missing or
    unparsable metadata) is reported with ``st.error`` where applicable and
    ends the run through ``st.stop()``.
    """
    st.markdown("## 1. Load your model's metadata")
    st.markdown("Enter your model's path below.")
    # Strip surrounding whitespace so a stray space does not end up in the
    # download request or in the default model name.
    model_id = st.text_input("", placeholder="<username>/<model>").strip()
    if not model_id:
        st.stop()

    try:
        readme_path = hf_hub_download(model_id, filename="README.md")
    except requests.exceptions.HTTPError:
        st.error(
            f"ERROR: https://huggingface.co/{model_id}/blob/main/README.md "
            f"not found, make sure you've entered a correct model path!"
        )
        st.stop()

    # Decode explicitly as UTF-8 instead of relying on the locale default.
    content = Path(readme_path).read_text(encoding="utf-8")
    match = REGEX_YAML_BLOCK.search(content)
    if match:
        meta_yaml = match.group(1)
    else:
        st.error(
            "ERROR: Couldn't find the metadata section inside your model's `README.md`. Do you have some basic metadata "
            "enclosed in `---` as described in [the Hub documentation](https://huggingface.co/docs/hub/model-repos#model-card-metadata)?"
        )
        st.stop()

    metadata = try_parse_yaml(meta_yaml)
    if metadata is None:
        st.stop()
    if not isinstance(metadata, dict):
        # Valid YAML that is not a mapping (e.g. a bare string or a list)
        # cannot be edited key by key below.
        st.error(
            "ERROR: The metadata section of your model's `README.md` is not a YAML mapping. "
            "Please correct the README.md and retry."
        )
        st.stop()
    st.success("Successfully loaded the metadata!")
    with st.expander("Inspect the parsed metadata for debugging"):
        st.json(metadata)

    st.markdown("## 2. Edit the data")

    ############################
    # LANGUAGES
    ############################
    st.markdown("### Language(s)")
    st.markdown(
        "For each spoken language that your model handles, enter an "
        "[ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) language code, or "
        "find an appropriate alternative from "
        "[our list here](https://huggingface.co/spaces/huggingface/hf-speech-bench/blob/main/languages.json). "
        "When in doubt, use the most generic language code, e.g. `en` instead of `en-GB` and `en-US`."
    )
    st.markdown("*Example*: `cs, hsb, pl`")
    # A missing or null field becomes an empty list; a single value is wrapped.
    metadata["language"] = _as_list(metadata.get("language"))
    languages = st_tags(
        label="", text="add more if needed, and press enter", value=metadata["language"]
    )
    # Codes without an entry in the lookup table are shown unchanged.
    lang_names = [lang2name.get(lang, lang) for lang in languages]
    st.markdown("These languages will be parsed by the leaderboard as: ")
    st.code(", ".join(lang_names))

    ############################
    # TRAIN DATASETS
    ############################
    st.markdown("### Training dataset(s)")
    st.markdown("List the datasets that your model was trained on.")
    st.markdown("*Example*: `librispeech_asr, mozilla-foundation/common_voice_8_0`")
    # Same normalisation as for the languages, so a single dataset given as a
    # plain string is passed to the widget as a one-element list.
    metadata["datasets"] = _as_list(metadata.get("datasets"))
    train_datasets = st_tags(
        label="", text="add more if needed, and press enter", value=metadata["datasets"]
    )
    if "common_voice" in train_datasets:
        st.warning(
            "WARNING: `common_voice` is deprecated, please replace it with its equivalent: "
            "`mozilla-foundation/common_voice_6_1`"
        )

    ############################
    # MODEL NAME
    ############################
    st.markdown("### Model name")
    st.markdown("Enter a descriptive name for your model.")
    st.markdown("*Example*: `XLS-R Wav2Vec2 LM Spanish by Jane Doe`")
    # NOTE(review): the Hub model card spec appears to spell this key
    # `model-index` (with a hyphen) - confirm before relying on `model_index`.
    if not metadata.get("model_index"):
        # Covers a missing key as well as a null or empty value.
        metadata["model_index"] = [{}]
    # Look the name up in the metadata entry itself. The previous check tested
    # against the literal string "model_index", which is always true, so an
    # existing name was always overwritten.
    if "name" not in metadata["model_index"][0]:
        metadata["model_index"][0]["name"] = model_id.split("/")[-1]
    model_name = st.text_input("", value=metadata["model_index"][0]["name"])

    ############################
    # EVAL DATASETS
    ############################
    st.markdown("### Evaluation metrics")
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()