from pprint import pformat

import gradio as gr
import librosa
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline

# Hugging Face Hub repository identifier. It is used twice below: as the
# `repo_id` the language-model file is downloaded from, and as the
# `model_path` handed to the pipeline.
HF_HUB_URL = "ales/wav2vec2-cv-be"
# Name of the language-model file inside that repository (passed as `filename`
# to `hf_hub_download`).
LM_HUB_FP = "language_model/cv8be_5gram.bin"
# Sampling rate (Hz) that `main` asks librosa to load every audio file at.
MODEL_SAMPLING_RATE = 16_000  # 16kHz

# Fetch the language-model file from the Hub. This runs at import time, so
# merely importing this module triggers the call. The return value is used
# below as the language-model file path.
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

# Build the speech-recognition pipeline once, at import time; `main` reuses
# this single module-level instance for every request.
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)


def main(recorded_audio_fp: str | None, uploaded_audio_fp: str | None):
    audio_fp = None
    if recorded_audio_fp is not None:
        audio_fp = recorded_audio_fp
        used_audiofile = "recorded"
    elif uploaded_audio_fp is not None:
        audio_fp = uploaded_audio_fp
        used_audiofile = "uploaded"
    else:
        return (
            "Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.",
            "Error! You have to either record or upload an audiofile.",
        )

    # read audio file
    inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]

    # recognize speech
    pipeline_res = pipeline(inputs=inputs)
    text = pipeline_res["text"][0]  # unpack batch of size 1

    # add technical information to the output
    tech_data = pipeline_res
    del tech_data["text"]
    tech_data["used_audiofile"] = used_audiofile
    tech_data["recorded_file_present"] = recorded_audio_fp is not None
    tech_data["uploaded_file_present"] = uploaded_audio_fp is not None
    tech_data["audiofile_path"] = audio_fp
    tech_data["model_sampling_rate"] = MODEL_SAMPLING_RATE
    tech_data["inputs_shape"] = inputs.shape
    tech_data["inputs_max"] = inputs.max().item()
    tech_data["inputs_min"] = inputs.min().item()

    tech_data_str = pformat(tech_data)

    return text, tech_data_str


# Markdown text passed to `gr.Interface` as its `article` argument: a link to
# the model page followed by a page-visits badge image. The leading newline and
# the blank line between the two entries are part of the string.
article = (
    "\n"
    "The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)\n"
    "\n"
    "![Page Visits](https://visitor-badge.glitch.me/badge?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm&left_color=darkgray&right_color=crimson&left_text=Page%20Visits)\n"
)

# Input widgets. Both hand a file path to `main`: the first one only accepts a
# microphone recording, the second one only accepts an uploaded file.
# The labels are user-facing strings.
_recorded_audio_input = gr.Audio(
    sources=["microphone"],
    type="filepath",
    label="Запішыце аўдыяфайл, каб распазнаць маўленьне",
)
_uploaded_audio_input = gr.Audio(
    sources=["upload"],
    type="filepath",
    label="Альбо загрузіце ўжо запісаны аўдыяфайл сюды",
)

# Output widgets, in the order of the two values returned by `main`.
_recognized_text_output = gr.Textbox(label="Распазнаны тэкст")
_tech_info_output = gr.Textbox(label="Тэхнічная інфармацыя")

_description = (
    "Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n"
    "Акустычная мадэль + моўная мадэль."
)

# Wire the handler, widgets and descriptive texts together.
iface = gr.Interface(
    fn=main,
    inputs=[_recorded_audio_input, _uploaded_audio_input],
    outputs=[_recognized_text_output, _tech_info_output],
    title="wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model",
    description=_description,
    article=article,
)

# Start the app as soon as the module is executed.
iface.launch()