"""Gradio demo that post-corrects raw Yiddish HTR output with a ByT5 model.

Loads a fine-tuned seq2seq checkpoint from the Hugging Face Hub at import
time, exposes a single correction function, and serves it through a small
Gradio Blocks interface.
"""

import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load model and tokenizer once at start-up so every request reuses them.
model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Task instruction prepended to every input. Kept byte-identical to the
# original wording: it is part of what the model actually receives.
_TASK_PROMPT = (
    "Correct the following raw Yiddish HTR output from Transkribus into "
    "authentic Yiddish correspondence. Keep period-specific spellings, "
    "regional variations, and Hebrew-origin words as they were historically "
    "used. Fix only clear OCR errors, such as broken words or nonsensical "
    "combinations. Retain proper nouns, place names, abbreviations, and "
    "informal language. Maintain original line breaks and formatting:\n"
)


def correct_transcription(input_text: str) -> str:
    """Return the model's corrected version of a raw transcription.

    Args:
        input_text: Raw transcription text entered by the user.

    Returns:
        The decoded model output with special tokens removed, or an empty
        string when ``input_text`` is empty or only whitespace.
    """
    # Without this guard the model would generate from the instruction prompt
    # alone and present that output as a "correction" of nothing.
    if not input_text or not input_text.strip():
        return ""

    # NOTE(review): truncation=True relies on the tokenizer's configured
    # maximum length; confirm this checkpoint sets one, and that the byte-level
    # prompt leaves enough room for the user's text.
    input_ids = tokenizer(
        _TASK_PROMPT + input_text,
        return_tensors="pt",
        truncation=True,
    ).input_ids
    output_ids = model.generate(input_ids, max_length=512)
    # generate() returns a batch; there is exactly one sequence here.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Gradio interface: input and output boxes side by side, button underneath.
with gr.Blocks() as interface:
    gr.Markdown("### Yiddish Transcription Correction")
    with gr.Row():
        input_box = gr.Textbox(
            label="Raw Transcription (Hebrew Script)",
            lines=1,
            rtl=True,
            elem_id="input_box",
        )
        output_box = gr.Textbox(
            label="Corrected Transcription (Hebrew Script)",
            lines=1,
            rtl=True,
            elem_id="output_box",
        )
    submit_button = gr.Button("Correct")
    submit_button.click(
        correct_transcription,
        inputs=[input_box],
        outputs=[output_box],
    )

# Launch the interface (left at module level to match the original script).
interface.launch()