Spaces:

MarineLives
/

yiddish-transcription-correction

Sleeping

App Files Files Community

Addaci committed on Dec 9, 2024

Commit

5ba8131

verified ·

1 Parent(s): 541340c

Update app.py

Browse files

Changed app.py to include right-to-left (RTL) display settings for the text boxes and to prepend a short system prompt to the input:
prompt = "Correct the following Yiddish transcription to groundtruth Yiddish in Hebrew script:\n"

Files changed (1) hide show

app.py +21 -41

app.py CHANGED Viewed

@@ -1,50 +1,30 @@
 import gradio as gr
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-# Load the model and tokenizer
 model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-# Define the function to correct raw Yiddish text
-def correct_yiddish(raw_text):
-    # System prompt to guide the model
-    system_prompt = (
-        "Correct the following raw Yiddish HTR output from Transkribus into historically authentic Yiddish correspondence:\n\n"
-        "1. Maintain Historical Features:\n"
-        "- Preserve period-specific spellings and colloquialisms.\n"
-        "2. Correct Only Clear Errors:\n"
-        "- Fix obvious OCR misrecognitions, broken words, and nonsensical character combinations.\n"
-        "3. Preserve Contextual Accuracy:\n"
-        "- Keep original spellings of proper nouns, place names, and historical abbreviations.\n"
-        "4. Document Integrity:\n"
-        "- Respect original line breaks, spacing, and paragraph formatting.\n\n"
-        "Raw text:\n"
-    )
-    full_input = system_prompt + raw_text
-    # Tokenize the input text
-    inputs = tokenizer(full_input, return_tensors="pt", truncation=True, max_length=512)
-    # Generate the corrected text
-    outputs = model.generate(**inputs)
-    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return corrected_text
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=correct_yiddish,
-    inputs=gr.Textbox(label="Raw Yiddish Text", placeholder="Enter raw Yiddish text here..."),
-    outputs=gr.Textbox(label="Corrected Yiddish Text"),
-    title="Yiddish Text Corrector",
-    description=(
-        "This tool uses a fine-tuned ByT5 model to correct raw Yiddish text into historically authentic Yiddish correspondence. "
-        "Enter raw text in the input box, and the corrected output will appear below."
-    ),
-)
-# Launch the Gradio app
-if __name__ == "__main__":
-    interface.launch()

 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# Load model and tokenizer
 model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# Define the correction function
+def correct_transcription(input_text):
+    # Add task instruction to the input
+    prompt = "Correct the following Yiddish transcription to groundtruth Yiddish in Hebrew script:\n"
+    input_ids = tokenizer(prompt + input_text, return_tensors="pt", truncation=True).input_ids
+    output_ids = model.generate(input_ids, max_length=512)
+    corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return corrected_text
+# Gradio Interface
+with gr.Blocks() as interface:
+    gr.Markdown("### Yiddish Transcription Correction")
+    with gr.Row():
+        input_box = gr.Textbox(label="Raw Transcription (Hebrew Script)", lines=1, rtl=True, elem_id="input_box")
+        output_box = gr.Textbox(label="Corrected Transcription (Hebrew Script)", lines=1, rtl=True, elem_id="output_box")
+    submit_button = gr.Button("Correct")
+    submit_button.click(correct_transcription, inputs=[input_box], outputs=[output_box])
+# Launch the interface
+interface.launch()