Addaci committed on
Commit
5ba8131
·
verified ·
1 Parent(s): 541340c

Update app.py

Browse files

Changed app.py to display text right-to-left (RTL) and to prepend a short system prompt to the input:
prompt = "Correct the following Yiddish transcription to groundtruth Yiddish in Hebrew script:\n"

Files changed (1) hide show
  1. app.py +21 -41
app.py CHANGED
@@ -1,50 +1,30 @@
1
  import gradio as gr
2
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
 
4
- # Load the model and tokenizer
5
  model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
6
  tokenizer = AutoTokenizer.from_pretrained(model_name)
7
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
8
 
9
- # Define the function to correct raw Yiddish text
10
- def correct_yiddish(raw_text):
11
- # System prompt to guide the model
12
- system_prompt = (
13
- "Correct the following raw Yiddish HTR output from Transkribus into historically authentic Yiddish correspondence:\n\n"
14
- "1. Maintain Historical Features:\n"
15
- "- Preserve period-specific spellings and colloquialisms.\n"
16
- "2. Correct Only Clear Errors:\n"
17
- "- Fix obvious OCR misrecognitions, broken words, and nonsensical character combinations.\n"
18
- "3. Preserve Contextual Accuracy:\n"
19
- "- Keep original spellings of proper nouns, place names, and historical abbreviations.\n"
20
- "4. Document Integrity:\n"
21
- "- Respect original line breaks, spacing, and paragraph formatting.\n\n"
22
- "Raw text:\n"
23
- )
24
- full_input = system_prompt + raw_text
25
-
26
- # Tokenize the input text
27
- inputs = tokenizer(full_input, return_tensors="pt", truncation=True, max_length=512)
28
-
29
- # Generate the corrected text
30
- outputs = model.generate(**inputs)
31
- corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
32
-
33
  return corrected_text
34
 
35
- # Create the Gradio interface
36
- interface = gr.Interface(
37
- fn=correct_yiddish,
38
- inputs=gr.Textbox(label="Raw Yiddish Text", placeholder="Enter raw Yiddish text here..."),
39
- outputs=gr.Textbox(label="Corrected Yiddish Text"),
40
- title="Yiddish Text Corrector",
41
- description=(
42
- "This tool uses a fine-tuned ByT5 model to correct raw Yiddish text into historically authentic Yiddish correspondence. "
43
- "Enter raw text in the input box, and the corrected output will appear below."
44
- ),
45
 
46
- )
47
-
48
- # Launch the Gradio app
49
- if __name__ == "__main__":
50
- interface.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
 
4
+ # Load model and tokenizer
5
  model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
6
  tokenizer = AutoTokenizer.from_pretrained(model_name)
7
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
8
 
9
+ # Define the correction function
10
+ def correct_transcription(input_text):
11
+ # Add task instruction to the input
12
+ prompt = "Correct the following Yiddish transcription to groundtruth Yiddish in Hebrew script:\n"
13
+ input_ids = tokenizer(prompt + input_text, return_tensors="pt", truncation=True).input_ids
14
+ output_ids = model.generate(input_ids, max_length=512)
15
+ corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  return corrected_text
17
 
18
+ # Gradio Interface
19
+ with gr.Blocks() as interface:
20
+ gr.Markdown("### Yiddish Transcription Correction")
21
+
22
+ with gr.Row():
23
+ input_box = gr.Textbox(label="Raw Transcription (Hebrew Script)", lines=1, rtl=True, elem_id="input_box")
24
+ output_box = gr.Textbox(label="Corrected Transcription (Hebrew Script)", lines=1, rtl=True, elem_id="output_box")
25
+
26
+ submit_button = gr.Button("Correct")
27
+ submit_button.click(correct_transcription, inputs=[input_box], outputs=[output_box])
28
 
29
+ # Launch the interface
30
+ interface.launch()