Spaces:

MarineLives
/

yiddish-transcription-correction

Running

Addaci committed on Dec 9, 2024

Commit

bdbc86a

verified ·

1 Parent(s): 5ba8131

Update app.py

Added new shortened prompt provided by Gavin

Files changed (1) hide show

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 # Define the correction function
 def correct_transcription(input_text):
     # Add task instruction to the input
-    prompt = "Correct the following Yiddish transcription to groundtruth Yiddish in Hebrew script:\n"
     input_ids = tokenizer(prompt + input_text, return_tensors="pt", truncation=True).input_ids
     output_ids = model.generate(input_ids, max_length=512)
     corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

 # Define the correction function
 def correct_transcription(input_text):
     # Add task instruction to the input
+    prompt = "Correct the following raw Yiddish HTR output from Transkribus into authentic Yiddish correspondence. Keep period-specific spellings, regional variations, and Hebrew-origin words as they were historically used. Fix only clear OCR errors, such as broken words or nonsensical combinations. Retain proper nouns, place names, abbreviations, and informal language. Maintain original line breaks and formatting:\n"
     input_ids = tokenizer(prompt + input_text, return_tensors="pt", truncation=True).input_ids
     output_ids = model.generate(input_ids, max_length=512)
     corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)