|
import random |
|
from umsc import UgMultiScriptConverter |
|
import string |
|
import epitran |
|
from difflib import SequenceMatcher |
|
|
|
|
|
# Sample Uyghur prompts in Arabic script. The generate_* helpers below pick
# one of these at random.

# Short prompts: single words or two-word phrases.
short_texts = [
    "سالام",
    "رەھمەت",
    "ياخشىمۇسىز",
    "خۇش كېپسىز",
    "خەيرلىك كۈن",
    "خەير خوش",
]

# Long prompts: full sentences.
long_texts = [
    "مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
    "يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
    "بىزنىڭ ئۆيدە تۆت تەكچە، تۆتىلىسى تەك-تەكچە",
    "تۆۋەندە ئالىمنىڭ تەرجىمىھالى بىلەن تونۇشۇپ ئۆتەيلى.",
    "شېئىردىكى تۇيغۇ ئورنىنى تاپالمىغان ئىستىلىستىكىلىق ۋاسىتە كۆزگە چېلىقمايدۇ.",
]
|
|
|
|
|
# Shared converter instance, called by the generate_* helpers when the caller
# asks for "Uyghur Latin" output.
# NOTE(review): 'UAS' / 'ULS' are presumably the umsc codes for Uyghur Arabic
# Script and Uyghur Latin Script -- confirm against the umsc documentation.
ug_arab_to_latn = UgMultiScriptConverter(
    "UAS",
    "ULS",
)
|
def generate_short_text(script_choice):
    """Pick a random entry from ``short_texts`` in the requested script.

    Args:
        script_choice: Script selector. When it equals ``"Uyghur Latin"`` the
            chosen entry is passed through ``ug_arab_to_latn``; any other
            value returns the entry exactly as stored.

    Returns:
        The randomly chosen text, converted or unchanged as described above.
    """
    phrase = random.choice(short_texts)
    if script_choice != "Uyghur Latin":
        return phrase
    return ug_arab_to_latn(phrase)
|
|
|
def generate_long_text(script_choice):
    """Pick a random entry from ``long_texts`` in the requested script.

    Args:
        script_choice: Script selector. When it equals ``"Uyghur Latin"`` the
            chosen entry is passed through ``ug_arab_to_latn``; any other
            value returns the entry exactly as stored.

    Returns:
        The randomly chosen text, converted or unchanged as described above.
    """
    sentence = random.choice(long_texts)
    if script_choice != "Uyghur Latin":
        return sentence
    return ug_arab_to_latn(sentence)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_pronunciation_accuracy(reference_text, output_text, language_code='uig-Arab'):
    """Score how closely an ASR transcript matches a reference text.

    Both texts are first stripped of punctuation with ``remove_punctuation``.
    The score is the ``difflib.SequenceMatcher`` similarity ratio of the two
    cleaned strings (character level), scaled to a percentage. The IPA
    transliterations produced by Epitran are returned alongside the score;
    they are not used to compute it.

    Args:
        reference_text (str): The ground truth text in Uyghur (Arabic script).
        output_text (str): The ASR output text in Uyghur (Arabic script).
        language_code (str): Epitran language code used for the IPA
            transliteration (default is 'uig-Arab' for Uyghur).

    Returns:
        tuple: ``(reference_ipa, output_ipa, comparison_md, pronunciation_accuracy)``

        - reference_ipa (str): IPA transliteration of the cleaned reference text.
        - output_ipa (str): IPA transliteration of the cleaned output text.
        - comparison_md (str): HTML ``<div>`` holding the cleaned reference
          text split into ``<span>`` elements: blue for parts that match the
          output, black for parts that do not.
        - pronunciation_accuracy (float): Similarity ratio times 100.
    """
    ipa_converter = epitran.Epitran(language_code)

    # Punctuation is dropped before both the transliteration and the scoring.
    reference_text_clean = remove_punctuation(reference_text)
    output_text_clean = remove_punctuation(output_text)

    reference_ipa = ipa_converter.transliterate(reference_text_clean)
    output_ipa = ipa_converter.transliterate(output_text_clean)

    # The matcher runs on the cleaned text (not the IPA) so that its opcodes
    # index directly into the reference string highlighted below.
    matcher = SequenceMatcher(None, reference_text_clean, output_text_clean)
    pronunciation_accuracy = matcher.ratio() * 100

    # Only the reference side is rendered, so the output-side indices are
    # ignored. An 'insert' opcode has an empty reference slice and therefore
    # yields an empty black span, which keeps the markup stable.
    # ASCII punctuation (including <, > and &) was removed above, so the
    # segments are embedded without HTML escaping.
    spans = []
    for opcode, i1, i2, _j1, _j2 in matcher.get_opcodes():
        color = "blue" if opcode == 'equal' else "black"
        ref_segment = reference_text_clean[i1:i2]
        spans.append(f'<span style="color: {color};">{ref_segment}</span>')

    comparison_md = "<div>" + "".join(spans) + "</div>"

    return reference_ipa, output_ipa, comparison_md, pronunciation_accuracy
|
|
|
def remove_punctuation(text):
    """Return ``text`` with punctuation characters removed.

    Two groups of characters are dropped:

    * every character in ``string.punctuation`` (ASCII punctuation and
      symbols such as ``+`` or ``=``), and
    * every character whose Unicode general category starts with ``P``,
      which covers non-ASCII marks such as the Arabic comma (U+060C), the
      Arabic question mark (U+061F) and guillemets.

    Letters, digits, whitespace and format characters (for example the
    zero-width non-joiner, U+200C) are left untouched.

    Args:
        text (str): Input text.

    Returns:
        str: The text without punctuation.
    """
    # Local import keeps this helper self-contained; repeated imports are
    # served from the module cache.
    import unicodedata

    ascii_punctuation = frozenset(string.punctuation)
    return "".join(
        ch
        for ch in text
        if ch not in ascii_punctuation
        and not unicodedata.category(ch).startswith("P")
    )