diff --git "a/uroman/bin/uroman.py" "b/uroman/bin/uroman.py" new file mode 100644--- /dev/null +++ "b/uroman/bin/uroman.py" @@ -0,0 +1,2278 @@ +#!/usr/bin/env python + +""" +Written by Ulf Hermjakob, USC/ISI March-April 2024 +uroman is a universal romanizer. It converts text in any script to the Latin alphabet. +This script is a Python reimplementation of an earlier Perl script, with some improvements. +The tool has been tested on 250 languages, with 100 or more sentences each. +This script is still under development and large-scale testing. Feedback welcome. +This script provides token-size caching (for faster runtimes). +Output formats include + (1) best romanization string + (2) best romanization edges ("best path"; incl. start and end positions with respect to the original string) + (3) best romanization with alternatives (as applicable for ambiguous romanization) + (4) best romanization full lattice (all edges, including superseded sub-edges) +See below for 'sample calls' under main() +""" + + +from __future__ import annotations +import argparse +from collections import defaultdict +# from memory_profiler import profile +import datetime +from enum import Enum +from fractions import Fraction +import gc +import json +import math +import os +import pathlib +from pathlib import Path +import pstats +import regex +import sys +from typing import List, Tuple +import unicodedata as ud +PROFILE_FLAG = "--profile" # also used in argparse processing +if PROFILE_FLAG in sys.argv: + import cProfile + +# UTILITIES + + +def timer(func): + def wrapper(*args, **kwargs): + start_time = datetime.datetime.now() + print(f"Calling: {func.__name__}{args}") + print(f"Start time: {start_time:%A, %B %d, %Y at %H:%M}") + result = func(*args, **kwargs) + end_time = datetime.datetime.now() + time_diff = (end_time-start_time).total_seconds() + print(f"End time: {end_time:%A, %B %d, %Y at %H:%M}") + print(f"Duration: {time_diff} seconds") + return result + return wrapper + + +def 
def slot_value_in_double_colon_del_list(line: str, slot: str, default: str | list | None = None) -> str | list | None:
    """Return the value of slot ``::<slot>`` in a double-colon delimited line.

    For a given slot, e.g. 'cost', get its value from a line such as
    '::s1 of course ::s2 ::cost 0.3' -> '0.3'.
    The value can be an empty string, as for ::s2 in the example above.
    Returns ``default`` when the slot does not occur in the line.
    """
    m = regex.match(fr'(?:.*\s)?::{slot}(|\s+\S.*?)(?:\s+::\S.*|\s*)$', line)
    return m.group(1).strip() if m else default


def has_value_in_double_colon_del_list(line: str, slot: str) -> bool:
    """Return True if ``::<slot>`` occurs in the line (even with an empty value)."""
    # With the default of None, a str result (possibly '') means the slot was found.
    return isinstance(slot_value_in_double_colon_del_list(line, slot), str)


def dequote_string(s: str) -> str:
    """Strip one pair of matching quotes ('...', "...", “...”) plus surrounding whitespace.

    Anything that is not a string, or is not wrapped in a matching pair, is returned unchanged.
    """
    if isinstance(s, str):
        m = regex.match(r'''\s*(['"“])(.*)(['"”])\s*$''', s)
        # Only accept properly paired quotes; e.g. 'abc" is left alone.
        if m and ((m.group(1) + m.group(3)) in ("''", '""', '“”')):
            return m.group(2)
    return s


def last_chr(s: str) -> str:
    """Return the last character of s, or '' if s is empty."""
    # The original's else-branch was a bare '' expression (no return), yielding None.
    return s[-1] if s else ''


def ud_numeric(char: str) -> int | float | None:
    """Robust version of unicodedata.numeric.

    Returns an int for integral values, a float otherwise, and None if the
    argument has no numeric value or is not a single character.
    """
    try:
        num_f = ud.numeric(char)
    except (ValueError, TypeError):
        return None
    return int(num_f) if num_f.is_integer() else num_f
def robust_str_to_num(num_s: str, filename: str = None, line_number: int | None = None, silent: bool = False) \
        -> int | float | None:
    """Convert a string to an int or float (float iff it contains a '.').

    Values that already are int/float are returned unchanged. On failure, None is
    returned and, unless ``silent``, a one-line message (with optional line number
    and filename) is written to stderr.
    """
    if isinstance(num_s, (float, int)):
        return num_s
    if not isinstance(num_s, str):
        return None
    try:
        return float(num_s) if "." in num_s else int(num_s)
    except ValueError:
        if not silent:
            message = f'Cannot convert "{num_s}" to a number'
            if line_number:
                message += f' line: {line_number}'
            if filename:
                message += f' file: {filename}'
            sys.stderr.write(message + '\n')
    return None


def first_non_none(*args):
    """Return the first argument that is not None (or None if there is none)."""
    return next((arg for arg in args if arg is not None), None)


def any_not_none(*args) -> bool:
    """Return True if at least one argument is not None."""
    return any(arg is not None for arg in args)


def add_non_none_to_dict(d: dict, key: str, value) -> None:
    """Set d[key] = value unless value is None."""
    if value is not None:
        d[key] = value


def fraction_char2fraction(fraction_char: str, fraction_value: float | None = None,
                           uroman: Uroman | None = None) -> Fraction | None:
    """Map a fraction character such as '½' to Fraction(1, 2).

    First tries the Unicode decomposition (e.g. '<fraction> 0031 2044 0032');
    if that does not yield numerator⁄denominator and both ``uroman`` and a
    truthy ``fraction_value`` are given, falls back to
    uroman.unicode_float2fraction(fraction_value). Returns None otherwise.
    """
    decomposed = ''
    for ud_decomp_elem in ud.decomposition(fraction_char).split():
        # Skip formatting tags such as '<fraction>'; the original kept them, so its
        # anchored regex match against '<fraction>1⁄2' could never succeed.
        if ud_decomp_elem.startswith('<'):
            continue
        try:
            decomposed += chr(int(ud_decomp_elem, 16))
        except ValueError:
            decomposed += ud_decomp_elem
    fraction = None
    numerator_s, slash, denominator_s = decomposed.partition('⁄')  # U+2044 FRACTION SLASH
    if slash and numerator_s.isdecimal() and denominator_s.isdecimal():
        try:
            fraction = Fraction(int(numerator_s), int(denominator_s))
        except (ValueError, ZeroDivisionError):
            fraction = None
    if (fraction is None) and uroman and fraction_value:
        if num_denom := uroman.unicode_float2fraction(fraction_value):
            try:
                fraction = Fraction(num_denom[0], num_denom[1])
            except (ValueError, ZeroDivisionError):
                fraction = None
    return fraction


def chr_name(char: str) -> str:
    """robust version of ud.name; see related Uroman.char_name() that includes names not included in UnicodeData.txt"""
    try:
        return ud.name(char)
    except (ValueError, TypeError):
        return ''


def args_get(key: str, args: argparse.Namespace | None = None):
    """Return the value of ``key`` in an argparse namespace, or None if args or key is missing."""
    return vars(args)[key] if args and (key in args) else None
class DictClass:
    """Lightweight attribute bag with dict-style read access.

    Keyword arguments become entries of the instance ``__dict__``; underscores in
    the keyword are replaced by hyphens (``script_name`` -> ``'script-name'``).
    Values equal to None, [] or False are not stored.
    """

    def __init__(self, **kw_args):
        for kw_arg, value in kw_args.items():
            # NOTE(review): the membership test compares with ==, so 0 and 0.0
            # (== False) are dropped as well, e.g. num=0. Kept for compatibility.
            if value in (None, [], False):
                continue
            self.__dict__[kw_arg.replace('_', '-')] = value

    def __repr__(self):
        return str(self.__dict__)

    def __getitem__(self, key, default=None):
        # Unlike a dict, a missing key yields the default (None) instead of KeyError.
        return self.__dict__.get(key, default)

    def get(self, key, default=None):
        """dict-like accessor; same result as obj[key], with an optional default."""
        return self.__dict__.get(key, default)

    def __bool__(self):
        # An instance with no stored entries is falsy.
        return len(self.__dict__) > 0


class RomRule(DictClass):
    # key: source string
    # typical attributes: s (source), t (target), prov (provenance), lcodes (language codes)
    #     t_alts=t_alts (target alternatives), use_only_at_start_of_word, dont_use_at_start_of_word,
    #     use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word
    pass


class Script(DictClass):
    # key: lower case script_name
    # typical attributes: script_name, direction, abugida_default_vowels, alt_script_names, languages
    pass


class RomFormat(Enum):
    """Output format of romanization"""
    STR = 'str'          # simple string
    EDGES = 'edges'      # list of edges (includes character offsets in original string)
    ALTS = 'alts'        # lattice including alternative edges
    LATTICE = 'lattice'  # lattice including alternative and superseded edges

    def __str__(self):
        return self.value
def __init__(self, data_dir: Path, **args):  # args: load_log, rebuild_ud_props, rebuild_num_props
    """Create the romanizer's lookup tables and fill them from the resource files in data_dir.

    data_dir: directory holding the resource files (passed on to load_resource_files()).
    **args:   optional flags load_log, rebuild_ud_props, rebuild_num_props (all default False).
    """
    self.data_dir = data_dir
    self.rom_rules = defaultdict(list)
    self.scripts = defaultdict(Script)
    self.dict_bool = defaultdict(bool)
    self.dict_str = defaultdict(str)
    self.dict_int = defaultdict(int)
    self.dict_num = defaultdict(lambda: None)  # values are int (most common), float, or str ("1/2")
    # num_props key: txt
    #           values: {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
    self.num_props = defaultdict(dict)
    self.dict_set = defaultdict(set)
    self.float2fraction = {}  # caching
    # Garbage collection is switched off while the resource files are loaded
    # (presumably to speed up the bulk allocation). The try/finally guarantees it
    # is restored even if loading raises, and a caller that had GC disabled
    # beforehand keeps it disabled.
    gc_was_enabled = gc.isenabled()
    gc.disable()
    try:
        self.load_resource_files(data_dir, args.get('load_log', False),
                                 args.get('rebuild_ud_props', False),
                                 args.get('rebuild_num_props', False))
    finally:
        if gc_was_enabled:
            gc.enable()
    self.hangul_rom = {}
    self.rom_cache = {}  # key: (s, lcode)  value: t
    self.stats = defaultdict(int)  # stats, e.g. for unprocessed numbers
    self.abugida_cache = {}  # key: (script, char_rom)  value: (base_rom, base_rom_plus_abugida_vowel, modified rom)
def second_rom_filter(self, c: str, rom: str, name: str | None) -> Tuple[str | None, str]:
    """Post-filter for romanizations that contain a space.

    Returns (replacement, name). The replacement is
      - the last token of rom for Kayah vowel signs and Mende Kikakui syllables,
      - the character c itself if rom still consists of several tokens,
      - None otherwise (including when rom is empty or has no space).
    The character name is looked up via self.chr_name(c) only if it is needed and was not supplied.
    Much of this logic is meant to eventually move into the generation of cleaner primary data.
    """
    if not rom or ' ' not in rom:
        return None, name
    char_name = self.chr_name(c) if name is None else name
    last_token_patterns = (
        ("MYANMAR VOWEL SIGN KAYAH", r'kayah\s+(\S+)\s*$'),
        ("MENDE KIKAKUI SYLLABLE", r'm\d+\s+(\S+)\s*$'),
    )
    for name_marker, pattern in last_token_patterns:
        if name_marker in char_name:
            hit = regex.search(pattern, rom)
            if hit:
                return hit.group(1), char_name
    if regex.search(r'\S\s+\S', rom):
        return c, char_name
    return None, char_name
self.dict_str[('pic', s)] = pic + if tone_mark := slot_value_in_double_colon_del_list(line, 'tone-mark'): + self.dict_str[('tone-mark', s)] = tone_mark + if syllable_info := slot_value_in_double_colon_del_list(line, 'syllable-info'): + self.dict_str[('syllable-info', s)] = syllable_info + else: + s = dequote_string(slot_value_in_double_colon_del_list(line, 's')) + t = dequote_string(slot_value_in_double_colon_del_list(line, 't')) + t_at_end_of_syllable = dequote_string(slot_value_in_double_colon_del_list(line, + 't-end-of-syllable')) + if (num_s := slot_value_in_double_colon_del_list(line, 'num')) is not None: + num = robust_str_to_num(num_s) + self.dict_num[s] = (num_s if (num is None) else num) + is_minus_sign = has_value_in_double_colon_del_list(line, 'is-minus-sign') + is_plus_sign = has_value_in_double_colon_del_list(line, 'is-plus-sign') + is_decimal_point = has_value_in_double_colon_del_list(line, 'is-decimal-point') + is_large_power = has_value_in_double_colon_del_list(line, 'is-large-power') + fraction_connector = slot_value_in_double_colon_del_list(line, 'fraction-connector') + percentage_marker = slot_value_in_double_colon_del_list(line, 'percentage-marker') + int_frac_connector = slot_value_in_double_colon_del_list(line, 'int-frac-connector') + lcode_s = slot_value_in_double_colon_del_list(line, 'lcode') + lcodes = regex.split(r'[,;]\s*', lcode_s) if lcode_s else [] + use_only_at_start_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-start-of-word') + dont_use_at_start_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-start-of-word') + use_only_at_end_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-end-of-word') + dont_use_at_end_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-end-of-word') + use_only_for_whole_word = has_value_in_double_colon_del_list(line, 'use-only-for-whole-word') + num_s = slot_value_in_double_colon_del_list(line, 'num') + num = robust_str_to_num(num_s, filename, 
line_number, silent=False) + t_alt_s = slot_value_in_double_colon_del_list(line, 't-alt') + t_alts = regex.split(r'[,;]\s*', t_alt_s) if t_alt_s else [] + t_alts = list(map(dequote_string, t_alts)) + t_mod, name2 = self.second_rom_filter(s, t, None) + if t_mod and (t_mod != t): + if t != s: + pass # sys.stderr.write(f'UPDATE: {s} {name2} {t} -> {t_mod}\n') + t = t_mod + if s is not None: + for bool_key in ('is-large-power', 'is-minus-sign', 'is-plus-sign', 'is-decimal-point'): + bool_value = eval(bool_key.replace('-', '_')) + if bool_value: + self.dict_bool[(bool_key, s)] = True + if any_not_none(t, num, is_minus_sign, is_plus_sign, is_decimal_point, is_large_power, + fraction_connector, percentage_marker, int_frac_connector): + self.register_s_prefix(s) + n_entries += 1 + # if regex.match(r'[\u2800-\u28FF]', s): print("Braille", s, t) + restrictions = [lcodes, use_only_at_start_of_word, dont_use_at_start_of_word, + use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word] + n_restrictions = len([restr for restr in restrictions if restr]) + provenance2 = provenance + if (t is None) and (num is not None) and (provenance2 == "rom"): + provenance2 = "num" + new_rom_rule = RomRule(s=s, t=t, prov=provenance2, lcodes=lcodes, t_alts=t_alts, num=num, + use_only_at_start_of_word=use_only_at_start_of_word, + dont_use_at_start_of_word=dont_use_at_start_of_word, + use_only_at_end_of_word=use_only_at_end_of_word, + dont_use_at_end_of_word=dont_use_at_end_of_word, + use_only_for_whole_word=use_only_for_whole_word, + t_at_end_of_syllable=t_at_end_of_syllable, + n_restr=n_restrictions, + is_minus_sign=is_minus_sign, + is_plus_sign=is_plus_sign, + is_decimal_point=is_decimal_point, + fraction_connector=fraction_connector, + percentage_marker=percentage_marker, + int_frac_connector=int_frac_connector, + is_large_power=is_large_power) + old_rom_rules = self.rom_rules[s] + if ((len(old_rom_rules) == 1) and (old_rom_rules[0]['prov'] in ('ud', 'ow')) + and not (lcodes 
def load_script_file(self, filename: str, load_log: bool = True):
    """Reads in (typically from Scripts.txt) information about various scripts such as Devanagari,
    incl. information such as the default abugida vowel letter (e.g. "a").

    Each non-comment line with a ::script-name slot creates one Script entry, registered in
    self.scripts under the lower-cased script name and under each lower-cased alternative name.
    Also fills self.dict_set[('scripts', language)] and self.dict_int['max_n_script_name_components'].
    A missing file is reported on stderr and otherwise ignored.
    """
    n_entries, max_n_script_name_components = 0, 0
    try:
        # Explicit encoding: the resource files contain non-ASCII text.
        f = open(filename, encoding='utf-8')
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)  # strip trailing comment
            script_name = slot_value_in_double_colon_del_list(line, 'script-name')
            if not script_name:
                continue
            lc_script_name = script_name.lower()
            if lc_script_name in self.scripts:
                sys.stderr.write(f'** Ignoring duplicate script "{script_name}" '
                                 f'in line {line_number} of {filename}\n')
                continue
            n_entries += 1
            direction = slot_value_in_double_colon_del_list(line, 'direction')
            abugida_default_vowel_s = slot_value_in_double_colon_del_list(line, 'abugida-default-vowel')
            abugida_default_vowels = regex.split(r'[,;]\s*', abugida_default_vowel_s) \
                if abugida_default_vowel_s else []
            alt_script_name_s = slot_value_in_double_colon_del_list(line, 'alt-script-name')
            alt_script_names = regex.split(r'[,;]\s*', alt_script_name_s) if alt_script_name_s else []
            language_s = slot_value_in_double_colon_del_list(line, 'language')
            languages = regex.split(r'[,;]\s*', language_s) if language_s else []
            new_script = Script(script_name=script_name, alt_script_names=alt_script_names,
                                languages=languages, direction=direction,
                                abugida_default_vowels=abugida_default_vowels)
            self.scripts[lc_script_name] = new_script
            for language in languages:
                self.dict_set[('scripts', language)].add(script_name)
            for alt_script_name in alt_script_names:
                lc_alt_script_name = alt_script_name.lower()
                if lc_alt_script_name in self.scripts:
                    # Report the clashing alternative name itself (the original printed script_name).
                    sys.stderr.write(f'** Ignoring duplicate alternative script name "{alt_script_name}" '
                                     f'in line {line_number} of {filename}\n')
                else:
                    self.scripts[lc_alt_script_name] = new_script
            n_script_name_components = len(script_name.split())
            if n_script_name_components > max_n_script_name_components:
                max_n_script_name_components = n_script_name_components
    if max_n_script_name_components:
        self.dict_int['max_n_script_name_components'] = max_n_script_name_components
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}'
                         f' (max_n_scripts_name_components: {max_n_script_name_components})\n')
def load_num_props(self, filename: str, load_log: bool = True):
    """Loads numeric character properties from a JSON-lines file (typically NumProps.jsonl).

    Each data line is a JSON dict such as
    {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
    It is stored in self.num_props under its "txt" value; a true "is-large-power" entry is
    additionally recorded in self.dict_bool[('is-large-power', txt)].
    Comment lines (starting with '#') and blank lines are skipped. Lines that are not valid
    JSON, are not a dict, or lack "txt" are reported on stderr and skipped.
    A missing file is reported on stderr and otherwise ignored.
    """
    n_entries = 0
    try:
        f = open(filename, encoding='utf-8')
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#') or not line.strip():  # comment or blank line
                continue
            try:
                d = json.loads(line)
            except json.JSONDecodeError:
                # One corrupt line should not abort loading of the remaining entries.
                sys.stderr.write(f'Invalid json in l.{line_number} in file {filename}: {line.strip()}\n')
                continue
            if not isinstance(d, dict):
                sys.stderr.write(f'json in l.{line_number} in file {filename} not a dict: {line.strip()}\n')
                continue
            txt = d.get('txt')
            if not txt:
                sys.stderr.write(f'Missing txt in l.{line_number} in file {filename}: {line.strip()}\n')
                # Without a key there is nothing to register (the original went on
                # to set dict_bool[(bool_key, None)]).
                continue
            self.num_props[txt] = d
            n_entries += 1
            for bool_key in ('is-large-power',):
                if d.get(bool_key):
                    self.dict_bool[(bool_key, txt)] = True
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')
@staticmethod
def de_accent_pinyin(s: str) -> str:
    """De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt).

    Characters with a Unicode decomposition are replaced by the single letter in that
    decomposition; if the decomposition cannot be decoded or does not contain exactly
    one letter, a message is written to stderr and the character is dropped.
    """
    result = ''
    for char in s:
        decomp = ud.decomposition(char).split()
        if not decomp:
            result += char
            continue
        try:
            decomp_chars = [chr(int(x, 16)) for x in decomp]
            letters = [x for x in decomp_chars if ud.category(x).startswith('L')]
        except ValueError:  # e.g. a tag such as <compat> in the decomposition
            sys.stderr.write(f'Cannot decode {decomp}\n')
            continue
        if len(letters) == 1:
            result += letters[0]
        else:
            sys.stderr.write(f'Cannot decode {decomp} (expected 1 letter)\n')
    # Doubly accented letters such as "ǘ" decompose to "ü" plus an accent, so "ü" can remain.
    return result.replace('ü', 'u')


def register_s_prefix(self, s: str):
    """Mark every non-empty prefix of s (including s itself) in self.dict_bool[('s-prefix', prefix)]."""
    for prefix_len in range(1, len(s) + 1):
        self.dict_bool[('s-prefix', s[:prefix_len])] = True


def load_chinese_pinyin_file(self, filename: str, load_log: bool = True):
    """Loads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form.

    Each data line holds two whitespace-separated fields: the Chinese text and its pinyin.
    The de-accented pinyin is appended as a RomRule (prov='rom pinyin') to self.rom_rules.
    Lines with any other number of fields are reported on stderr and skipped.
    A missing file is reported on stderr and otherwise ignored.
    """
    n_entries = 0
    try:
        # The file consists of Chinese text; do not depend on the platform default encoding.
        f = open(filename, encoding='utf-8')
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#') or not line.strip():  # comment or blank line
                continue
            try:
                chinese, pinyin = line.rstrip().split()
                rom = self.de_accent_pinyin(pinyin)
            except ValueError:
                sys.stderr.write(f'Cannot process line {line_number} in file {filename}: {line}')
            else:
                new_rom_rule = RomRule(s=chinese, t=rom, prov='rom pinyin', lcodes=[])
                self.rom_rules[chinese].append(new_rom_rule)
                self.register_s_prefix(chinese)
                n_entries += 1
    if load_log:
        # The original message said "script descriptions" (copied from load_script_file).
        sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')


@staticmethod
def add_char_to_rebuild_unicode_data_dict(d: dict, script_name: str, prop_class: str, char: str):
    """Record char under d[(script_name, prop_class)] and script_name in the set d['script-names']."""
    d['script-names'].add(script_name)
    d.setdefault((script_name, prop_class), []).append(char)
str = None): + """This functions rebuilds UnicodeDataProps*.txt This might be useful when a new UnicodeData.txt + version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt + Regular users normally never have to call this function.""" + d = {'script-names': set()} + n_script_refs = 0 + codepoint = -1 + prop_classes = {'char'} + while codepoint < 0xF0000: + codepoint += 1 + c = chr(codepoint) + if not (char_name := self.chr_name(c)): + continue + for prop_name_comp2 in ('VOWEL SIGN', + ('MEDIAL CONSONANT SIGN', 'CONSONANT SIGN MEDIAL', 'CONSONANT SIGN SHAN MEDIAL', + 'CONSONANT SIGN MON MEDIAL'), + ('SIGN VIRAMA', 'SIGN ASAT', 'AL-LAKUNA', 'SIGN COENG', 'SIGN PAMAAEH', + 'CHARACTER PHINTHU'), + ('NUMERAL', 'NUMBER', 'DIGIT', 'FRACTION')): + if prop_name_comp2 and isinstance(prop_name_comp2, tuple): + prop_list = prop_name_comp2 + else: + prop_list = (prop_name_comp2,) + for prop_name_comp in prop_list: + prop_class = prop_list[0].lower().replace(' ', '-') + if prop_class not in prop_classes: + prop_classes.add(prop_class) + script_name_cand = regex.sub(fr'\s+{prop_name_comp}\b.*$', '', char_name) + if script_name := self.extract_script_name(script_name_cand, char_name): + self.add_char_to_rebuild_unicode_data_dict(d, script_name, prop_class, c) + script_name_cand = regex.sub(r'\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|' + r'IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|' + r'SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|' + r'AU LENGTH MARK)\b.*$', '', + char_name) + if script_name := self.extract_script_name(script_name_cand, char_name): + self.add_char_to_rebuild_unicode_data_dict(d, script_name, 'char', c) + n_script_refs += 1 + # print(sorted(d['script-names'])) + prop_classes = sorted(prop_classes) + out_filenames = [x for x in [out_filename, cjk, hangul] if x] + cjk2 = cjk if cjk else out_filename + hangul2 = hangul if hangul else out_filename + for out_file in 
out_filenames: + try: + f_out = open(out_file, 'w') + except OSError: + sys.stderr.write(f'Cannot write to file {out_file}\n') + continue + with f_out: + for script_name in sorted(d['script-names']): + if script_name == 'CJK': + if out_file != cjk2: + continue + elif script_name == 'Hangul': + if out_file != hangul2: + continue + else: + if out_file != out_filename: + continue + prop_components = [f"::script-name {script_name}"] + for prop_class in prop_classes: + key = (script_name, prop_class) + if key in d: + if chars := ''.join(d[key]): + if prop_class in ('char',): + prop_components.append(f"::n-{prop_class} {len(chars)}") + prop_components.append(f"::{prop_class} {chars}") + f_out.write(f"{' '.join(prop_components)}\n") + sys.stderr.write(f"Rebuilt {out_filenames} with {n_script_refs} characters " + f"for {len(d['script-names'])} scripts.\n") + + def rebuild_num_props(self, out_filename: str, err_filename: str): + n_out, n_err = 0, 0 + with open(out_filename, 'w') as f_out, open(err_filename, 'w') as f_err: + codepoint = -1 + while codepoint < 0xF0000: + codepoint += 1 + char = chr(codepoint) + num = first_non_none(ud_numeric(char), # robust ud.numeric + self.num_value(char)) # uroman table includes extra num values, e.g. for Egyptian + if num is None: + continue + result_dict = {} + orig_txt = char + value: int | float | None = None # non-fraction-value(3 1/2) = 3 + fraction: Fraction | None = None # fraction(3 1/2) = Fraction(1, 2) + num_base = None # num_base(500) = 100 + base_multiplier = None # base_multiplier(500) = 5 + script = None + is_large_power = self.dict_bool[('is-large-power', char)] + # num_base is typically a power of 10: 1, 10, 100, 1000, 10000, 100000, 1000000, ... 
+ # exceptions might include 12 for the 'dozen' in popular English 'two dozen and one' (2*12+1=25) + # exceptions might include 20 for the 'score' in archaic English 'four score and seven' (4*20+7=87) + # exceptions might include 20 for the 'vingt' as in standard French 'quatre-vingt-treize' (4*20+13=93) + if script_name := self.chr_script_name(char): + script = script_name + elif char in '0123456789': + script = 'ascii-digit' + name = self.chr_name(char) + exclude_from_number_processing = False + for scrypt_type in ('SUPERSCRIPT', 'SUBSCRIPT', + 'CIRCLED', 'PARENTHESIZED', 'SEGMENTED', 'MATHEMATICAL', 'ROMAN NUMERAL', + 'FULL STOP', 'COMMA'): + if scrypt_type in name: + script = '*' + scrypt_type.lower().replace(' ', '-') + exclude_from_number_processing = True + break + for scrypt_type in ('VULGAR FRACTION',): + if scrypt_type in name: + script = scrypt_type.lower().replace(' ', '-') + break + if exclude_from_number_processing: + continue + if isinstance(num, int): + value = num + if 0 <= num <= 9: + num_base = 1 + base_multiplier = num + if "DIGIT" in name: + num_type = 'digit' + else: + # Chinese numbers 零 (0), 一 (1), ... 九 (9) have numeric values, + # but are NOT (full) digits + num_type = 'digit-like' + elif m := regex.match(r'([0-9]+?)(0*)$', str(num)): + base_multiplier = int(m.group(1)) # non_base_value(500) = 5 + num_base = int('1' + m.group(2)) + num_type = 'base' if base_multiplier == 1 else 'multi' + else: + num_type = 'other-int' # Do such cases exist? + elif ("FRACTION" in name) and (fraction := fraction_char2fraction(char, num, self)): + fraction = fraction + num_type = 'fraction' + else: + num_type = 'other-num' # Do such cases exist? Yes. Bengali currency numerators, ... 
+ value_s = '' if value is None else str(value) + fraction_s = '' if fraction is None else f'{fraction.numerator}/{fraction.denominator}' + fraction_list = None if fraction is None else [fraction.numerator, fraction.denominator] + delimiter_s = ' ' if value_s and fraction_s else '' + rom = (value_s + delimiter_s + fraction_s) or orig_txt + add_non_none_to_dict(result_dict, 'txt', orig_txt) + add_non_none_to_dict(result_dict, 'rom', rom) + add_non_none_to_dict(result_dict, 'value', value) + add_non_none_to_dict(result_dict, 'fraction', fraction_list) + add_non_none_to_dict(result_dict, 'type', num_type) + if is_large_power: + result_dict['is-large-power'] = True + add_non_none_to_dict(result_dict, 'base', num_base) + add_non_none_to_dict(result_dict, 'mult', base_multiplier) + add_non_none_to_dict(result_dict, 'script', script) + if num_type.startswith('other'): + add_non_none_to_dict(result_dict, 'name', name) + f_err.write(json.dumps(result_dict) + '\n') + n_err += 1 + else: + if not script: + add_non_none_to_dict(result_dict, 'name', name) + f_out.write(json.dumps(result_dict) + '\n') + n_out += 1 + sys.stderr.write(f'Processed {codepoint} codepoints,\n wrote {n_out} lines to {out_filename}\n' + f' and {n_err} lines to {err_filename}\n') + + def load_resource_files(self, data_dir: Path, load_log: bool = False, + rebuild_ud_props: bool = False, rebuild_num_props: bool = False): + """Loads all resource files needed for romanization.""" + data_dir = data_dir + if not isinstance(data_dir, pathlib.Path): + sys.stderr.write(f'Error: data_dir is of {type(data_dir)}, not a Path.\n' + f' Cannot load any resource files.\n') + return + self.load_rom_file(os.path.join(data_dir, "romanization-auto-table.txt"), + 'ud', file_format='rom', load_log=load_log) + self.load_rom_file(os.path.join(data_dir, "UnicodeDataOverwrite.txt"), + 'ow', file_format='u2r', load_log=load_log) + self.load_rom_file(os.path.join(data_dir, "romanization-table.txt"), + 'man', file_format='rom', 
def unicode_hangul_romanization(self, s: str, pass_through_p: bool = False):
    """Special algorithmic solution to convert (Korean) Hangul characters to the Latin alphabet.

    Each precomposed Hangul syllable (U+AC00..U+D7A3) is split arithmetically into
    lead consonant (19 values), vowel (21) and tail consonant (28) and the three
    romanizations are concatenated; '-' stands for "no sound" and is removed.
    Other characters are copied to the output if pass_through_p is True and dropped otherwise.
    Results are cached per character in self.hangul_rom.
    """
    leads = "g gg n d dd r m b bb s ss - j jj c k t p h".split()
    vowels = "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i".split()
    tails = "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h".split()
    rom_parts = []
    for c in s:
        # Look up the cache per character: entries are stored per character, so the
        # original whole-string lookup could only ever hit for one-character input.
        if cached_rom := self.hangul_rom.get(c):
            rom_parts.append(cached_rom)
            continue
        cp = ord(c)
        if 0xAC00 <= cp <= 0xD7A3:
            # code = (lead_index * 21 + vowel_index) * 28 + tail_index
            lead_index, rest = divmod(cp - 0xAC00, 28 * 21)
            vowel_index, tail_index = divmod(rest, 28)
            rom = (leads[lead_index] + vowels[vowel_index] + tails[tail_index]).replace('-', '')
            self.hangul_rom[c] = rom
            rom_parts.append(rom)
        elif pass_through_p:
            rom_parts.append(c)
    return ''.join(rom_parts)
def unicode_float2fraction(self, num: float, precision: float = 0.000001) -> Tuple[int, int] | None:
    """Map a float to a (numerator, denominator) pair; only for common Unicode fractions.

    Tries numerators 1..11 against a fixed list of denominators and returns the first
    pair whose quotient is within precision of num. The pair is not reduced
    (e.g. 0.3 yields (6, 20)). Returns None if no pair matches.
    Matches are cached in self.float2fraction.
    """
    # Key the cache by precision as well, so that a match found with a loose precision
    # is never returned for a later, stricter call. Calls with the default precision
    # keep the plain num key, so existing cache entries stay valid.
    cache_key = num if precision == 0.000001 else (num, precision)
    if cached_value := self.float2fraction.get(cache_key, None):
        return cached_value
    for numerator in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11):
        for denominator in (2, 3, 4, 5, 6, 8, 12, 16, 20, 32, 40, 64, 80, 160, 320):
            if abs(numerator / denominator - num) < precision:
                result = numerator, denominator
                self.float2fraction[cache_key] = result
                return result
    return None
("Oriya", "Chinese"): + d = self.scripts[s.lower()] + output += f'SCRIPT {s} {d}\n' + for s in ('ƿ', 'β', 'и', 'μπ', '⠹', '亿', 'ちょ', 'и', '𓍧', '正', '分之', 'ऽ', 'ศ', 'ด์'): + d = self.rom_rules[s] + output += f'DICT {s} {d}\n' + for s in ('ƿ', 'β', 'न', 'ु'): + output += f'SCRIPT-NAME {s} {self.chr_script_name(s)}\n' + for s in ('万', '\uF8F7', '\U00013368', '\U0001308B', '\u0E48', '\u0E40'): + name = self.chr_name(s) + num = self.dict_num[s] + pic = self.dict_str[('pic', s)] + tone_mark = self.dict_str[('tone-mark', s)] + syllable_info = self.dict_str[('syllable-info', s)] + is_large_power = self.dict_bool[('is-large-power', s)] + output += f'PROPS {s}' + if name: + output += f' name: {name}' + if num: + output += f' num: {num} ({type(num).__name__})' + if pic: + output += f' pic: {pic}' + if tone_mark: + output += f' tone-mark: {tone_mark}' + if syllable_info: + output += f' syllable-info: {syllable_info}' + if is_large_power: + output += f' is-large-power: {is_large_power}' + output += '\n' + mayan12 = '\U0001D2EC' + egyptian600 = '𓍧' + runic90 = '𐍁' + klingon2 = '\uF8F2' + for offset, c in enumerate(f'9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵{runic90}{mayan12}{egyptian600}{klingon2}'): + output += f'NUM-EDGE: {NumEdge(offset, offset+1, c, self)}\n' + for s in ('\u00bc', '\u0968'): + output += f'NUM-PROPS: {self.num_props[s]}\n' + print(output) + + def test_romanization(self, **args): + """A few full cases of romanization testing.""" + tests = [('ألاسكا', None), ('यह एक अच्छा अनुवाद है.', 'hin'), ('ちょっとまってください', 'kor'), + ('Μπανγκαλόρ', 'ell'), ('Зеленський', 'ukr'), ('കേരളം', 'mal')] + for test in tests: + s = test[0] + lcode = test[1] if len(test) >= 2 else None + rom = self.romanize_string(s, lcode=lcode, **args) + sys.stderr.write(f'ROM {s} -> {rom}\n') + n_alerts = 0 + codepoint = -1 + while codepoint < 0xF0000: + codepoint += 1 + c = chr(codepoint) + rom = self.romanize_string(c) + if regex.search(r'\s', rom) and regex.search(r'\S', rom): + name = self.chr_name(c) + 
def romanize_file(self, input_filename: str | None = None, output_filename: str | None = None,
                  lcode: str | None = None, direct_input: List[str] = None, **args):
    """Apply romanization to an entire file (or to stdin, or to a list of lines).

    Input comes from direct_input (a list of lines) if given and input_filename is None,
    else from the file input_filename, else from stdin. Output goes to output_filename,
    or to stdout if that is None. Language code (lcode) recommended.
    A line of the form '::lcode xyz <text>' is romanized with language code xyz and the
    '::lcode xyz' information is carried over to the output line.
    Remaining keyword arguments (e.g. rom_format, max_lines, silent) are read here
    and/or passed on to romanize_string.
    Problems opening files are reported on stderr; nothing is raised for them.
    """
    f_in, f_out = None, None
    f_in_to_be_closed, f_out_to_be_closed = False, False
    try:
        if direct_input and (input_filename is None):
            f_in = direct_input  # list of lines
        elif isinstance(input_filename, str):
            try:
                # Explicit UTF-8: the platform default encoding cannot be relied on
                # for text in arbitrary scripts.
                f_in = open(input_filename, encoding='utf-8')
                f_in_to_be_closed = True
            except OSError:
                sys.stderr.write(f'Error in romanize_file: Cannot open file {input_filename}\n')
                f_in = None
        elif input_filename is None:
            f_in = sys.stdin
        else:
            sys.stderr.write(f"Error in romanize_file: argument 'input_filename' {input_filename} "
                             f"is of wrong type: {type(input_filename)} (should be str)\n")
            f_in = None
        if isinstance(output_filename, str):
            try:
                f_out = open(str(output_filename), 'w', encoding='utf-8')
                f_out_to_be_closed = True
            except OSError:
                sys.stderr.write(f'Error in romanize_file: Cannot write to file {output_filename}\n')
                f_out = None
        elif output_filename is None:
            f_out = sys.stdout
        else:
            sys.stderr.write(f"Error in romanize_file: argument 'output_filename' {output_filename} "
                             f"is of wrong type: {type(output_filename)} (should be str)\n")
            f_out = None
        if f_in and f_out:
            max_lines = args.get('max_lines', None)
            progress_dots_output = False
            for line_number, line in enumerate(f_in, 1):
                if m := regex.match(r'(::lcode\s+)([a-z]{3})(\s+)(.*?)\s*$', line):
                    lcode_kw, lcode2, space, snt = m.group(1, 2, 3, 4)
                    # Romanize once and branch on what actually came back. Testing
                    # args['rom_format'] is wrong when the argument is absent, because
                    # romanize_string then defaults to string output.
                    rom_result = self.romanize_string(snt, lcode2 or lcode, **args)
                    if isinstance(rom_result, str):
                        lcode_prefix = f"{lcode_kw}{lcode2}{space}"
                        f_out.write(lcode_prefix + rom_result + '\n')
                    else:
                        lcode_prefix = f'[0, 0, "", "lcode: {lcode2}"]'  # meta edge with lcode info
                        prefixed_edges = [lcode_prefix] + list(rom_result)
                        f_out.write(Edge.json_str(prefixed_edges) + '\n')
                else:
                    f_out.write(Edge.json_str(self.romanize_string(line.rstrip(), lcode, **args)) + '\n')
                if not args.get('silent'):
                    # Progress indicator on stderr: a dot every 100 lines, the line count every 1000.
                    if line_number % 100 == 0:
                        if line_number % 1000 == 0:
                            sys.stderr.write(str(line_number))
                        else:
                            sys.stderr.write('.')
                        progress_dots_output = True
                        sys.stderr.flush()
                        gc.collect()
                if max_lines and line_number >= max_lines:
                    break
            if progress_dots_output:
                sys.stderr.write('\n')
                sys.stderr.flush()
    finally:
        # Close only what was opened here (never stdin/stdout), even if romanization raised.
        if f_in_to_be_closed:
            f_in.close()
        if f_out_to_be_closed:
            f_out.close()
class Edge:
    """This class defines edges that span part of a sentence with a specific romanization.
    There might be multiple edges for a given span. The edges in turn are part of the
    romanization lattice."""
    def __init__(self, start: int, end: int, s: str, annotation: str | None = None):
        self.start = start      # start offset of the span
        self.end = end          # end offset of the span
        self.txt = s            # text carried by the edge
        self.type = annotation  # annotation for the edge; may be None

    def __str__(self):
        return f'[{self.start}-{self.end}] {self.txt} ({self.type})'

    def __repr__(self):
        return str(self)

    def json(self) -> str:  # start - end - text - annotation
        """Return the edge as a JSON list [start, end, text, annotation]."""
        return json.dumps([self.start, self.end, self.txt, self.type])

    @staticmethod
    def json_str(rom_result: List[Edge] | str) -> str:
        """Serialize a romanization result.

        A string result is returned unchanged. A list is rendered as a JSON list: Edge
        elements via Edge.json(), any other element (e.g. an already serialized meta
        edge) via str(). Elements are separated by ', ' so that the output is valid JSON.
        """
        if isinstance(rom_result, str):
            return rom_result
        parts = [edge.json() if isinstance(edge, Edge) else str(edge) for edge in rom_result]
        return '[' + ', '.join(parts) + ']'


class NumEdge(Edge):
    def __init__(self, start: int, end: int, s: str, uroman: Uroman | None, active: bool = False):
        """For NumEdge, the s argument is in original language (not yet romanized).

        For a single-character span, numeric properties are looked up in uroman.num_props;
        if found, the edge becomes active and its romanization is updated accordingly.
        If uroman is None, no lookup is made and the edge keeps s as its text."""
        # For speed, much of this processing should at some point be cached in data files.
        Edge.__init__(self, start, end, s)
        self.orig_txt, self.txt = s, s
        self.value, self.fraction, self.num_base, self.base_multiplier = None, None, None, None
        self.type, self.script, self.is_large_power, self.active = None, None, False, active
        self.n_decimals = None
        self.value_s = None  # precision for 3.14159265358979323846264338327950288419716939937510582097494
        # uroman may be None according to the signature, so guard before dereferencing it.
        if (start+1 == end) and s and (uroman is not None):
            char = s[0]
            if d := uroman.num_props.get(char):
                self.active = True
                self.value = d.get('value')
                fraction_list = d.get('fraction')
                self.fraction = Fraction(fraction_list[0], fraction_list[1]) if fraction_list else None
                self.num_base = d.get('base')
                self.base_multiplier = d.get('mult')
                self.type = d.get('type')
                self.script = d.get('script')
                # Keep this a bool even if the key is absent.
                self.is_large_power = bool(d.get('is-large-power'))
                self.update()

    def update(self,
               value: int | float | None = None,
               value_s: str | None = None,
               fraction: Fraction | None = None,
               n_decimals: int | None = None,
               num_base: int | None = None,
               base_multiplier: int | float | None = None,
               script: str | None = None,
               e_type: str | None = None,
               orig_txt: str | None = None) -> str:
        """Overwrite the attributes for which a non-None argument is given, then recompute
        and return self.txt from value (or value_s) and fraction, e.g. '2 1/2'.
        If neither value nor fraction is set, self.txt falls back to self.orig_txt."""
        self.value = first_non_none(value, self.value)
        self.value_s = first_non_none(value_s, self.value_s)
        self.fraction = first_non_none(fraction, self.fraction)
        self.n_decimals = first_non_none(n_decimals, self.n_decimals)
        self.num_base = first_non_none(num_base, self.num_base)
        self.base_multiplier = first_non_none(base_multiplier, self.base_multiplier)
        self.script = first_non_none(script, self.script)
        self.type = first_non_none(e_type, self.type)
        self.orig_txt = first_non_none(orig_txt, self.orig_txt)
        if self.value_s is not None:
            value_s = self.value_s  # an explicit string form takes precedence
        elif self.value is None:
            value_s = ''
        elif isinstance(self.value, float) and (self.n_decimals is not None):
            value_s = f'{self.value:0.{self.n_decimals}f}'
        else:
            value_s = str(self.value)
        fraction_s = '' if self.fraction is None else f'{self.fraction.numerator}/{self.fraction.denominator}'
        delimiter_s = ' ' if value_s and fraction_s else ''
        self.txt = (value_s + delimiter_s + fraction_s) or self.orig_txt
        return self.txt

    def __str__(self):
        if self.num_base is not None:
            if self.base_multiplier is not None:
                b_clause = f'{self.base_multiplier}*{self.num_base}'
            else:
                b_clause = str(self.num_base)
        else:
            b_clause = None
        # A leading ' *' marks an edge that is not active.
        return (('' if self.active else ' *')
                + f'[{self.start}-{self.end}] {self.orig_txt} R:{self.txt} T:{self.type}'
                + (' LP' if self.is_large_power else '')
                + (f' B:{b_clause}' if (b_clause is not None) else '')
                + (f' V:{self.value}' if ((self.value is not None) and (str(self.value) != self.txt)) else '')
                + (f' VS:{self.value_s}' if ((self.value_s is not None) and (self.value_s != self.txt)) else '')
                + (f' F:.{self.n_decimals}f' if self.n_decimals else '')
                + (f' S:{self.script}' if self.script else ''))
Has edges.""" + def __init__(self, s: str, uroman: Uroman, lcode: str = None): + self.s = s + self.lcode = lcode + self.lattice = defaultdict(set) + self.max_vertex = len(s) + self.uroman = uroman + self.props = {} + self.simple_top_rom_cache = {} + self.contains_script = defaultdict(bool) + self.check_for_scripts() + + def check_for_scripts(self): + for c in self.s: + script_name = self.uroman.chr_script_name(c) + self.contains_script[script_name] = True + if regex.search(r'[\u2800-\u28FF]', self.s): + self.contains_script['Braille'] = True + + def add_edge(self, edge: Edge): + self.lattice[(edge.start, edge.end)].add(edge) + self.lattice[(edge.start, 'right')].add(edge.end) + self.lattice[(edge.end, 'left')].add(edge.start) + + def __str__(self): + edges = [] + for start in range(self.max_vertex): + for end in self.lattice[(start, 'right')]: + for edge in self.lattice[(start, end)]: + edges.append(f'[{start}-{end}] {edge.txt} ({edge.type})') + return ' '.join(edges) + + @staticmethod + def char_is_braille(c: str) -> bool: + return 0x2800 <= ord(c[0]) <= 0x28FF + + # Help Tibet + def char_is_subjoined_letter(self, c: str) -> bool: + return "SUBJOINED LETTER" in self.uroman.chr_name(c) + + def char_is_regular_letter(self, c: str) -> bool: + char_name = self.uroman.chr_name(c) + return ("LETTER" in char_name) and not ("SUBJOINED" in char_name) + + def char_is_letter(self, c: str) -> bool: + return "LETTER" in self.uroman.chr_name(c) + + def char_is_vowel_sign(self, c: str) -> bool: + return self.uroman.dict_bool[('is-vowel-sign', c)] + + def char_is_letter_or_vowel_sign(self, c: str) -> bool: + return self.char_is_letter(c) or self.char_is_vowel_sign(c) + + def is_at_start_of_word(self, position: int) -> bool: + # return not regex.match(r'(?:\pL|\pM)', self.s[position-1:position]) + first_char = self.s[position] + first_char_is_braille = self.char_is_braille(first_char) + end = position + if (preceded_by_alpha := self.props.get(('preceded_by_alpha', end), None)) in 
(True, False): + return not preceded_by_alpha + for start in self.lattice[(end, 'left')]: + for edge in self.lattice[(start, end)]: + prev_letter = None if edge.txt == '' else edge.txt[-1] + if len(edge.txt) and (prev_letter.isalpha() or (first_char_is_braille and (prev_letter in ["'"]))): + self.props[('preceded_by_alpha', position)] = True + return False + self.props[('preceded_by_alpha', position)] = False + return True + + def is_at_end_of_word(self, position: int) -> bool: + if (cached_followed_by_alpha := self.props.get(('followed_by_alpha', position), None)) in (True, False): + return not cached_followed_by_alpha + start = position + while (start+1 < self.max_vertex) \ + and self.uroman.char_is_nonspacing_mark(self.s[start]) \ + and ('NUKTA' in self.uroman.chr_name(self.s[start])): + start += 1 + for end in range(start + 1, self.max_vertex + 1): + s = self.s[start:end] + if not self.uroman.dict_bool[('s-prefix', s)]: + break + for rom_rule in self.uroman.rom_rules[s]: + rom = rom_rule['t'] + if (not rom_rule['use-only-at-start-of-word']) and regex.search(r'\pL', rom): + self.props[('followed_by_alpha', position)] = True + return False + self.props[('followed_by_alpha', position)] = False + return True + + def is_at_end_of_syllable(self, position: int) -> Tuple[bool, str]: + """At least initially for Thai""" + prev_char = self.s[position-2] if position >= 2 else None + # char = self.s[position-1] if position >= 1 else None + next_char = self.s[position] if position < self.max_vertex else None + if self.uroman.dict_str[('tone-mark', next_char)]: + adj_position = position + 1 + next_char = self.s[adj_position] if adj_position < self.max_vertex else None + # print('TONE-MARK', position, next_char) + else: + adj_position = position + next_char2 = self.s[adj_position + 1] if adj_position + 1 < self.max_vertex else None + if prev_char is None: + return False, 'start-of-string' + if not regex.search(r'(?:\pL|\pM)$', prev_char): # start of token + return False, 
'start-of-token' + if self.uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant': + return False, 'pre-post-vowel-on-left' + if self.uroman.dict_str[('syllable-info', next_char)] == 'written-pre-consonant-spoken-post-consonant': + return True, 'pre-post-vowel-on-right' + if adj_position >= self.max_vertex: # end of string + return True, 'end-of-string' + # if not self.char_is_letter_or_vowel_sign(next_char): # end of token + if not regex.match(r'(?:\pL|\pM)', next_char): # end of token + return True, 'end-of-token' + if position > 0: + left_edge = self.best_left_neighbor_edge(position-1) + if left_edge and regex.search(r'[bcdfghjklmnpqrstvxz]$', left_edge.txt): + return False, 'consonant-to-the-left' + next_char_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position, + adj_position + 2, + simple_search=True), + self.simple_top_romanization_candidate_for_span(adj_position, + adj_position + 1, + simple_search=True), + "?") + if not regex.match(r"[aeiou]", next_char_rom.lower()): # followed by consonant + return True, f'not-followed-by-vowel {next_char_rom}' + if (next_char == '\u0E2D') and (next_char2 is not None): # THAI CHARACTER O ANG + next_char2_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position+1, + adj_position+2, + simple_search=True), + "?") + if regex.match(r"[aeiou]", next_char2_rom.lower()): + return True, 'o-ang-followed-by-vowel' # In that context Thai char. 
def expand_rom_with_special_chars(self, rom: str, start: int, end: int, **args) \
        -> Tuple[str, int, int, str | None]:
    """Apply special romanization heuristics that modify an existing or preliminary edge
    based on its context in self.s.

    rom is the preliminary romanization of self.s[start:end].
    Returns (rom, start, end, annotation): the possibly modified romanization, the possibly
    widened span, and an annotation ('rom exp' where the span was expanded and annotated,
    'rom del' for a deleted Thai O ANG, or None).
    An empty rom is returned unchanged.
    """
    orig_start = start
    uroman = self.uroman
    full_string = self.s
    annot = None
    if rom == '':
        return rom, start, end, None
    prev_char = (full_string[start-1] if start >= 1 else '')
    first_char = full_string[start]
    last_char = full_string[end-1]
    next_char = (full_string[end] if end < len(full_string) else '')
    # \u2820 is the Braille character indicating that the next letter is upper case
    if (prev_char == '\u2820') and regex.match(r'[a-z]', rom):
        return rom[0].upper() + rom[1:], start-1, end, 'rom exp'
    # Normalize multi-upper case THessalonike -> Thessalonike, but don't change THESSALONIKE
    if start+1 == end and rom.isupper() and next_char.islower():
        ablation = args.get('ablation', '')
        if 'nocap' not in ablation:
            rom = rom.capitalize()
    # Japanese small tsu (and Gurmukhi addak) used as consonant doubler.
    # The doubler must be in the same script as the character it doubles
    # (the original compared prev_char's script with itself, which is always true).
    if (prev_char and prev_char in 'っッ\u0A71') \
            and (uroman.chr_script_name(prev_char) == uroman.chr_script_name(first_char)) \
            and (m_double_consonant := regex.match(r'(ch|[bcdfghjklmnpqrstwz])', rom)):
        # No early return: further expansion might additionally apply to the right.
        if prev_char in 'っッ':  # for Japanese, per Hepburn, use tch
            rom = m_double_consonant.group(1).replace('ch', 't') + rom
        else:
            rom = m_double_consonant.group(1).replace('ch', 'c') + rom
        start = start-1
        first_char = full_string[start]
        prev_char = (full_string[start-1] if start >= 1 else '')
    # Thai
    if uroman.chr_script_name(first_char) == 'Thai':
        pre_post_vowel = 'written-pre-consonant-spoken-post-consonant'
        if (start+1 == end) and regex.match(r'[bcdfghjklmnpqrstvwxyz]+$', rom):
            if uroman.dict_str[('syllable-info', prev_char)] == pre_post_vowel:
                # Look for a rule for a vowel written around the consonant:
                # one character before, up to three characters after.
                for vowel_prefix_len in [1]:
                    if vowel_prefix_len <= start:
                        for vowel_suffix_len in [3, 2, 1]:
                            if end + vowel_suffix_len <= len(full_string):
                                pattern = (full_string[start-vowel_prefix_len: start]
                                           + '–'
                                           + full_string[end:end+vowel_suffix_len])
                                if uroman.rom_rules[pattern]:
                                    vowel_rom_rule = uroman.rom_rules[pattern][0]
                                    vowel_rom = vowel_rom_rule['t']
                                    return (rom + vowel_rom, start-vowel_prefix_len,
                                            end+vowel_suffix_len, 'rom exp')
        # Vowel written before the consonant, but spoken after it.
        if (uroman.chr_script_name(prev_char) == 'Thai') \
                and (uroman.dict_str[('syllable-info', prev_char)] == pre_post_vowel) \
                and regex.match(r'[bcdfghjklmnpqrstvwxyz]', rom) \
                and (vowel_rom := self.romanization_by_first_rule(prev_char)):
            return rom + vowel_rom, start-1, end, 'rom exp'
        # THAI CHARACTER O ANG
        if (first_char == '\u0E2D') and (end - start == 1):
            prev_script = uroman.chr_script_name(prev_char)
            next_script = uroman.chr_script_name(next_char)
            # Either romanization may be missing (None); treat that as "not a consonant"
            # instead of passing None to regex.match.
            prev_rom = self.find_rom_edge_path_backwards(0, start, 1, return_str=True) or ''
            next_rom = self.romanization_by_first_rule(next_char) or ''
            # delete THAI CHARACTER O ANG unless it is surrounded on both sides by a Thai consonant
            if not ((prev_script == 'Thai') and (next_script == 'Thai')
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', prev_rom)
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', next_rom)):
                return '', start, end, 'rom del'
    # Coptic: consonant + grace-accent = e + consonant
    if next_char and (next_char == "\u0300") and (uroman.chr_script_name(last_char) == "Coptic")\
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)):
        rom = 'e' + rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese small y: ki + small ya = kya etc.
    if (next_char and next_char in 'ゃゅょャュョ') \
            and (uroman.chr_script_name(last_char) == uroman.chr_script_name(next_char)) \
            and regex.search(r'([bcdfghjklmnpqrstvwxyz]i$)', rom) \
            and (y_rom := self.romanization_by_first_rule(next_char)) \
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)) \
            and (not self.simple_top_romanization_candidate_for_span(start, end+1)):
        rom = rom[:-1] + y_rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese vowel lengthener (U+30FC)
    last_rom_char = rom[-1]  # rom is never empty at this point
    if (next_char == 'ー') \
            and (uroman.chr_script_name(last_char) in ('Hiragana', 'Katakana')) \
            and (last_rom_char in 'aeiou'):
        return rom + last_rom_char, start, end+1, 'rom exp'
    # Virama (in Indian languages)
    if self.uroman.dict_bool[('is-virama', next_char)]:
        return rom, start, end + 1, "rom exp"
    # Trim a space in rom that would double up with a neighboring space or the string boundary.
    if rom.startswith(' ') and ((start == 0) or (prev_char == ' ')):
        rom = rom[1:]
    # The end-of-string test is end == len(full_string); end can never exceed that length.
    if rom.endswith(' ') and ((end == len(full_string)) or (next_char == ' ')):
        rom = rom[:-1]
    return rom, start, end, annot
elif all_caps: + if c in '\u2800': # Braille space + all_caps = False + else: + self.props[('is-upper', i)] = True + + def pick_tibetan_vowel_edge(self, **args) -> None: + if not self.contains_script['Tibetan']: + return None + verbose = bool(args.get('verbose')) + s = self.s + uroman = self.uroman + tibetan_syllable = [] + tibetan_letter_positions = [] + for start in range(self.max_vertex): + c = s[start] + if (uroman.chr_script_name(c) == 'Tibetan') and self.char_is_letter_or_vowel_sign(c): + tibetan_letter_positions.append(start) + else: + if tibetan_letter_positions: + tibetan_syllable.append(tibetan_letter_positions) + tibetan_letter_positions = [] + if tibetan_letter_positions: + tibetan_syllable.append(tibetan_letter_positions) + for tibetan_letter_positions in tibetan_syllable: + vowel_pos = None + orig_txt = '' + roms = [] + subjoined_letter_positions = [] + first_letter_position = tibetan_letter_positions[0] + for i in tibetan_letter_positions: + c = s[i] + orig_txt += c + rom = first_non_none(self.simple_top_romanization_candidate_for_span(i, i+1), "?") + self.props[('edge-vowel', i)] = None + if self.char_is_vowel_sign(c) or (rom and regex.match(r"[aeiou]+$", rom)): + vowel_pos = i + self.props[('edge-vowel', i)] = True + # delete any syllable initial ' before vowel + if roms == ["'"]: + self.props[('edge-delete', i-1)] = True + elif self.char_is_subjoined_letter(c): + subjoined_letter_positions.append(i) + if i > first_letter_position: + if c == "\u0FB0": + vowel_pos = i-1 + self.props[('edge-vowel', i-1)] = True + else: + self.props[('edge-vowel', i-1)] = False + rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom) + elif c == "\u0F60": # Tibetan letter -a (') + self.props[('edge-vowel', i)] = False + if i > first_letter_position: + vowel_pos = i-1 + self.props[('edge-vowel', i-1)] = True + if i == tibetan_letter_positions[-1]: + self.props[('edge-delete', i)] = True + if roms and not (roms[-1] in "aeiou"): + rom = "a'" + else: + rom = "'" + 
else: + rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom) + roms.append(rom) + if vowel_pos is not None: + for i in tibetan_letter_positions: + if self.props.get(('edge-vowel', i)) is None: + self.props[('edge-vowel', i)] = False + else: + best_cost, best_vowel_pos, best_pre, best_post = math.inf, None, None, None + n_letters = len(tibetan_letter_positions) + for i in tibetan_letter_positions: + rel_pos = i - first_letter_position + pre, post = ''.join(roms[:rel_pos+1]), ''.join(roms[rel_pos+1:]) + if self.props.get(('edge-vowel', i)) is False: + cost = 20 + if cost < best_cost: + best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post + elif n_letters == 1: + cost = 0 + if cost < best_cost: + best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post + elif n_letters == 2: + cost = 0 if i == 0 else 0.1 + if cost < best_cost: + best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post + else: + good_suffix = regex.match(r"(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|" + r"dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|" + r"rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$", post) + good_prefix = regex.match(r"'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|" + r"ch|db|dby|dk|dm|dp|dpy|dr|" + r"gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|" + r"ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|" + r"sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$", pre) + subjoined_suffix = all([x in subjoined_letter_positions + for x in tibetan_letter_positions[rel_pos+2:]]) + # print('GOOD', good_suffix, good_prefix, subjoined_suffix, f'{pre}a{post}', + # subjoined_letter_positions, tibetan_letter_positions[rel_pos+2:]) + if good_suffix and good_prefix: + cost = len(pre) * 0.1 + elif good_suffix: + cost = len(pre) + elif subjoined_suffix and good_prefix: + cost = len(pre) * 0.3 + elif subjoined_suffix: + cost = len(pre) * 0.5 + else: + cost = math.inf + if cost < best_cost: + best_cost, best_vowel_pos, best_pre, best_post = cost, 
i, pre, post + if best_vowel_pos is not None: + for i in tibetan_letter_positions: + if self.props.get(('edge-vowel', i)) is None: + value = (i == best_vowel_pos) + self.props[('edge-vowel', i)] = value + if verbose: + best_cost = best_cost if isinstance(best_cost, int) else round(best_cost, 2) + sys.stderr.write(f'Tib. best cost: "{best_pre}a{best_post}" o:{orig_txt} c:{round(best_cost, 2)}' + f' p:{best_vowel_pos} {tibetan_letter_positions}\n') + + def add_default_abugida_vowel(self, rom: str, start: int, end: int, annotation: str = '') -> str: + """Adds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia.""" + uroman = self.uroman + s = self.s + try: + first_s_char = s[start] + last_s_char = s[end-1] + script_name = uroman.chr_script_name(first_s_char) + script = self.uroman.scripts[script_name.lower()] + if not (abugida_default_vowels := script['abugida-default-vowels']): + return rom + key = (script, rom) + if key in uroman.abugida_cache: + base_rom, base_rom_plus_vowel, mod_rom = uroman.abugida_cache[key] + rom = mod_rom + else: + vowels_regex1 = '|'.join(abugida_default_vowels) # e.g. 'a' or 'a|o' + vowels_regex2 = '|'.join(map(lambda x: x + '+', abugida_default_vowels)) # e.g. 
'a+' or 'a+|o+' + if m := regex.match(fr'([cfghkmnqrstxy]?y)({vowels_regex2})-?$', rom): + base_rom = m.group(1) + base_rom_plus_vowel = base_rom + m.group(2) + elif m := regex.match(fr'([bcdfghjklmnpqrstvwxyz]+)({vowels_regex1})-?$', rom): + base_rom = m.group(1) + base_rom_plus_vowel = base_rom + m.group(2) + if rom.endswith('-') and (start+1 == end) and rom[0].isalpha(): + rom = rom[:-1] + else: + base_rom = rom + base_rom_plus_vowel = base_rom + abugida_default_vowels[0] + if (not regex.match(r"[bcdfghjklmnpqrstvwxyz]+$", base_rom) + and (not ((script_name == 'Tibetan') and (base_rom == "'")))): + base_rom, base_rom_plus_vowel = None, None + uroman.abugida_cache[key] = (base_rom, base_rom_plus_vowel, rom) + if base_rom is None: + return rom + if 'tail' in annotation: + return rom + prev_s_char = s[start-1] if start >= 1 else '' + next_s_char = s[end] if len(s) > end else '' + next2_s_char = s[end+1] if len(s) > end+1 else '' + if script_name == 'Tibetan': + if self.props.get(('edge-delete', start)): + return '' + elif self.props.get(('edge-vowel', start)): + return base_rom_plus_vowel + else: + return base_rom + if (next_s_char and ((base_rom in "bcdfghklmnpqrstvwz") or (base_rom in ["ng"])) + and (next_s_char in "យ")): # Khmer yo + return base_rom + if self.uroman.dict_bool[('is-vowel-sign', next_s_char)]: + return base_rom + if self.uroman.dict_bool[('is-medial-consonant-sign', next_s_char)]: + return base_rom + if self.char_is_subjoined_letter(next_s_char): + return base_rom + if self.uroman.char_is_nonspacing_mark(next_s_char) \ + and self.uroman.dict_bool[('is-vowel-sign', next2_s_char)]: + return base_rom + if self.uroman.dict_bool[('is-virama', next_s_char)]: + return base_rom + if self.uroman.char_is_nonspacing_mark(next_s_char) \ + and self.uroman.dict_bool[('is-virama', next2_s_char)]: + return base_rom + if self.uroman.dict_bool[('is-virama', prev_s_char)]: + return base_rom_plus_vowel + if self.is_at_start_of_word(start) and not 
regex.search('r[aeiou]', rom): + return base_rom_plus_vowel + # delete many final schwas from most Devanagari languages (except: Sanskrit) + if self.is_at_end_of_word(end): + if (script_name in ("Devanagari",)) and (self.lcode not in ('san',)): # Sanskrit + return rom + else: + return base_rom_plus_vowel + if uroman.chr_script_name(prev_s_char) != script_name: + return base_rom_plus_vowel + if 'VOCALIC' in self.uroman.chr_name(last_s_char): + return base_rom + if uroman.chr_script_name(next_s_char) == script_name: + return base_rom_plus_vowel + except Exception: + return rom + else: + pass + # print('ABUGIDA', rom, start, script_name, script, abugida_default_vowels, prev_s_char, next_s_char) + return rom + + def cand_is_valid(self, rom_rule: RomRule, start: int, end: int, rom: str) -> bool: + if rom is None: + return False + if rom_rule['dont-use-at-start-of-word'] and self.is_at_start_of_word(start): + return False + if rom_rule['use-only-at-start-of-word'] and not self.is_at_start_of_word(start): + return False + if rom_rule['dont-use-at-end-of-word'] and self.is_at_end_of_word(end): + return False + if rom_rule['use-only-at-end-of-word'] and not self.is_at_end_of_word(end): + return False + if rom_rule['use-only-for-whole-word'] \ + and not (self.is_at_start_of_word(start) and self.is_at_end_of_word(end)): + return False + if (lcodes := rom_rule['lcodes']) and (self.lcode not in lcodes): + return False + return True + + # @profile + def simple_sorted_romanization_candidates_for_span(self, start, end) -> List[str]: + s = self.s[start:end] + if not self.uroman.dict_bool[('s-prefix', s)]: + return [] + rom_rule_candidates = [] + for rom_rule in self.uroman.rom_rules[s]: + rom = rom_rule['t'] + if self.cand_is_valid(rom_rule, start, end, rom): + rom_rule_candidates.append((rom_rule['n-restr'] or 0, rom_rule['t'])) + rom_rule_candidates.sort(reverse=True) + return [x[1] for x in rom_rule_candidates] + + def simple_top_romanization_candidate_for_span(self, start, end, 
simple_search: bool = False) -> str | None: + if (start < 0) or (end > self.max_vertex): + return None + span_range = (start, end) + if (cached_result := self.simple_top_rom_cache.get(span_range)) is not None: + return cached_result + best_cand, best_n_restr, best_rom_rule = None, None, None + for rom_rule in self.uroman.rom_rules[self.s[start:end]]: + if self.cand_is_valid(rom_rule, start, end, rom_rule['t']): + n_restr = rom_rule['n-restr'] or 0 + if best_n_restr is None or (n_restr > best_n_restr): + best_cand, best_n_restr, best_rom_rule = rom_rule['t'], n_restr, rom_rule + if simple_search: + return best_cand + if best_rom_rule: + t_at_end_of_syllable = best_rom_rule['t-at-end-of-syllable'] + if t_at_end_of_syllable is not None: + is_at_end_of_syllable, rationale = self.is_at_end_of_syllable(end) + if is_at_end_of_syllable: + best_cand = t_at_end_of_syllable + # print(f" SIMPLE {start}-{end} {best_cand} ({best_rom_rule['t']},{t_at_end_of_syllable}) " + # f"END:{is_at_end_of_syllable} ({rationale})") + self.simple_top_rom_cache[span_range] = best_cand + # if (best_rom_rule is not None) and ('cancel' in (prov := best_rom_rule['prov'])): + # sys.stderr.write(f' Cancel {self.s} ({start}-{end}) {prov} {self.s[start:end]}\n') + return best_cand + + def decomp_rom(self, char_position: int) -> str | None: + """Input: decomposable character such as ﻼ or ½ + Output: la or 1/2""" + full_string = self.s + char = full_string[char_position] + rom = None + if ud_decomp_s := ud.decomposition(char): + format_comps = [] + other_comps = [] + decomp_s = '' + # name = self.uroman.chr_name(char) + for ud_decomp_elem in ud_decomp_s.split(): + if ud_decomp_elem.startswith("<"): + format_comps.append(ud_decomp_elem) + else: + try: + norm_char = chr(int(ud_decomp_elem, 16)) + except ValueError: + other_comps.append(ud_decomp_elem) + else: + decomp_s += norm_char + if (format_comps and (format_comps[0] not in ('', '', '', '')) + and (not other_comps) and decomp_s): + rom = 
self.uroman.romanize_string(decomp_s, self.lcode) + # make sure to add a space for 23½ -> 23 1/2 + if rom and ud.numeric(char, None): + rom = rom.replace('⁄', '/') + if char_position >= 1 and ud.numeric(full_string[char_position-1], None): + rom = ' ' + rom + if (char_position+1 < len(full_string)) and ud.numeric(full_string[char_position+1], None): + rom += ' ' + return rom + + def add_romanization(self, **args): + """Adds a romanization edge to the romanization lattice.""" + for start in range(self.max_vertex): + for end in range(start+1, self.max_vertex+1): + if not self.uroman.dict_bool[('s-prefix', self.s[start:end])]: + break + if (rom := self.simple_top_romanization_candidate_for_span(start, end)) is not None: + if self.contains_script['Braille'] and (start+1 == end): + if self.props.get(('is-upper', start)): + rom = rom.upper() + edge_annotation = 'rom' + if regex.match(r'\+(m|ng|n|h|r)', rom): + rom, edge_annotation = rom[1:], 'rom tail' + rom = self.add_default_abugida_vowel(rom, start, end, annotation=edge_annotation) + # orig_rom, orig_start, orig_end = rom, start, end + rom, start2, end2, exp_edge_annotation \ + = self.expand_rom_with_special_chars(rom, start, end, annotation=edge_annotation, + recursive=args.get('recursive', False), **args) + edge_annotation = exp_edge_annotation or edge_annotation + # if (orig_rom, orig_start, orig_end) != (rom, start, end): + # print(f'EXP {s} {orig_rom} {orig_start}-{orig_end} -> {rom} {start}-{end}') + # if rom != rom_orig: print('** Add ABUGIDA', rom, start, end, rom2) + self.add_edge(Edge(start2, end2, rom, edge_annotation)) + if start < len(self.s): + char = self.s[start] + cp = ord(char) + # Korean Hangul characters + if 0xAC00 <= cp <= 0xD7A3: + if rom := self.uroman.unicode_hangul_romanization(char): + self.add_edge(Edge(start, start+1, rom, 'rom')) + # character decomposition + if rom_decomp := self.decomp_rom(start): + self.add_edge(Edge(start, start + 1, rom_decomp, 'rom decomp')) + + @staticmethod + def 
update_edge_list(edges, new_edge, old_edges) -> List[NumEdge]: + new_edge_not_yet_added = True + result = [] + for edge in edges: + if edge in old_edges: + edge.active = False + if new_edge_not_yet_added: + result.append(new_edge) + new_edge_not_yet_added = False + else: + result.append(edge) + if new_edge_not_yet_added: + result.append(new_edge) + return result + + @staticmethod + def edge_is_digit(edge: Edge | None) -> bool: + return (isinstance(edge, NumEdge) + and (edge.value is not None) + and isinstance(edge.value, int) + and (edge.type == 'digit') + and (0 <= edge.value <= 9) + and (edge.end - edge.start == 1)) + + @staticmethod + def is_gap_null_edge(edge: Edge) -> bool: + return isinstance(edge, NumEdge) and (edge.orig_txt in ('零', '〇')) + + @staticmethod + def braille_digit(char: str) -> str | None: + position = '\u281A\u2801\u2803\u2809\u2819\u2811\u280B\u281B\u2813\u280A'.find(char) # Braille 0-9 + return str(position) if position >= 0 else None + + def add_braille_number(self, start: int, end: int, txt: str, **_args) -> None: + new_edge = NumEdge(start, end, txt, self.uroman) + new_edge.type = 'number' + self.add_edge(new_edge) + + def add_braille_numbers(self, **_args): + if self.contains_script['Braille']: + s = self.s + num_s, start = '', None + for i in range(len(s)): + char = s[i] + if char == '\u283C': # number mark + if start is None: + start = i + elif (start is not None) and (digit_s := self.braille_digit(char)): + num_s += digit_s + elif (start is not None) and (char == '\u2832'): # period + num_s += '.' + elif (start is not None) and (char == '\u2802'): # comma + num_s += ',' + elif isinstance(start, int): + self.add_braille_number(start, i, num_s) + num_s, start = '', None + if start is not None: + self.add_braille_number(start, len(s), num_s) + + def add_numbers(self, uroman, **args): + """Adds a numerical romanization edge to the romanization lattice, currently just for digits. 
+ To be significantly expanded to cover complex Chinese, Egyptian, Amharic numbers.""" + verbose = bool(args.get('verbose')) + s = self.s + num_edges = [] + for start in range(len(s)): + char = s[start] + if uroman.num_props[char]: + new_edge = NumEdge(start, start + 1, char, uroman) + num_edges.append(new_edge) + if verbose: + print('NumEdge', new_edge) + self.add_edge(new_edge) + # D1 sequence of digits 1234 + for edge in num_edges: + if self.edge_is_digit(edge) and edge.active: # and (edge.value != 0): + n_decimal_points = 0 + n_decimals = None + new_value_s = str(edge.value) + sub_edges = [edge] + prev_edge = edge + while True: + right_edge = self.best_right_neighbor_edge(prev_edge.end) + if self.edge_is_digit(right_edge): + sub_edges.append(right_edge) + new_value_s += str(right_edge.value) + if n_decimals is not None: + n_decimals += 1 + prev_edge = right_edge + elif ((prev_edge.end < len(s)) and (s[prev_edge.end] == '.') and (n_decimal_points == 0) + and (right_edge2 := self.best_right_neighbor_edge(prev_edge.end + 1)) + and self.edge_is_digit(right_edge2)): + if right_edge is None: + right_edge = Edge(prev_edge.end, prev_edge.end+1, s[prev_edge.end], + 'decimal period') + self.add_edge(right_edge) + sub_edges.append(right_edge) + sub_edges.append(right_edge2) + new_value_s += '.' + str(right_edge2.value) + n_decimal_points += 1 + n_decimals = 1 + prev_edge = right_edge2 + else: + break + if len(sub_edges) >= 2: + new_value = float(new_value_s) if '.' 
in new_value_s else int(new_value_s) + new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True) + new_edge.update(value=new_value, value_s=new_value_s, n_decimals=n_decimals, num_base=1, + e_type='D1', script=sub_edges[-1].script) + self.add_edge(new_edge) + num_edges = self.update_edge_list(num_edges, new_edge, sub_edges) + if verbose: + print(new_edge.type, new_edge) + # G1 combine (*) "single digits" 2*100=200, 3*10= 30 + for edge in num_edges: + if (isinstance(edge, NumEdge) and edge.active and (edge.num_base == 1) + and isinstance(edge.value, int) and (edge.value >= 1)): + right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False) + if (right_edge + and isinstance(right_edge, NumEdge) + and right_edge.active + and isinstance(right_edge.value, int) + and (right_edge.num_base > 1) + and (not right_edge.is_large_power)): + new_value = edge.value * right_edge.value + new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True) + new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G1', + orig_txt=edge.orig_txt + right_edge.orig_txt, + script=right_edge.script) + self.add_edge(new_edge) + num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge]) + if verbose: + print(new_edge.type, new_edge) + # G2 combine (+) G1 "single digits" 200+30+4=234 (within larger blocks of 1000, 1000000) + for edge in num_edges: + if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int) and not edge.is_large_power: + sub_edges = [edge] + prev_edge = edge + prev_non_edge = edge # None if (edge.orig_txt in '零') else prev_edge + while (prev_edge + and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False)) + and isinstance(right_edge, NumEdge) + and right_edge.active + and isinstance(right_edge.value, int) + and (not right_edge.is_large_power) + and (self.is_gap_null_edge(prev_non_edge) + or ((prev_non_edge.num_base > right_edge.value) + 
and (prev_non_edge.num_base > right_edge.num_base)))): + sub_edges.append(right_edge) + prev_edge = right_edge + if not self.is_gap_null_edge(right_edge): + prev_non_edge = right_edge + if len(sub_edges) >= 2: + new_value = sum([e.value for e in sub_edges]) + new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True) + + new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G2', + orig_txt=''.join([e.orig_txt for e in sub_edges]), + script=sub_edges[-1].script) + self.add_edge(new_edge) + num_edges = self.update_edge_list(num_edges, new_edge, sub_edges) + new_edge.type = 'G2' + if verbose: + print(new_edge.type, new_edge) + # G3 combine (*) G2 blocks with large powers, e.g. 234*1000 = 234000 + for edge in num_edges: + if (isinstance(edge, NumEdge) and edge.active and (not edge.is_large_power) + and (isinstance(edge.value, int) or isinstance(edge.value, float))): + right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False) + if (right_edge + and isinstance(right_edge, NumEdge) + and right_edge.active + and isinstance(right_edge.value, int) + and (right_edge.num_base > 1) + and right_edge.is_large_power): + new_value = round(edge.value * right_edge.value, 5) + if isinstance(new_value, float) and new_value.is_integer(): + new_value = int(new_value) + new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True) + new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G3', + orig_txt=edge.orig_txt + right_edge.orig_txt, + script=right_edge.script) + self.add_edge(new_edge) + num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge]) + if verbose: + print(new_edge.type, new_edge) + # G4 combine (+) G3 blocks 234000+567=234567 + for edge in num_edges: + if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int): + sub_edges = [edge] + while ((prev_edge := sub_edges[-1]) + and (right_edge := 
self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False)) + and isinstance(right_edge, NumEdge) + and right_edge.active + and isinstance(right_edge.value, int) + and (prev_edge.num_base > right_edge.value) + and (prev_edge.num_base > right_edge.num_base)): + if ((prev_edge.script == 'CJK') + and (prev_edge.num_base >= 1000) + and ('tag' not in prev_edge.type) + and regex.match('10+$', str(prev_edge.num_base)) + and (1 <= right_edge.value <= 9) + and (right_edge.start + 1 == right_edge.end)): + new_num_base = prev_edge.num_base // 10 + new_value = new_num_base * right_edge.value + # print('DIGIT TAG', prev_edge, right_edge, new_value) + right_edge.value = new_value + right_edge.num_base = new_num_base + right_edge.type = 'G4tag' + sub_edges.append(right_edge) + if len(sub_edges) >= 2: + new_value = sum([e.value for e in sub_edges]) + new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True) + new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G4', + orig_txt=''.join([e.orig_txt for e in sub_edges]), + script=sub_edges[-1].script) + self.add_edge(new_edge) + num_edges = self.update_edge_list(num_edges, new_edge, sub_edges) + if verbose: + print(new_edge.type, new_edge) + # F1 + for edge in num_edges: + # cushion fractions with spaces as needed: e.g. 
23½ -> 23 1/2 or 十一五 -> 11 5 + if isinstance(edge, NumEdge) and regex.match(r'\d', edge.txt): + left_edge = self.best_left_neighbor_edge(edge.start) + if left_edge and regex.search(r'\d$', left_edge.txt): + if edge.fraction: + sep = ' ' + else: + sep = '·' + edge.txt = sep + edge.txt + + for edge in num_edges: + if (isinstance(edge, NumEdge) and edge.active and (edge.value is not None) + and (((edge.value > 1000) and (edge.start + 1 == edge.end)) + or (edge.orig_txt in '兩參参伍陆陸什') + or (edge.orig_txt in ('京兆', )))): + edge.active = False + if verbose: # or (num_edges and any([e.type in ['G1', 'G2', 'G3', 'G4'] for e in num_edges])): + if num_edges: + print('actives:') + for num_edge in num_edges: + print(num_edge) + for start in range(len(s)): + start_char = s[start] + if (best_edge := self.best_edge_in_span(start, start+1)) and isinstance(best_edge, NumEdge): + continue + if (num := ud_numeric(start_char)) is not None: + name = self.uroman.chr_name(start_char) + if ("DIGIT" in name) and isinstance(num, int) and (0 <= num <= 9): + # if start_char not in '0123456789': print('DIGIT', s[start], num, name) + self.add_edge(Edge(start, start + 1, str(num), 'num')) + else: + uroman.stats[('*NUM', start_char, num)] += 1 + + def add_rom_fall_back_singles(self, **_args): + """For characters in the original string not covered by romanizations and numbers, + add a fallback edge based on type, romanization of single char, or original char.""" + for start in range(self.max_vertex): + end = start+1 + orig_char = self.s[start] + if not self.lattice[(start, end)]: + rom, edge_annotation = orig_char, 'orig' + if self.uroman.char_is_nonspacing_mark(rom): + rom, edge_annotation = '', 'Mn' + elif self.uroman.char_is_format_char(rom): # e.g. 
zero-width non-joiner, zero-width joiner + rom, edge_annotation = '', 'Cf' + elif ud.category(orig_char) == 'Co': + rom, edge_annotation = '', 'Co' + elif rom == ' ': + edge_annotation = 'orig' + # elif self.uroman.char_is_space_separator(rom): + # rom, edge_annotation = ' ', 'Zs' + elif (rom2 := self.simple_top_romanization_candidate_for_span(start, end)) is not None: + rom = rom2 + if regex.match(r'\+(m|ng|n|h|r)', rom): + rom = rom[1:] + edge_annotation = 'rom single' + # else the original values still hold: rom, edge_annotation = orig_char, 'orig' + self.add_edge(Edge(start, end, rom, edge_annotation)) + + @staticmethod + def add_new_edge(old_edges: List[Edge], start: int, end: int, new_rom: str, new_type: str, position: int | None, + old_edge_dict: dict)\ + -> None: + if (start, end, new_rom) not in old_edge_dict: + new_edge = Edge(start, end, new_rom, new_type) + if position is None: + old_edges.append(new_edge) + else: + old_edges.insert(position + 1, new_edge) + old_edge_dict[(start, end, new_rom)] = new_edge + # print(f' ALT {start}-{end} {new_rom}') + + def add_alternatives(self, old_edges: List[Edge]) -> None: + old_edge_dict = {} + for old_edge in old_edges: + old_edge_dict[(old_edge.start, old_edge.end, old_edge.txt)] = old_edge + for position, old_edge in enumerate(old_edges): + if old_edge.type.startswith('rom-alt'): + continue # not old + start, end = old_edge.start, old_edge.end + orig_s = self.s[start:end] + old_rom = old_edge.txt + # self.lattice[(start, end)]: + for rom_rule in self.uroman.rom_rules[orig_s]: + rom_t = rom_rule['t'] + if self.cand_is_valid(rom_rule, start, end, rom_t): + rom_alts = rom_rule['t-alts'] + rom_eosyl = rom_rule['t-at-end-of-syllable'] + if (rom_t == old_rom) and rom_alts: + for rom_alt in rom_alts: + self.add_new_edge(old_edges, start, end, rom_alt, 'rom-alt', position, + old_edge_dict) + if (rom_t == old_rom) and rom_eosyl: + self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt2', position, old_edge_dict) + if 
rom_eosyl == old_rom: + self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt3', position, old_edge_dict) + + def all_edges(self, start: int, end: int) -> List[Edge]: + result = [] + for start2 in range(start, end): + for end2 in sorted(list(self.lattice[(start2, 'right')]), reverse=True): + if end2 <= end: + result.extend(self.lattice[(start2, end2)]) + else: + break + return result + + def best_edge_in_span(self, start: int, end: int, skip_num_edge: bool = False) -> Edge | None: + edges = self.lattice[(start, end)] + # if len(edges) >= 2: print('Multi edge', start2, end2, self.s[start2:end2], edges) + decomp_edge, other_edge, rom_edge = None, None, None + for edge in edges: + if isinstance(edge, NumEdge): + if skip_num_edge: + continue + if edge.active: + return edge + if edge.type.startswith('rom decomp'): + if decomp_edge is None: + decomp_edge = edge # plan C + elif regex.match(r'(?:rom|num)', edge.type): + if rom_edge is None: + rom_edge = edge # plan B + elif other_edge is None: + other_edge = edge # plan D + return rom_edge or decomp_edge or other_edge + + def best_right_neighbor_edge(self, start: int, skip_num_edge: bool = False) -> Edge | None: + for end in sorted(list(self.lattice[(start, 'right')]), reverse=True): + if best_edge := self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge): + return best_edge + return None + + def best_left_neighbor_edge(self, end: int, skip_num_edge: bool = False) -> Edge | None: + for start in sorted(list(self.lattice[(end, 'left')])): + if best_edge := self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge): + return best_edge + return None + + def best_rom_edge_path(self, start: int, end: int, skip_num_edge: bool = False) -> List[Edge]: + """Finds the best romanization edge path through the romanization lattice, including + non-romanized pieces such as ASCII and non-ASCII punctuation.""" + result = [] + start2 = start + while start2 < end: + if best_edge := self.best_right_neighbor_edge(start2, 
skip_num_edge=skip_num_edge): + result.append(best_edge) + start2 = best_edge.end + else: # should not happen + start2 += 1 + return result + + def find_rom_edge_path_backwards(self, start: int, end: int, min_char: int | None = None, + return_str: bool = False, skip_num_edge: bool = False) -> List[Edge] | str: + """Finds a partial best path on the left from a start position to provide left contexts for + romanization rules. Can return a string or a list of edges. Is typically used for a short context, + as specified by min_char.""" + result_edges = [] + rom = '' + end2 = end + while start < end2: + old_end2 = end2 + if new_edge := self.best_left_neighbor_edge(end2, skip_num_edge=skip_num_edge): + result_edges = [new_edge] + result_edges + rom = new_edge.txt + rom + end2 = new_edge.start + if min_char and len(rom) >= min_char: + break + if old_end2 >= end2: + end2 -= 1 + if return_str: + return rom + else: + return result_edges + + @staticmethod + def edge_path_to_surf(edges) -> str: + result = '' + for edge in edges: + result += edge.txt + return result + + +# @timer +def main(): + """This function provides a user interface, either using argparse for a command line interface, + or providing direct function calls. + First, a uroman object will have to created, loading uroman data (directory must be provided, + listed as default). This only needs to be done once. + After that you can romanize from file to file, or just romanize a string.""" + + # Compute data_dir based on the location of this executable script. 
+ src_dir = os.path.dirname(os.path.realpath(__file__)) + root_dir = os.path.dirname(src_dir) + data_dir = os.path.join(root_dir, "data") + # print(src_dir, root_dir, data) + + parser = argparse.ArgumentParser() + parser.add_argument('direct_input', nargs='*', type=str) + parser.add_argument('--data_dir', type=Path, default=data_dir, help='uroman resource dir') + parser.add_argument('-i', '--input_filename', type=str, help='default: sys.stdin') + parser.add_argument('-o', '--output_filename', type=str, help='default: sys.stdout') + parser.add_argument('-l', '--lcode', type=str, default=None, + help='ISO 639-3 language code, e.g. eng') + # parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, help:'alt: RomFormat.EDGES') + parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, + choices=list(RomFormat), help="Output format of romanization. 'edges' provides offsets") + # The remaining arguments are mostly for development and test + parser.add_argument('--max_lines', type=int, default=None, help='limit uroman to first n lines') + parser.add_argument('--load_log', action='count', default=0, help='report load stats') + parser.add_argument('--test', action='count', default=0, help='perform/display a few tests') + parser.add_argument('-v', '--verbose', action='count', default=0) + parser.add_argument('--rebuild_ud_props', action='count', default=0, + help='rebuild UnicodeDataProps files (for development mode only)') + parser.add_argument('--rebuild_num_props', action='count', default=0, + help='rebuild NumProps file (for development mode only)') + parser.add_argument('--no_caching', action='count', default=0, help='for development mode: speed') + parser.add_argument('--silent', action='count', default=0, help='suppress ... 
progress') + parser.add_argument('-a', '--ablation', type=str, default='', help='for development mode: nocap') + parser.add_argument('--stats', action='count', default=0, help='for development mode: numbers') + parser.add_argument('--ignore_args', action='count', default=0, help='for usage illustration only') + parser.add_argument(PROFILE_FLAG, type=argparse.FileType('w', encoding='utf-8', errors='ignore'), + default=None, metavar='PROFILE-FILENAME', help='(optional output for performance analysis)') + args = parser.parse_args() + # copy selected (minor) args from argparse.Namespace to dict + args_dict = {'rom_format': args.rom_format, 'load_log': args.load_log, 'test': args.test, 'stats': args.stats, + 'no_caching': args.no_caching, 'max_lines': args.max_lines, 'verbose': args.verbose, + 'rebuild_ud_props': args.rebuild_ud_props, 'rebuild_num_props': args.rebuild_num_props, + 'ablation': args.ablation, 'silent': args.silent} + pr = None + if args.profile: + gc.enable() + gc.set_debug(gc.DEBUG_STATS) + gc.set_debug(gc.DEBUG_LEAK) + pr = cProfile.Profile() + pr.enable() + '''Sample calls: +uroman.py --help +uroman.py -i ../test/multi-script.txt -o ../test/multi-script-out2.txt +uroman.py < ../test/multi-script.txt > ../test/multi-script-out2.txt +uroman.py Игорь +uroman.py Игорь --lcode ukr +uroman.py ألاسكا 서울 Καλιφόρνια +uroman.py ちょっとまってください -f edges +uroman.py "महात्मा गांधी" -f lattice +uroman.py สวัสดี --load_log +uroman.py --test +uroman.py --ignore_args +uroman.py Բարեւ -o ../test/tmp-out.txt -f edges +# In double input cases such as in the line below, +# the input-file's romanization is sent to stdout, while the direct-input romanization is sent to stderr +uroman.py ⴰⵣⵓⵍ -i ../test/multi-script.txt > ../test/multi-script-out2.txt + ''' + + if args.ignore_args: + # minimal calls + uroman = Uroman(args.data_dir) + s, s2, s3, s4 = 'Игорь', 'ちょっとまってください', 'ka‍n‍ne', 'महात्मा गांधी' + print(s, uroman.romanize_string(s)) + print(s, uroman.romanize_string(s, 
lcode='ukr')) + print(s2, Edge.json_str(uroman.romanize_string(s2, rom_format=RomFormat.EDGES))) + print(s3, Edge.json_str(uroman.romanize_string(s3, rom_format=RomFormat.EDGES))) + print(s4, Edge.json_str(uroman.romanize_string(s4, rom_format=RomFormat.LATTICE))) + # Note that ../test/multi-script.txt has several lines starting with ::lcode eng etc. + # This allows users to select specific language codes to specific lines, overwriting the overall --lcodes + uroman.romanize_file(input_filename='../test/multi-script.txt', + output_filename='../test/multi-script-out3.txt') + else: + # build a Uroman object (once for many applications and different scripts and languages) + uroman = Uroman(args.data_dir, load_log=args.load_log, rebuild_ud_props=args.rebuild_ud_props, + rebuild_num_props=args.rebuild_num_props) + romanize_file_p = (args.input_filename or args.output_filename + or not (args.direct_input or args.test or args.ignore_args + or args.rebuild_ud_props or args.rebuild_num_props)) + # Romanize any positional arguments, interpreted as strings to be romanized. + for s in args.direct_input: + result = uroman.romanize_string(s.rstrip(), lcode=args.lcode, **args_dict) + result_json = Edge.json_str(result) + if romanize_file_p: + # input from both file/stdin (to file/stdout) and direct-input (to stderr) + if args.input_filename: + sys.stderr.write(result_json + '\n') + # input from direct-input (but not from file/stdin) to stdout + # else pass + # no file/stdin or file/stdout, so we write romanization of direct-input to stdout + else: + print(result_json) + # If provided, apply romanization to an entire file. 
+ if romanize_file_p: + uroman.romanize_file(args.input_filename, args.output_filename, lcode=args.lcode, + direct_input=args.direct_input, **args_dict) + if args.test: + uroman.test_output_of_selected_scripts_and_rom_rules() + uroman.test_romanization() + if uroman.stats and args.stats: + stats100 = {k: uroman.stats[k] for k in list(dict(uroman.stats))[:100]} + sys.stderr.write(f'Stats: {stats100} ...\n') + if args.profile: + if pr: + pr.disable() + ps = pstats.Stats(pr, stream=args.profile).sort_stats(pstats.SortKey.TIME) + ps.print_stats() + print(gc.get_stats()) + + +if __name__ == "__main__": + main()