diff --git "a/uroman/bin/uroman.py" "b/uroman/bin/uroman.py"
new file mode 100644
--- /dev/null
+++ "b/uroman/bin/uroman.py"
@@ -0,0 +1,2278 @@
+#!/usr/bin/env python
+
+"""
+Written by Ulf Hermjakob, USC/ISI  March-April 2024
+uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
+This script is a Python reimplementation of an earlier Perl script, with some improvements.
+The tool has been tested on 250 languages, with 100 or more sentences each.
+This script is still under development and large-scale testing. Feedback welcome.
+This script provides token-size caching (for faster runtimes).
+Output formats include
+  (1) best romanization string
+  (2) best romanization edges ("best path"; incl. start and end positions with respect to the original string)
+  (3) best romanization with alternatives (as applicable for ambiguous romanization)
+  (4) best romanization full lattice (all edges, including superseded sub-edges)
+See below for 'sample calls' under main()
+"""
+
+
+from __future__ import annotations
+import argparse
+from collections import defaultdict
+# from memory_profiler import profile
+import datetime
+from enum import Enum
+from fractions import Fraction
+import gc
+import json
+import math
+import os
+import pathlib
+from pathlib import Path
+import pstats
+import regex
+import sys
+from typing import List, Tuple
+import unicodedata as ud
+PROFILE_FLAG = "--profile"  # also used in argparse processing
+if PROFILE_FLAG in sys.argv:
+    import cProfile
+
+# UTILITIES
+
+
+def timer(func):
+    """Decorator that prints the call, its start time, end time and duration (in seconds) to stdout.
+    The wrapped function's return value is passed through unchanged."""
+    import functools  # local import keeps this helper self-contained
+
+    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
+    def wrapper(*args, **kwargs):
+        start_time = datetime.datetime.now()
+        print(f"Calling: {func.__name__}{args}")
+        print(f"Start time: {start_time:%A, %B %d, %Y at %H:%M}")
+        result = func(*args, **kwargs)
+        end_time = datetime.datetime.now()
+        time_diff = (end_time-start_time).total_seconds()
+        print(f"End time: {end_time:%A, %B %d, %Y at %H:%M}")
+        print(f"Duration: {time_diff} seconds")
+        return result
+    return wrapper
+
+
+def slot_value_in_double_colon_del_list(line: str, slot: str, default: str | list | None = None) -> str | list | None:
+    """For a given slot, e.g. 'cost', get its value from a line such as '::s1 of course ::s2 ::cost 0.3' -> 0.3
+    The value can be an empty string, as for ::s2 in the example above.
+    If the line does not contain the slot, the default is returned.
+    The slot name is matched literally, i.e. any regex metacharacters in it are escaped."""
+    # group(1) is either empty or the text between this slot and the next ' ::slot' (or the end of the line)
+    m = regex.match(fr'(?:.*\s)?::{regex.escape(slot)}(|\s+\S.*?)(?:\s+::\S.*|\s*)$', line)
+    return m.group(1).strip() if m else default
+
+
+def has_value_in_double_colon_del_list(line: str, slot: str) -> bool:
+    """True iff the line contains '::<slot>', even with an empty value
+    (the slot lookup, called without default, yields a string only in that case)."""
+    slot_value = slot_value_in_double_colon_del_list(line, slot)
+    return isinstance(slot_value, str)
+
+
+def dequote_string(s: str) -> str:
+    """Remove one pair of matching quotes ('...', "..." or “...”) along with surrounding whitespace.
+    Anything else, including non-strings such as None, is returned unchanged."""
+    if not isinstance(s, str):
+        return s
+    m = regex.match(r'''\s*(['"“])(.*)(['"”])\s*$''', s)
+    if m is None:
+        return s
+    opening, inner, closing = m.groups()
+    # opening and closing quote must be of the same kind
+    if (opening, closing) in (("'", "'"), ('"', '"'), ('“', '”')):
+        return inner
+    return s
+
+
+def last_chr(s: str) -> str:
+    """Return the last character of s, or the empty string if s is empty."""
+    if s:
+        return s[-1]
+    return ''
+
+
+def ud_numeric(char: str) -> int | float | None:
+    """Numeric value of a character according to unicodedata: an int for whole numbers, otherwise a float.
+    Returns None if the character has no numeric value or the argument is not a single character."""
+    try:
+        value = ud.numeric(char)
+    except (ValueError, TypeError):
+        return None
+    if value.is_integer():
+        return int(value)
+    return value
+
+
+def robust_str_to_num(num_s: str, filename: str = None, line_number: int | None = None, silent: bool = False) \
+        -> int | float | None:
+    """Convert a string to a float (if it contains a '.') or else to an int.
+    An argument that already is an int or float is returned as is; anything else yields None.
+    Unless silent, a failed string conversion is reported on stderr, with line number and filename if given."""
+    if isinstance(num_s, (float, int)):
+        return num_s
+    if not isinstance(num_s, str):
+        return None
+    try:
+        return float(num_s) if "." in num_s else int(num_s)
+    except ValueError:
+        if silent:
+            return None
+    message = f'Cannot convert "{num_s}" to a number'
+    if line_number:
+        message += f' line: {line_number}'
+    if filename:
+        message += f' file: {filename}'
+    sys.stderr.write(message + '\n')
+    return None
+
+
+def first_non_none(*args):
+    """Return the first argument that is not None (falsy values such as 0 or '' count); None if there is none."""
+    return next((arg for arg in args if arg is not None), None)
+
+
+def any_not_none(*args) -> bool:
+    """True iff at least one argument is not None. Note that False, 0 and '' all count as 'not None'."""
+    return any(arg is not None for arg in args)
+
+
+def add_non_none_to_dict(d: dict, key: str, value) -> None:
+    """Store value under key in d, unless the value is None (in which case d is left untouched)."""
+    if value is None:
+        return
+    d[key] = value
+
+
+def fraction_char2fraction(fraction_char: str, fraction_value: float | None = None,
+                           uroman: Uroman | None = None) -> Fraction | None:
+    """Map a fraction character such as '½' to Fraction(1, 2).
+    First choice is the character's Unicode decomposition (<fraction> numerator ⁄ denominator).
+    If that does not yield a fraction and both uroman and a non-zero fraction_value are given,
+    uroman.unicode_float2fraction(fraction_value) is consulted instead. Returns None if neither works."""
+    pieces = []
+    for decomp_elem in ud.decomposition(fraction_char).split():
+        try:
+            pieces.append(chr(int(decomp_elem, 16)))
+        except ValueError:
+            pieces.append(decomp_elem)  # not a hex code point, e.g. the tag '<fraction>'
+    decomp_s = ''.join(pieces)
+    result = None
+    m = regex.match(r'<fraction>(\d+)⁄(\d+)$', decomp_s)
+    if m:
+        try:
+            result = Fraction(int(m.group(1)), int(m.group(2)))
+        except ValueError:
+            result = None
+    if result is not None:
+        return result
+    if uroman and fraction_value:
+        num_denom = uroman.unicode_float2fraction(fraction_value)
+        if num_denom:
+            try:
+                result = Fraction(num_denom[0], num_denom[1])
+            except ValueError:
+                result = None
+    return result
+
+
+def chr_name(char: str) -> str:
+    """Unicode name of a character as per ud.name, except that the empty string is returned (rather than
+    an exception raised) for unnamed characters and for arguments that are not a single character.
+    See related Uroman.char_name() that includes names not included in UnicodeData.txt"""
+    try:
+        name = ud.name(char)
+    except (ValueError, TypeError):
+        name = ''
+    return name
+
+
+def args_get(key: str, args: argparse.Namespace | None = None):
+    """Value of attribute key in the argparse namespace; None if there is no namespace or no such attribute."""
+    if args and (key in args):
+        return vars(args)[key]
+    return None
+
+
+class DictClass:
+    """Light-weight record whose fields are given as keyword arguments and read with obj[key].
+    Field names are stored with '-' in place of '_' (use_only_at_end_of_word -> 'use-only-at-end-of-word').
+    Reading a field that was not stored yields None."""
+
+    def __init__(self, **kw_args):
+        for name, value in kw_args.items():
+            # Empty values are not stored. Note: since 0 == False in Python, a value of 0 is not stored either.
+            if value in (None, [], False):
+                continue
+            self.__dict__[name.replace('_', '-')] = value
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+    def __getitem__(self, key, default=None):
+        return self.__dict__.get(key, default)
+
+    def __bool__(self):
+        # an instance without any stored field counts as false
+        return bool(self.__dict__)
+
+
+class RomRule(DictClass):
+    """A romanization rule. Rules are keyed by their source string.
+
+    Typical attributes: s (source), t (target), prov (provenance), lcodes (language codes),
+    t_alts (target alternatives), use_only_at_start_of_word, dont_use_at_start_of_word,
+    use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word
+    """
+
+
+class Script(DictClass):
+    """Description of a script. Scripts are keyed by their lower case script_name.
+
+    Typical attributes: script_name, direction, abugida_default_vowels, alt_script_names, languages
+    """
+
+
+class RomFormat(Enum):
+    """Output format of romanization
+
+    STR:     simple string
+    EDGES:   list of edges (includes character offsets in original string)
+    ALTS:    lattice including alternative edges
+    LATTICE: lattice including alternative and superseded edges
+    """
+    STR = 'str'
+    EDGES = 'edges'
+    ALTS = 'alts'
+    LATTICE = 'lattice'
+
+    def __str__(self):
+        """A member prints as its plain value, e.g. 'edges' rather than 'RomFormat.EDGES'."""
+        plain_value = self.value
+        return plain_value
+
+
+class Uroman:
+    """This class loads and maintains uroman data independent of any specific text corpus.
+    Typically, only a single instance will be used. (In contrast to multiple lattice instances, one per text.)
+    Methods include some testing. And finally methods to romanize a string (romanize_string()) or an entire file
+    (romanize_file())."""
+    def __init__(self, data_dir: Path, **args):  # args: load_log, rebuild_ud_props, rebuild_num_props
+        """Create the (initially empty) lookup tables and fill them from the resource files.
+
+        data_dir: directory that is passed on to load_resource_files()
+        args: optional keywords load_log, rebuild_ud_props, rebuild_num_props (each defaults to False),
+              also passed on to load_resource_files()
+        """
+        self.data_dir = data_dir
+        self.rom_rules = defaultdict(list)
+        self.scripts = defaultdict(Script)
+        self.dict_bool = defaultdict(bool)
+        self.dict_str = defaultdict(str)
+        self.dict_int = defaultdict(int)
+        self.dict_num = defaultdict(lambda: None)   # values are int (most common), float, or str ("1/2")
+        # num_props key: txt
+        # values:  {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
+        self.num_props = defaultdict(dict)
+        self.dict_set = defaultdict(set)
+        self.float2fraction = {}  # caching
+        # Garbage collection is paused while the resource files are loaded (presumably for speed).
+        # The previous state is restored even if loading raises an exception, and gc is not
+        # switched on if the caller had it switched off.
+        gc_was_enabled = gc.isenabled()
+        gc.disable()
+        try:
+            self.load_resource_files(data_dir, args.get('load_log', False),
+                                     args.get('rebuild_ud_props', False),
+                                     args.get('rebuild_num_props', False))
+        finally:
+            if gc_was_enabled:
+                gc.enable()
+        self.hangul_rom = {}
+        self.rom_cache = {}   # key: (s, lcode) value: t
+        self.stats = defaultdict(int)  # stats, e.g. for unprocessed numbers
+        self.abugida_cache = {}  # key: (script, char_rom) value: (base_rom, base_rom_plus_abugida_vowel, modified rom)
+
+    def second_rom_filter(self, c: str, rom: str, name: str | None) -> Tuple[str | None, str]:
+        """Clean up a romanization that contains a space. Returns (modified romanization or None, name).
+        The name is looked up with self.chr_name(c) if not provided and needed.
+        For Kayah vowel signs and Mende Kikakui syllables, the last token of rom is the result;
+        for other multi-token romanizations, the character c itself is returned.
+        Much of this code will eventually move the old Perl code to generate cleaner primary data"""
+        if not rom or ' ' not in rom:
+            return None, name
+        if name is None:
+            name = self.chr_name(c)
+        # (character name component, pattern whose group 1 is the romanization proper)
+        last_token_patterns = (("MYANMAR VOWEL SIGN KAYAH", r'kayah\s+(\S+)\s*$'),
+                               ("MENDE KIKAKUI SYLLABLE", r'm\d+\s+(\S+)\s*$'))
+        for name_component, pattern in last_token_patterns:
+            if name_component in name:
+                m = regex.search(pattern, rom)
+                if m:
+                    return m.group(1), name
+        if regex.search(r'\S\s+\S', rom):
+            return c, name
+        return None, name
+
+    def load_rom_file(self, filename: str, provenance: str, file_format: str = None, load_log: bool = True):
+        """Reads in and processes the 3 main romanization data files: (1) romanization-auto-table.txt
+        which was automatically generated from UnicodeData.txt (2) UnicodeDataOverwrite.txt that "corrects"
+        some entries in romanization-auto-table.txt and (3) romanization-table.txt which was largely manually
+        created and allows complex romanization rules, some for specific languages, some for specific contexts.
+
+        filename: file to read (as UTF-8); if it does not exist, a message is written to stderr and
+            nothing is loaded
+        provenance: stored as 'prov' in each new rule; a single existing rule with provenance 'ud' or 'ow'
+            is replaced (rather than supplemented) by a new rule that has no restrictions
+        file_format: 'u2r' for lines with ::u (hex code point) and ::r (romanization);
+            otherwise ::s (source) and ::t (target) are read
+        load_log: if true, the number of loaded entries is reported on stderr
+        After the file has been read, automatic rules for the Thai cancellation mark are added
+        for all source strings that do not have a rule yet."""
+        n_entries = 0
+        try:
+            # The data files contain text in many scripts; do not depend on the locale's default encoding.
+            f = open(filename, encoding='utf-8')
+        except FileNotFoundError:
+            sys.stderr.write(f'Cannot open file {filename}\n')
+            return
+        with f:
+            for line_number, line in enumerate(f, 1):
+                if line.startswith('#'):
+                    continue
+                if regex.match(r'^\s*$', line):  # blank line
+                    continue
+                line = regex.sub(r'\s{2,}#.*$', '', line)  # remove trailing comment
+                if file_format == 'u2r':
+                    t_at_end_of_syllable = None
+                    u = dequote_string(slot_value_in_double_colon_del_list(line, 'u'))
+                    try:
+                        cp = int(u, 16)
+                        s = chr(cp)
+                    except (ValueError, TypeError, OverflowError):
+                        # no ::u slot (u is None), not a hex number, or not a valid code point
+                        continue
+                    t = dequote_string(slot_value_in_double_colon_del_list(line, 'r'))
+                    if name := slot_value_in_double_colon_del_list(line, 'name'):
+                        self.dict_str[('name', s)] = name
+                    if pic := slot_value_in_double_colon_del_list(line, 'pic'):
+                        self.dict_str[('pic', s)] = pic
+                    if tone_mark := slot_value_in_double_colon_del_list(line, 'tone-mark'):
+                        self.dict_str[('tone-mark', s)] = tone_mark
+                    if syllable_info := slot_value_in_double_colon_del_list(line, 'syllable-info'):
+                        self.dict_str[('syllable-info', s)] = syllable_info
+                else:
+                    s = dequote_string(slot_value_in_double_colon_del_list(line, 's'))
+                    t = dequote_string(slot_value_in_double_colon_del_list(line, 't'))
+                    t_at_end_of_syllable = dequote_string(slot_value_in_double_colon_del_list(line,
+                                                                                              't-end-of-syllable'))
+                # The number is converted only once (so that a bad value is reported only once, with file and line).
+                # Values that cannot be converted, e.g. "1/2", are kept as strings in dict_num.
+                num_s = slot_value_in_double_colon_del_list(line, 'num')
+                num = robust_str_to_num(num_s, filename, line_number, silent=False)
+                if num_s is not None:
+                    self.dict_num[s] = (num_s if (num is None) else num)
+                is_minus_sign = has_value_in_double_colon_del_list(line, 'is-minus-sign')
+                is_plus_sign = has_value_in_double_colon_del_list(line, 'is-plus-sign')
+                is_decimal_point = has_value_in_double_colon_del_list(line, 'is-decimal-point')
+                is_large_power = has_value_in_double_colon_del_list(line, 'is-large-power')
+                fraction_connector = slot_value_in_double_colon_del_list(line, 'fraction-connector')
+                percentage_marker = slot_value_in_double_colon_del_list(line, 'percentage-marker')
+                int_frac_connector = slot_value_in_double_colon_del_list(line, 'int-frac-connector')
+                lcode_s = slot_value_in_double_colon_del_list(line, 'lcode')
+                lcodes = regex.split(r'[,;]\s*', lcode_s) if lcode_s else []
+                use_only_at_start_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-start-of-word')
+                dont_use_at_start_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-start-of-word')
+                use_only_at_end_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-end-of-word')
+                dont_use_at_end_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-end-of-word')
+                use_only_for_whole_word = has_value_in_double_colon_del_list(line, 'use-only-for-whole-word')
+                t_alt_s = slot_value_in_double_colon_del_list(line, 't-alt')
+                t_alts = regex.split(r'[,;]\s*', t_alt_s) if t_alt_s else []
+                t_alts = list(map(dequote_string, t_alts))
+                t_mod, _ = self.second_rom_filter(s, t, None)
+                if t_mod and (t_mod != t):
+                    t = t_mod
+                if s is None:
+                    continue
+                bool_flags = {'is-large-power': is_large_power, 'is-minus-sign': is_minus_sign,
+                              'is-plus-sign': is_plus_sign, 'is-decimal-point': is_decimal_point}
+                for bool_key, bool_value in bool_flags.items():
+                    if bool_value:
+                        self.dict_bool[(bool_key, s)] = True
+                # NOTE(review): the is_... flags are booleans (never None), so this condition always holds
+                # and every line with a source string yields a rule. Confirm that this is intended.
+                if any_not_none(t, num, is_minus_sign, is_plus_sign, is_decimal_point, is_large_power,
+                                fraction_connector, percentage_marker, int_frac_connector):
+                    self.register_s_prefix(s)
+                    n_entries += 1
+                    # if regex.match(r'[\u2800-\u28FF]', s): print("Braille", s, t)
+                    restrictions = [lcodes, use_only_at_start_of_word, dont_use_at_start_of_word,
+                                    use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word]
+                    n_restrictions = len([restr for restr in restrictions if restr])
+                    provenance2 = provenance
+                    if (t is None) and (num is not None) and (provenance2 == "rom"):
+                        provenance2 = "num"
+                    new_rom_rule = RomRule(s=s, t=t, prov=provenance2, lcodes=lcodes, t_alts=t_alts, num=num,
+                                           use_only_at_start_of_word=use_only_at_start_of_word,
+                                           dont_use_at_start_of_word=dont_use_at_start_of_word,
+                                           use_only_at_end_of_word=use_only_at_end_of_word,
+                                           dont_use_at_end_of_word=dont_use_at_end_of_word,
+                                           use_only_for_whole_word=use_only_for_whole_word,
+                                           t_at_end_of_syllable=t_at_end_of_syllable,
+                                           n_restr=n_restrictions,
+                                           is_minus_sign=is_minus_sign,
+                                           is_plus_sign=is_plus_sign,
+                                           is_decimal_point=is_decimal_point,
+                                           fraction_connector=fraction_connector,
+                                           percentage_marker=percentage_marker,
+                                           int_frac_connector=int_frac_connector,
+                                           is_large_power=is_large_power)
+                    old_rom_rules = self.rom_rules[s]
+                    if ((len(old_rom_rules) == 1) and (old_rom_rules[0]['prov'] in ('ud', 'ow'))
+                            and not n_restrictions):
+                        self.rom_rules[s] = [new_rom_rule]  # overwrite
+                    else:
+                        self.rom_rules[s].append(new_rom_rule)
+        # Thai
+        thai_cancellation_mark = '\u0E4C'
+        # cancellation applies to preceding letter incl. any vowel modifier letter (e.g. ศักดิ์สิทธิ์ -> saksit)
+        for cp in range(0x0E01, 0x0E4C):   # Thai
+            c = chr(cp)
+            s = c + thai_cancellation_mark
+            new_rom_rule = RomRule(s=s, t='', prov='auto cancel letter')
+            if not self.rom_rules[s]:
+                self.rom_rules[s] = [new_rom_rule]
+                self.register_s_prefix(s)
+        thai_consonants = list(map(chr, range(0x0E01, 0x0E2F)))
+        thai_vowel_modifiers = ['\u0E31', '\u0E47'] + list(map(chr, range(0x0E33, 0x0E3B)))
+        for c1 in thai_consonants:
+            for v in thai_vowel_modifiers:
+                s = c1 + v + thai_cancellation_mark
+                new_rom_rule = RomRule(s=s, t='', prov='auto cancel syllable')
+                if not self.rom_rules[s]:
+                    self.rom_rules[s] = [new_rom_rule]
+                    self.register_s_prefix(s)
+        if load_log:
+            sys.stderr.write(f'Loaded {n_entries} from {filename}\n')
+
+    def load_script_file(self, filename: str, load_log: bool = True):
+        """Reads in (typically from Scripts.txt) information about various scripts such as Devanagari,
+        incl. information such as the default abugida vowel letter (e.g. "a").
+
+        filename: file to read (as UTF-8); if it does not exist, a message is written to stderr and
+            nothing is loaded
+        load_log: if true, the number of loaded script descriptions is reported on stderr
+        Each script is registered in self.scripts under its lower case name and its lower case
+        alternative names; duplicates are reported on stderr and ignored."""
+        n_entries, max_n_script_name_components = 0, 0
+        try:
+            # do not depend on the locale's default encoding
+            f = open(filename, encoding='utf-8')
+        except FileNotFoundError:
+            sys.stderr.write(f'Cannot open file {filename}\n')
+            return
+        with f:
+            for line_number, line in enumerate(f, 1):
+                if line.startswith('#'):
+                    continue
+                if regex.match(r'^\s*$', line):  # blank line
+                    continue
+                line = regex.sub(r'\s{2,}#.*$', '', line)  # remove trailing comment
+                if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
+                    lc_script_name = script_name.lower()
+                    if lc_script_name in self.scripts:
+                        sys.stderr.write(f'** Ignoring duplicate script "{script_name}" '
+                                         f'in line {line_number} of {filename}\n')
+                    else:
+                        n_entries += 1
+                        direction = slot_value_in_double_colon_del_list(line, 'direction')
+                        abugida_default_vowel_s = slot_value_in_double_colon_del_list(line,
+                                                                                      'abugida-default-vowel')
+                        abugida_default_vowels = regex.split(r'[,;]\s*', abugida_default_vowel_s) \
+                            if abugida_default_vowel_s else []
+                        alt_script_name_s = slot_value_in_double_colon_del_list(line, 'alt-script-name')
+                        alt_script_names = regex.split(r'[,;]\s*', alt_script_name_s) if alt_script_name_s else []
+                        language_s = slot_value_in_double_colon_del_list(line, 'language')
+                        languages = regex.split(r'[,;]\s*', language_s) if language_s else []
+                        new_script = Script(script_name=script_name, alt_script_names=alt_script_names,
+                                            languages=languages, direction=direction,
+                                            abugida_default_vowels=abugida_default_vowels)
+                        self.scripts[lc_script_name] = new_script
+                        for language in languages:
+                            self.dict_set[('scripts', language)].add(script_name)
+                        for alt_script_name in alt_script_names:
+                            lc_alt_script_name = alt_script_name.lower()
+                            if lc_alt_script_name in self.scripts:
+                                # report the offending alternative name (not the primary script name)
+                                sys.stderr.write(f'** Ignoring duplicate alternative script name '
+                                                 f'"{alt_script_name}" '
+                                                 f'in line {line_number} of {filename}\n')
+                            else:
+                                self.scripts[lc_alt_script_name] = new_script
+                    # number of words in the script name, e.g. 2 for "Old Hungarian"
+                    n_script_name_components = len(script_name.split())
+                    if n_script_name_components > max_n_script_name_components:
+                        max_n_script_name_components = n_script_name_components
+        if max_n_script_name_components:
+            self.dict_int['max_n_script_name_components'] = max_n_script_name_components
+        if load_log:
+            sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}'
+                             f' (max_n_scripts_name_components: {max_n_script_name_components})\n')
+
+    def extract_script_name(self, script_name_plus: str, full_char_name: str = None) -> str | None:
+        """Using the scripts loaded from Scripts.txt, find the script name at the start of (part of) a
+        Unicode character name, e.g. given "OLD HUNGARIAN CAPITAL LETTER A", extract "Old Hungarian".
+        The last word of the candidate is dropped repeatedly until the remainder is a known script.
+        Returns None if there is no such script, or if the candidate equals full_char_name."""
+        if full_char_name and script_name_plus == full_char_name:
+            return None
+        candidate = script_name_plus
+        while candidate:
+            lc_candidate = candidate.lower()
+            if lc_candidate in self.scripts:  # test membership first, so that no new entry is created
+                script = self.scripts[lc_candidate]
+                if script:
+                    script_name = script['script-name']
+                    if script_name:
+                        return script_name
+            candidate = regex.sub(r'\s*\S*\s*$', '', candidate)  # drop last word
+        return None
+
+    def load_unicode_data_props(self, filename: str, load_log: bool = True):
+        """Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt
+        and UnicodeDataPropsCJK.txt with a list of valid script-specific characters.
+
+        filename: file to read (as UTF-8); if it does not exist, a message is written to stderr and
+            nothing is loaded
+        load_log: if true, a summary of what was loaded is reported on stderr
+        For each line with a ::script-name, every character listed under ::char and ::numeral is mapped to
+        that script, and every character under ::vowel-sign, ::medial-consonant-sign and ::sign-virama
+        is flagged accordingly."""
+        n_script, n_script_char, n_script_vowel_sign, n_script_medial_consonant_sign, n_script_virama = 0, 0, 0, 0, 0
+        try:
+            # do not depend on the locale's default encoding
+            f = open(filename, encoding='utf-8')
+        except FileNotFoundError:
+            sys.stderr.write(f'Cannot open file {filename}\n')
+            return
+        with f:
+            for line_number, line in enumerate(f, 1):
+                if line.startswith('#'):
+                    continue
+                if regex.match(r'^\s*$', line):  # blank line
+                    continue
+                line = regex.sub(r'\s{2,}#.*$', '', line)  # remove trailing comment
+                if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
+                    n_script += 1
+                    # A slot value is a string of characters; a missing slot yields the empty list.
+                    for slot in ('char', 'numeral'):
+                        for char in slot_value_in_double_colon_del_list(line, slot, []):
+                            self.dict_str[('script', char)] = script_name
+                            n_script_char += 1
+                    for char in slot_value_in_double_colon_del_list(line, 'vowel-sign', []):
+                        self.dict_bool[('is-vowel-sign', char)] = True
+                        n_script_vowel_sign += 1
+                    for char in slot_value_in_double_colon_del_list(line, 'medial-consonant-sign', []):
+                        self.dict_bool[('is-medial-consonant-sign', char)] = True
+                        n_script_medial_consonant_sign += 1
+                    for char in slot_value_in_double_colon_del_list(line, 'sign-virama', []):
+                        self.dict_bool[('is-virama', char)] = True
+                        n_script_virama += 1
+        if load_log:
+            sys.stderr.write(f'Loaded from {filename} mappings of {n_script_char:,d} characters '
+                             f'to {n_script} script{"" if n_script == 1 else "s"}')
+            if n_script_vowel_sign or n_script_virama or n_script_medial_consonant_sign:
+                sys.stderr.write(f', with a total of {n_script_vowel_sign} vowel signs, '
+                                 f'{n_script_medial_consonant_sign} medial consonant signs '
+                                 f'and {n_script_virama} viramas')
+            sys.stderr.write('.\n')
+
+    def load_num_props(self, filename: str, load_log: bool = True):
+        """Loads number properties from a file with one JSON dictionary per line, e.g.
+        {"txt": "\\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
+
+        filename: file to read (as UTF-8); if it does not exist, a message is written to stderr and
+            nothing is loaded
+        load_log: if true, the number of loaded entries is reported on stderr
+        Each dictionary is stored in self.num_props under its "txt" value; a true "is-large-power"
+        is also recorded in self.dict_bool. Lines that are not valid JSON, not a dictionary, or that
+        lack "txt" are reported on stderr and skipped."""
+        n_entries = 0
+        try:
+            # do not depend on the locale's default encoding
+            f = open(filename, encoding='utf-8')
+        except FileNotFoundError:
+            sys.stderr.write(f'Cannot open file {filename}\n')
+            return
+        with f:
+            for line_number, line in enumerate(f, 1):
+                if line.startswith('#'):
+                    continue
+                if regex.match(r'^\s*$', line):  # blank line
+                    continue
+                try:
+                    d = json.loads(line)
+                except json.JSONDecodeError:
+                    sys.stderr.write(f'Invalid json in l.{line_number} in file {filename}: {line.strip()}\n')
+                    continue
+                if not isinstance(d, dict):
+                    sys.stderr.write(f'json in l.{line_number} in file {filename} not a dict: {line.strip()}\n')
+                    continue
+                txt = d.get('txt')
+                if not txt:
+                    sys.stderr.write(f'Missing txt in l.{line_number} in file {filename}: {line.strip()}\n')
+                    continue  # without txt, there is no key to register any property under
+                self.num_props[txt] = d
+                n_entries += 1
+                for bool_key in ('is-large-power',):
+                    if d.get(bool_key):
+                        self.dict_bool[(bool_key, txt)] = True
+        if load_log:
+            sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')
+
+    @staticmethod
+    def de_accent_pinyin(s: str) -> str:
+        """De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt).
+        A character with a Unicode decomposition is replaced by the single letter of that decomposition;
+        if the decomposition cannot be decoded or does not contain exactly one letter, this is reported on
+        stderr and the character is dropped. Characters without decomposition are kept as they are."""
+        de_accented_chars = []
+        for char in s:
+            decomp = ud.decomposition(char).split()
+            if not decomp:
+                de_accented_chars.append(char)
+                continue
+            try:
+                components = [chr(int(hex_code, 16)) for hex_code in decomp]
+                letters = [comp for comp in components if ud.category(comp).startswith('L')]
+            except ValueError:
+                sys.stderr.write(f'Cannot decode {decomp}\n')
+                continue
+            if len(letters) != 1:
+                sys.stderr.write(f'Cannot decode {decomp} (expected 1 letter)\n')
+                continue
+            de_accented_chars.append(letters[0])
+        return ''.join(de_accented_chars).replace('ü', 'u')
+
+    def register_s_prefix(self, s: str):
+        """Flag every non-empty prefix of s (including s itself) in self.dict_bool under ('s-prefix', prefix)."""
+        for prefix_end, _ in enumerate(s, 1):
+            prefix = s[:prefix_end]
+            self.dict_bool[('s-prefix', prefix)] = True
+
+    def load_chinese_pinyin_file(self, filename: str, load_log: bool = True):
+        """Loads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form.
+
+        filename: file to read (as UTF-8); if it does not exist, a message is written to stderr and
+            nothing is loaded
+        load_log: if true, the number of loaded entries is reported on stderr
+        Each line is expected to consist of two whitespace-separated fields: Chinese and pinyin.
+        The de-accented pinyin is added as a rule with provenance 'rom pinyin'; other lines are
+        reported on stderr and skipped."""
+        n_entries = 0
+        try:
+            # do not depend on the locale's default encoding
+            f = open(filename, encoding='utf-8')
+        except FileNotFoundError:
+            sys.stderr.write(f'Cannot open file {filename}\n')
+            return
+        with f:
+            for line_number, line in enumerate(f, 1):
+                if line.startswith('#'):
+                    continue
+                if regex.match(r'^\s*$', line):  # blank line
+                    continue
+                try:
+                    chinese, pinyin = line.rstrip().split()
+                    rom = self.de_accent_pinyin(pinyin)
+                except ValueError:  # line does not have exactly two fields
+                    sys.stderr.write(f'Cannot process line {line_number} in file {filename}: {line}')
+                else:
+                    new_rom_rule = RomRule(s=chinese, t=rom, prov='rom pinyin', lcodes=[])
+                    self.rom_rules[chinese].append(new_rom_rule)
+                    self.register_s_prefix(chinese)
+                    n_entries += 1
+        if load_log:
+            sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')
+
+    @staticmethod
+    def add_char_to_rebuild_unicode_data_dict(d: dict, script_name: str, prop_class: str, char: str):
+        """Record in d that char belongs to prop_class of script_name: the script name is added to the set
+        d['script-names'] and the character is appended to the list d[(script_name, prop_class)]."""
+        d['script-names'].add(script_name)
+        d.setdefault((script_name, prop_class), []).append(char)
+
+    def rebuild_unicode_data_props(self, out_filename: str, cjk: str = None, hangul: str = None):
+        """This functions rebuilds UnicodeDataProps*.txt This might be useful when a new UnicodeData.txt
+        version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt
+        Regular users normally never have to call this function."""
+        d = {'script-names': set()}
+        n_script_refs = 0
+        codepoint = -1
+        prop_classes = {'char'}
+        while codepoint < 0xF0000:
+            codepoint += 1
+            c = chr(codepoint)
+            if not (char_name := self.chr_name(c)):
+                continue
+            for prop_name_comp2 in ('VOWEL SIGN',
+                                    ('MEDIAL CONSONANT SIGN', 'CONSONANT SIGN MEDIAL', 'CONSONANT SIGN SHAN MEDIAL',
+                                     'CONSONANT SIGN MON MEDIAL'),
+                                    ('SIGN VIRAMA', 'SIGN ASAT', 'AL-LAKUNA', 'SIGN COENG', 'SIGN PAMAAEH',
+                                     'CHARACTER PHINTHU'),
+                                    ('NUMERAL', 'NUMBER', 'DIGIT', 'FRACTION')):
+                if prop_name_comp2 and isinstance(prop_name_comp2, tuple):
+                    prop_list = prop_name_comp2
+                else:
+                    prop_list = (prop_name_comp2,)
+                for prop_name_comp in prop_list:
+                    prop_class = prop_list[0].lower().replace(' ', '-')
+                    if prop_class not in prop_classes:
+                        prop_classes.add(prop_class)
+                    script_name_cand = regex.sub(fr'\s+{prop_name_comp}\b.*$', '', char_name)
+                    if script_name := self.extract_script_name(script_name_cand, char_name):
+                        self.add_char_to_rebuild_unicode_data_dict(d, script_name, prop_class, c)
+            script_name_cand = regex.sub(r'\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|'
+                                         r'IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|'
+                                         r'SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|'
+                                         r'AU LENGTH MARK)\b.*$', '',
+                                         char_name)
+            if script_name := self.extract_script_name(script_name_cand, char_name):
+                self.add_char_to_rebuild_unicode_data_dict(d, script_name, 'char', c)
+                n_script_refs += 1
+        # print(sorted(d['script-names']))
+        prop_classes = sorted(prop_classes)
+        out_filenames = [x for x in [out_filename, cjk, hangul] if x]
+        cjk2 = cjk if cjk else out_filename
+        hangul2 = hangul if hangul else out_filename
+        for out_file in out_filenames:
+            try:
+                f_out = open(out_file, 'w')
+            except OSError:
+                sys.stderr.write(f'Cannot write to file {out_file}\n')
+                continue
+            with f_out:
+                for script_name in sorted(d['script-names']):
+                    if script_name == 'CJK':
+                        if out_file != cjk2:
+                            continue
+                    elif script_name == 'Hangul':
+                        if out_file != hangul2:
+                            continue
+                    else:
+                        if out_file != out_filename:
+                            continue
+                    prop_components = [f"::script-name {script_name}"]
+                    for prop_class in prop_classes:
+                        key = (script_name, prop_class)
+                        if key in d:
+                            if chars := ''.join(d[key]):
+                                if prop_class in ('char',):
+                                    prop_components.append(f"::n-{prop_class} {len(chars)}")
+                                prop_components.append(f"::{prop_class} {chars}")
+                    f_out.write(f"{' '.join(prop_components)}\n")
+        sys.stderr.write(f"Rebuilt {out_filenames} with {n_script_refs} characters "
+                         f"for {len(d['script-names'])} scripts.\n")
+
+    def rebuild_num_props(self, out_filename: str, err_filename: str) -> None:
+        """Rebuilds the numeric-properties table as JSON lines (one JSON object per character).
+
+        Scans codepoints U+0000..U+F0000 and, for each character with a numeric value, writes a dict with
+        keys such as 'txt', 'rom', 'value', 'fraction', 'type', 'base', 'mult', 'script'
+        (keys with None values are presumably omitted by add_non_none_to_dict -- verify against that helper).
+        Entries whose type starts with 'other' are written to err_filename (always including 'name'),
+        all other entries to out_filename. Characters whose Unicode name contains SUPERSCRIPT, SUBSCRIPT,
+        CIRCLED, PARENTHESIZED, SEGMENTED, MATHEMATICAL, ROMAN NUMERAL, FULL STOP or COMMA are skipped.
+        A summary is written to stderr."""
+        n_out, n_err = 0, 0
+        with open(out_filename, 'w') as f_out, open(err_filename, 'w') as f_err:
+            codepoint = -1
+            while codepoint < 0xF0000:
+                codepoint += 1
+                char = chr(codepoint)
+                num = first_non_none(ud_numeric(char),  # robust ud.numeric
+                                     self.num_value(char))  # uroman table includes extra num values, e.g. for Egyptian
+                if num is None:
+                    continue
+                result_dict = {}
+                orig_txt = char
+                value: int | float | None = None  # non-fraction-value(3 1/2) = 3
+                fraction: Fraction | None = None  # fraction(3 1/2) = Fraction(1, 2)
+                num_base = None  # num_base(500) = 100
+                base_multiplier = None  # base_multiplier(500) = 5
+                script = None
+                is_large_power = self.dict_bool[('is-large-power', char)]
+                # num_base is typically a power of 10: 1, 10, 100, 1000, 10000, 100000, 1000000, ...
+                # exceptions might include 12 for the 'dozen' in popular English 'two dozen and one' (2*12+1=25)
+                # exceptions might include 20 for the 'score' in archaic English 'four score and seven' (4*20+7=87)
+                # exceptions might include 20 for the 'vingt' as in standard French 'quatre-vingt-treize' (4*20+13=93)
+                if script_name := self.chr_script_name(char):
+                    script = script_name
+                elif char in '0123456789':
+                    script = 'ascii-digit'
+                name = self.chr_name(char)
+                exclude_from_number_processing = False
+                # Decorated/presentation forms are excluded from the table altogether.
+                # NOTE(review): 'script' set in this loop is never used, since the character is skipped below.
+                for scrypt_type in ('SUPERSCRIPT', 'SUBSCRIPT',
+                                    'CIRCLED', 'PARENTHESIZED', 'SEGMENTED', 'MATHEMATICAL', 'ROMAN NUMERAL',
+                                    'FULL STOP', 'COMMA'):
+                    if scrypt_type in name:
+                        script = '*' + scrypt_type.lower().replace(' ', '-')
+                        exclude_from_number_processing = True
+                        break
+                # Vulgar fractions get their own pseudo-script label ('vulgar-fraction').
+                for scrypt_type in ('VULGAR FRACTION',):
+                    if scrypt_type in name:
+                        script = scrypt_type.lower().replace(' ', '-')
+                        break
+                if exclude_from_number_processing:
+                    continue
+                if isinstance(num, int):
+                    value = num
+                    if 0 <= num <= 9:
+                        num_base = 1
+                        base_multiplier = num
+                        if "DIGIT" in name:
+                            num_type = 'digit'
+                        else:
+                            # Chinese numbers 零 (0), 一 (1), ... 九 (9) have numeric values,
+                            # but are NOT (full) digits
+                            num_type = 'digit-like'
+                    # Split the decimal representation into leading part and trailing zeros: 500 -> '5', '00'
+                    elif m := regex.match(r'([0-9]+?)(0*)$', str(num)):
+                        base_multiplier = int(m.group(1))  # non_base_value(500) = 5
+                        num_base = int('1' + m.group(2))
+                        num_type = 'base' if base_multiplier == 1 else 'multi'
+                    else:
+                        num_type = 'other-int'  # Do such cases exist?
+                elif ("FRACTION" in name) and (fraction := fraction_char2fraction(char, num, self)):
+                    fraction = fraction  # no-op: the walrus expression above already bound 'fraction'
+                    num_type = 'fraction'
+                else:
+                    num_type = 'other-num'  # Do such cases exist? Yes. Bengali currency numerators, ...
+                # 'rom' is "<value> <numerator>/<denominator>" (parts as available), else the original character
+                value_s = '' if value is None else str(value)
+                fraction_s = '' if fraction is None else f'{fraction.numerator}/{fraction.denominator}'
+                fraction_list = None if fraction is None else [fraction.numerator, fraction.denominator]
+                delimiter_s = ' ' if value_s and fraction_s else ''
+                rom = (value_s + delimiter_s + fraction_s) or orig_txt
+                add_non_none_to_dict(result_dict, 'txt', orig_txt)
+                add_non_none_to_dict(result_dict, 'rom', rom)
+                add_non_none_to_dict(result_dict, 'value', value)
+                add_non_none_to_dict(result_dict, 'fraction', fraction_list)
+                add_non_none_to_dict(result_dict, 'type', num_type)
+                if is_large_power:
+                    result_dict['is-large-power'] = True
+                add_non_none_to_dict(result_dict, 'base', num_base)
+                add_non_none_to_dict(result_dict, 'mult', base_multiplier)
+                add_non_none_to_dict(result_dict, 'script', script)
+                if num_type.startswith('other'):
+                    # Unclassified numeric characters go to the rejects file, with their name for inspection.
+                    add_non_none_to_dict(result_dict, 'name', name)
+                    f_err.write(json.dumps(result_dict) + '\n')
+                    n_err += 1
+                else:
+                    # The name is only added when no script could be determined.
+                    if not script:
+                        add_non_none_to_dict(result_dict, 'name', name)
+                    f_out.write(json.dumps(result_dict) + '\n')
+                    n_out += 1
+        # NOTE(review): 'codepoint' here is the last codepoint visited (0xF0000), not an exact count.
+        sys.stderr.write(f'Processed {codepoint} codepoints,\n  wrote {n_out} lines to {out_filename}\n'
+                         f'    and {n_err} lines to {err_filename}\n')
+
+    def load_resource_files(self, data_dir: Path, load_log: bool = False,
+                            rebuild_ud_props: bool = False, rebuild_num_props: bool = False):
+        """Loads all resource files needed for romanization."""
+        data_dir = data_dir
+        if not isinstance(data_dir, pathlib.Path):
+            sys.stderr.write(f'Error: data_dir is of {type(data_dir)}, not a Path.\n'
+                             f'       Cannot load any resource files.\n')
+            return
+        self.load_rom_file(os.path.join(data_dir, "romanization-auto-table.txt"),
+                           'ud', file_format='rom', load_log=load_log)
+        self.load_rom_file(os.path.join(data_dir, "UnicodeDataOverwrite.txt"),
+                           'ow', file_format='u2r', load_log=load_log)
+        self.load_rom_file(os.path.join(data_dir, "romanization-table.txt"),
+                           'man', file_format='rom', load_log=load_log)
+        self.load_chinese_pinyin_file(os.path.join(data_dir, "Chinese_to_Pinyin.txt"), load_log=load_log)
+        self.load_script_file(os.path.join(data_dir, "Scripts.txt"), load_log=load_log)
+        self.load_num_props(os.path.join(data_dir, "NumProps.jsonl"), load_log=load_log)
+        for base_file in ("UnicodeDataProps.txt", "UnicodeDataPropsCJK.txt", "UnicodeDataPropsHangul.txt"):
+            self.load_unicode_data_props(os.path.join(data_dir, base_file), load_log=load_log)
+        if rebuild_ud_props:
+            self.rebuild_unicode_data_props(os.path.join(data_dir, "UnicodeDataProps.txt"),
+                                            cjk=os.path.join(data_dir, "UnicodeDataPropsCJK.txt"),
+                                            hangul=os.path.join(data_dir, "UnicodeDataPropsHangul.txt"))
+        if rebuild_num_props:
+            self.rebuild_num_props(os.path.join(data_dir, "NumProps.jsonl"),
+                                   os.path.join(data_dir, "NumPropsRejects.jsonl"))
+
+    def unicode_hangul_romanization(self, s: str, pass_through_p: bool = False):
+        """Special algorithmic solution to convert (Korean) Hangul characters to the Latin alphabet."""
+        if cached_rom := self.hangul_rom.get(s, None):
+            return cached_rom
+        leads = "g gg n d dd r m b bb s ss - j jj c k t p h".split()
+        vowels = "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i".split()
+        tails = "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h".split()
+        result = ""
+        for c in s:
+            cp = ord(c)
+            if 0xAC00 <= cp <= 0xD7A3:
+                code = cp - 0xAC00
+                lead_index = int(code / (28 * 21))
+                vowel_index = int(code / 28) % 21
+                tail_index = code % 28
+                rom = leads[lead_index] + vowels[vowel_index] + tails[tail_index]
+                rom = rom.replace('-', '')
+                self.hangul_rom[c] = rom
+                result += rom
+            elif pass_through_p:
+                result += c
+        return result
+
+    @staticmethod
+    def char_is_nonspacing_mark(s) -> bool:
+        """ Checks whether a character is a nonspacing mark, e.g. combining accents, points, vowel signs"""
+        return (len(s) == 1) and (ud.category(s) == 'Mn')
+
+    @staticmethod
+    def char_is_format_char(s) -> bool:
+        """ Checks whether a character is a formatting character, e.g. a zero-with joiner/non-joiner"""
+        return (len(s) == 1) and (ud.category(s) == 'Cf')
+
+    @staticmethod
+    def char_is_space_separator(s) -> bool:
+        """ Checks whether a character is a space,
+            e.g. ' ', non-breakable space, en space, ideographic (Chinese) space, Ogham space mark
+            but excluding \t, \r, \n"""
+        return (len(s) == 1) and (ud.category(s) == 'Zs')
+
+    def chr_name(self, char: str) -> str:
+        try:
+            return ud.name(char)
+        except (ValueError, TypeError):
+            if name := self.dict_str[('name', char)]:
+                return name
+        return ''
+
+    def num_value(self, s: str) -> int | float | Fraction | None:
+        """rom_rules include numeric values beyond UnicodeData.txt, e.g. for Egyptian numerals"""
+        for rom_rule in self.rom_rules[s]:
+            if (num := rom_rule['num']) is not None:
+                return num
+        return None
+
+    def rom_rule_value(self, s: str, key: str):
+        for rom_rule in self.rom_rules[s]:
+            if (value := rom_rule.get(key)) is not None:
+                return value
+        return None
+
+    def unicode_float2fraction(self, num: float, precision: float = 0.000001) -> Tuple[int, int] | None:
+        """only for common unicode fractions"""
+        if chached_value := self.float2fraction.get(num, None):
+            return chached_value
+        for numerator in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11):
+            for denominator in (2, 3, 4, 5, 6, 8, 12, 16, 20, 32, 40, 64, 80, 160, 320):
+                if abs(numerator / denominator - num) < precision:
+                    result = numerator, denominator
+                    self.float2fraction[num] = result
+                    return result
+        return None
+
+    def chr_script_name(self, char: str) -> str:
+        """For letters, diacritics, numerals etc."""
+        return self.dict_str[('script', char)]
+
+    def test_output_of_selected_scripts_and_rom_rules(self):
+        """Low level test function that checks and displays romanization information."""
+        output = ''
+        for s in ("Oriya", "Chinese"):
+            d = self.scripts[s.lower()]
+            output += f'SCRIPT {s} {d}\n'
+        for s in ('ƿ', 'β', 'и', 'μπ', '⠹', '亿', 'ちょ', 'и', '𓍧', '正', '分之', 'ऽ', 'ศ', 'ด์'):
+            d = self.rom_rules[s]
+            output += f'DICT {s} {d}\n'
+        for s in ('ƿ', 'β', 'न', 'ु'):
+            output += f'SCRIPT-NAME {s} {self.chr_script_name(s)}\n'
+        for s in ('万', '\uF8F7', '\U00013368', '\U0001308B', '\u0E48', '\u0E40'):
+            name = self.chr_name(s)
+            num = self.dict_num[s]
+            pic = self.dict_str[('pic', s)]
+            tone_mark = self.dict_str[('tone-mark', s)]
+            syllable_info = self.dict_str[('syllable-info', s)]
+            is_large_power = self.dict_bool[('is-large-power', s)]
+            output += f'PROPS {s}'
+            if name:
+                output += f'  name: {name}'
+            if num:
+                output += f'  num: {num} ({type(num).__name__})'
+            if pic:
+                output += f'  pic: {pic}'
+            if tone_mark:
+                output += f'  tone-mark: {tone_mark}'
+            if syllable_info:
+                output += f'  syllable-info: {syllable_info}'
+            if is_large_power:
+                output += f'  is-large-power: {is_large_power}'
+            output += '\n'
+        mayan12 = '\U0001D2EC'
+        egyptian600 = '𓍧'
+        runic90 = '𐍁'
+        klingon2 = '\uF8F2'
+        for offset, c in enumerate(f'9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵{runic90}{mayan12}{egyptian600}{klingon2}'):
+            output += f'NUM-EDGE: {NumEdge(offset, offset+1, c, self)}\n'
+        for s in ('\u00bc', '\u0968'):
+            output += f'NUM-PROPS: {self.num_props[s]}\n'
+        print(output)
+
+    def test_romanization(self, **args):
+        """A few full cases of romanization testing."""
+        tests = [('ألاسكا', None), ('यह एक अच्छा अनुवाद है.', 'hin'), ('ちょっとまってください', 'kor'),
+                 ('Μπανγκαλόρ', 'ell'), ('Зеленський', 'ukr'), ('കേരളം', 'mal')]
+        for test in tests:
+            s = test[0]
+            lcode = test[1] if len(test) >= 2 else None
+            rom = self.romanize_string(s, lcode=lcode, **args)
+            sys.stderr.write(f'ROM {s} -> {rom}\n')
+        n_alerts = 0
+        codepoint = -1
+        while codepoint < 0xF0000:
+            codepoint += 1
+            c = chr(codepoint)
+            rom = self.romanize_string(c)
+            if regex.search(r'\s', rom) and regex.search(r'\S', rom):
+                name = self.chr_name(c)
+                sys.stderr.write(f'U+{codepoint:04X} {c} {name}  {rom}\n')
+                n_alerts += 1
+        sys.stderr.write(f'{n_alerts} alerts for roms with spaces\n')
+
+    def romanize_file(self, input_filename: str | None = None, output_filename: str | None = None,
+                      lcode: str | None = None, direct_input: List[str] = None, **args):
+        """Script to apply romanization to an entire file. Input and output files needed.
+        Language code (lcode) recommended."""
+        f_in_to_be_closed, f_out_to_be_closed = False, False
+        if direct_input and (input_filename is None):
+            f_in = direct_input  # list of lines
+        elif isinstance(input_filename, str):
+            try:
+                f_in = open(input_filename)
+                f_in_to_be_closed = True
+            except OSError:
+                sys.stderr.write(f'Error in romanize_file: Cannot open file {input_filename}\n')
+                f_in = None
+        elif input_filename is None:
+            f_in = sys.stdin
+        else:
+            sys.stderr.write(f"Error in romanize_file: argument 'input_filename' {input_filename} "
+                             f"is of wrong type: {type(input_filename)} (should be str)\n")
+            f_in = None
+        if isinstance(output_filename, str):
+            try:
+                f_out = open(str(output_filename), 'w')
+                f_out_to_be_closed = True
+            except OSError:
+                sys.stderr.write(f'Error in romanize_file: Cannot write to file {output_filename}\n')
+                f_out = None
+        elif output_filename is None:
+            f_out = sys.stdout
+        else:
+            sys.stderr.write(f"Error in romanize_file: argument 'output_filename' {output_filename} "
+                             f"is of wrong type: {type(output_filename)} (should be str)\n")
+            f_out = None
+        if f_in and f_out:
+            max_lines = args.get('max_lines', None)
+            progress_dots_output = False
+            for line_number, line in enumerate(f_in, 1):
+                if m := regex.match(r'(::lcode\s+)([a-z]{3})(\s+)(.*?)\s*$', line):
+                    lcode_kw, lcode2, space, snt = m.group(1, 2, 3, 4)
+                    rom_result = self.romanize_string(snt, lcode2 or lcode, **args)
+                    if args.get('rom_format', None) == RomFormat.STR:
+                        lcode_prefix = f"{lcode_kw}{lcode2}{space}"
+                        f_out.write(lcode_prefix + rom_result + '\n')
+                    else:
+                        lcode_prefix = f'[0, 0, "", "lcode: {lcode2}"]'  # meta edge with lcode info
+                        prefixed_edges = [lcode_prefix] + self.romanize_string(snt, lcode2 or lcode, **args)
+                        f_out.write(Edge.json_str(prefixed_edges) + '\n')
+                else:
+                    f_out.write(Edge.json_str(self.romanize_string(line.rstrip(), lcode, **args)) + '\n')
+                if not args.get('silent'):
+                    if line_number % 100 == 0:
+                        if line_number % 1000 == 0:
+                            sys.stderr.write(str(line_number))
+                        else:
+                            sys.stderr.write('.')
+                        progress_dots_output = True
+                        sys.stderr.flush()
+                        gc.collect()
+                if max_lines and line_number >= max_lines:
+                    break
+            if progress_dots_output:
+                sys.stderr.write('\n')
+                sys.stderr.flush()
+        if f_in_to_be_closed:
+            f_in.close()
+        if f_out_to_be_closed:
+            f_out.close()
+
+    @staticmethod
+    def apply_any_offset_to_cached_rom_result(cached_rom_result: str | List[Edge], offset: int = 0) \
+            -> str | List[Edge]:
+        if isinstance(cached_rom_result, str):
+            return cached_rom_result
+        elif offset == 0:
+            return cached_rom_result
+        else:
+            return [Edge(edge.start + offset, edge.end + offset, edge.txt, edge.type) for edge in cached_rom_result]
+
+    def romanize_string_core(self, s: str, lcode: str | None, rom_format: RomFormat, cache_p: bool,
+                             offset: int = 0, **args) -> str | List[Edge]:
+        """Script to support token-by-token romanization with caching for higher speed."""
+        if cache_p:
+            cached_rom = self.rom_cache.get((s, lcode, rom_format), None)
+            if cached_rom is not None:
+                return self.apply_any_offset_to_cached_rom_result(cached_rom, offset)
+        lat = Lattice(s, uroman=self, lcode=lcode)
+        lat.pick_tibetan_vowel_edge(**args)
+        lat.prep_braille(**args)
+        lat.add_romanization(**args)
+        lat.add_numbers(self, **args)
+        lat.add_braille_numbers(**args)
+        lat.add_rom_fall_back_singles(**args)
+        if rom_format == RomFormat.LATTICE:
+            all_edges = lat.all_edges(0, len(s))
+            lat.add_alternatives(all_edges)
+            if cache_p:
+                self.rom_cache[(s, lcode, rom_format)] = all_edges
+            result = self.apply_any_offset_to_cached_rom_result(all_edges, offset)
+        else:
+            best_edges = lat.best_rom_edge_path(0, len(s))
+            if rom_format in (RomFormat.EDGES, RomFormat.ALTS):
+                if rom_format == RomFormat.ALTS:
+                    lat.add_alternatives(best_edges)
+                if cache_p:
+                    self.rom_cache[(s, lcode, rom_format)] = best_edges
+                result = self.apply_any_offset_to_cached_rom_result(best_edges, offset)
+            else:
+                rom = lat.edge_path_to_surf(best_edges)
+                del lat
+                if cache_p:
+                    self.rom_cache[(s, lcode, rom_format)] = rom
+                result = rom
+        return result
+
+    def romanize_string(self, s: str, lcode: str | None = None, rom_format: RomFormat = RomFormat.STR, **args) \
+            -> str | List[Edge]:
+        """Main entry point for romanizing a string. Recommended argument: lcode (language code).
+        recursive only used for development.
+        Method returns a string or a list of edges (with start and end offsets)."""
+        lcode = lcode or args.get('lcode', None)
+        # print('rom::', s, 'lcode:', lcode, 'print-lattice:', print_lattice_p)
+
+        # with caching (for string format output only for now)
+        if cache_p := not args.get('no_caching', False):
+            rest, offset = s, 0
+            result = '' if rom_format == RomFormat.STR else []
+            while m3 := regex.match(r'(.*?)([.,; ]*[ 。][.,; ]*)(.*)$', rest):
+                pre, delimiter, rest = m3.group(1, 2, 3)
+                result += self.romanize_string_core(pre, lcode, rom_format, cache_p, offset, **args)
+                offset += len(pre)
+                result += self.romanize_string_core(delimiter, lcode, rom_format, cache_p, offset, **args)
+                offset += len(delimiter)
+            result += self.romanize_string_core(rest, lcode, rom_format, cache_p, offset, **args)
+            return result
+        else:
+            return self.romanize_string_core(s, lcode, rom_format, cache_p, 0, **args)
+
+
+class Edge:
+    """This class defines edges that span part of a sentence with a specific romanization.
+    There might be multiple edges for a given span. The edges in turn are part of the
+    romanization lattice."""
+    def __init__(self, start: int, end: int, s: str, annotation: str = None):
+        self.start = start
+        self.end = end
+        self.txt = s
+        self.type = annotation
+
+    def __str__(self):
+        return f'[{self.start}-{self.end}] {self.txt} ({self.type})'
+
+    def __repr__(self):
+        return str(self)
+
+    def json(self) -> str:  # start - end - text - annotation
+        return json.dumps([self.start, self.end, self.txt, self.type])
+
+    @staticmethod
+    def json_str(rom_result: List[Edge] | str) -> str:
+        if isinstance(rom_result, str):
+            return rom_result
+        else:
+            result = '['
+            for edge in rom_result:
+                if isinstance(edge, Edge):
+                    result += edge.json()
+                else:
+                    result += str(edge)
+            result += ']'
+            return result
+
+
+class NumEdge(Edge):
+    def __init__(self, start: int, end: int, s: str, uroman: Uroman | None, active: bool = False):
+        """For NumEdge, the s argument is in original language (not yet romanized)."""
+        # For speed, much of this processing should at some point be cached in data files.
+        Edge.__init__(self, start, end, s)
+        self.orig_txt, self.txt = s, s
+        self.value, self.fraction, self.num_base, self.base_multiplier = None, None, None, None
+        self.type, self.script, self.is_large_power, self.active = None, None, False, active
+        self.n_decimals = None
+        self.value_s = None     # precision for 3.14159265358979323846264338327950288419716939937510582097494
+        if start+1 == end:
+            char = s[0]
+            if d := uroman.num_props.get(char):
+                self.active = True
+                self.value = d.get('value')
+                fraction_list = d.get('fraction')
+                self.fraction = Fraction(fraction_list[0], fraction_list[1]) if fraction_list else None
+                self.num_base = d.get('base')
+                self.base_multiplier = d.get('mult')
+                self.type = d.get('type')
+                self.script = d.get('script')
+                self.is_large_power = d.get('is-large-power')
+                self.update()
+
+    def update(self,
+               value: int | float | None = None,
+               value_s: str | None = None,
+               fraction: Fraction | None = None,
+               n_decimals: int | None = None,
+               num_base: int | None = None,
+               base_multiplier: int | float | None = None,
+               script: str | None = None,
+               e_type: str | None = None,
+               orig_txt: str | None = None) -> str:
+        self.value = first_non_none(value, self.value)
+        self.value_s = first_non_none(value_s, self.value_s)
+        self.fraction = first_non_none(fraction, self.fraction)
+        self.n_decimals = first_non_none(n_decimals, self.n_decimals)
+        self.num_base = first_non_none(num_base, self.num_base)
+        self.base_multiplier = first_non_none(base_multiplier, self.base_multiplier)
+        self.script = first_non_none(script, self.script)
+        self.type = first_non_none(e_type, self.type)
+        self.orig_txt = first_non_none(orig_txt, self.orig_txt)
+        if self.value_s is not None:
+            value_s = self.value_s
+        elif self.value is None:
+            value_s = ''
+        elif isinstance(self.value, float) and (self.n_decimals is not None):
+            value_s = first_non_none(self.value_s, f'{self.value:0.{self.n_decimals}f}')
+        else:
+            value_s = str(self.value)
+        fraction_s = '' if self.fraction is None else f'{self.fraction.numerator}/{self.fraction.denominator}'
+        delimiter_s = ' ' if value_s and fraction_s else ''
+        self.txt = (value_s + delimiter_s + fraction_s) or self.orig_txt
+        return self.txt
+
+    def __str__(self):
+        if self.num_base is not None:
+            if self.base_multiplier is not None:
+                b_clause = f'{self.base_multiplier}*{self.num_base}'
+            else:
+                b_clause = str(self.num_base)
+        else:
+            b_clause = None
+        return (('' if self.active else ' *')
+                + f'[{self.start}-{self.end}] {self.orig_txt} R:{self.txt} T:{self.type}'
+                + (' LP' if self.is_large_power else '')
+                + (f' B:{b_clause}' if (b_clause is not None) else '')
+                + (f' V:{self.value}' if ((self.value is not None) and (str(self.value) != self.txt)) else '')
+                + (f' VS:{self.value_s}' if ((self.value_s is not None) and (self.value_s != self.txt)) else '')
+                + (f' F:.{self.n_decimals}f' if self.n_decimals else f'')
+                + (f' S:{self.script}' if self.script else ''))
+
+
+class Lattice:
+    """Lattice for a specific romanization instance. Has edges."""
+    def __init__(self, s: str, uroman: Uroman, lcode: str = None):
+        self.s = s
+        self.lcode = lcode
+        self.lattice = defaultdict(set)
+        self.max_vertex = len(s)
+        self.uroman = uroman
+        self.props = {}
+        self.simple_top_rom_cache = {}
+        self.contains_script = defaultdict(bool)
+        self.check_for_scripts()
+
+    def check_for_scripts(self):
+        for c in self.s:
+            script_name = self.uroman.chr_script_name(c)
+            self.contains_script[script_name] = True
+            if regex.search(r'[\u2800-\u28FF]', self.s):
+                self.contains_script['Braille'] = True
+
+    def add_edge(self, edge: Edge):
+        self.lattice[(edge.start, edge.end)].add(edge)
+        self.lattice[(edge.start, 'right')].add(edge.end)
+        self.lattice[(edge.end, 'left')].add(edge.start)
+
+    def __str__(self):
+        edges = []
+        for start in range(self.max_vertex):
+            for end in self.lattice[(start, 'right')]:
+                for edge in self.lattice[(start, end)]:
+                    edges.append(f'[{start}-{end}] {edge.txt} ({edge.type})')
+        return ' '.join(edges)
+
+    @staticmethod
+    def char_is_braille(c: str) -> bool:
+        return 0x2800 <= ord(c[0]) <= 0x28FF
+
+    # Help Tibet
+    def char_is_subjoined_letter(self, c: str) -> bool:
+        return "SUBJOINED LETTER" in self.uroman.chr_name(c)
+
+    def char_is_regular_letter(self, c: str) -> bool:
+        char_name = self.uroman.chr_name(c)
+        return ("LETTER" in char_name) and not ("SUBJOINED" in char_name)
+
+    def char_is_letter(self, c: str) -> bool:
+        return "LETTER" in self.uroman.chr_name(c)
+
+    def char_is_vowel_sign(self, c: str) -> bool:
+        return self.uroman.dict_bool[('is-vowel-sign', c)]
+
+    def char_is_letter_or_vowel_sign(self, c: str) -> bool:
+        return self.char_is_letter(c) or self.char_is_vowel_sign(c)
+
+    def is_at_start_of_word(self, position: int) -> bool:
+        # return not regex.match(r'(?:\pL|\pM)', self.s[position-1:position])
+        first_char = self.s[position]
+        first_char_is_braille = self.char_is_braille(first_char)
+        end = position
+        if (preceded_by_alpha := self.props.get(('preceded_by_alpha', end), None)) in (True, False):
+            return not preceded_by_alpha
+        for start in self.lattice[(end, 'left')]:
+            for edge in self.lattice[(start, end)]:
+                prev_letter = None if edge.txt == '' else edge.txt[-1]
+                if len(edge.txt) and (prev_letter.isalpha() or (first_char_is_braille and (prev_letter in ["'"]))):
+                    self.props[('preceded_by_alpha', position)] = True
+                    return False
+        self.props[('preceded_by_alpha', position)] = False
+        return True
+
+    def is_at_end_of_word(self, position: int) -> bool:
+        if (cached_followed_by_alpha := self.props.get(('followed_by_alpha', position), None)) in (True, False):
+            return not cached_followed_by_alpha
+        start = position
+        while (start+1 < self.max_vertex) \
+                and self.uroman.char_is_nonspacing_mark(self.s[start]) \
+                and ('NUKTA' in self.uroman.chr_name(self.s[start])):
+            start += 1
+        for end in range(start + 1, self.max_vertex + 1):
+            s = self.s[start:end]
+            if not self.uroman.dict_bool[('s-prefix', s)]:
+                break
+            for rom_rule in self.uroman.rom_rules[s]:
+                rom = rom_rule['t']
+                if (not rom_rule['use-only-at-start-of-word']) and regex.search(r'\pL', rom):
+                    self.props[('followed_by_alpha', position)] = True
+                    return False
+        self.props[('followed_by_alpha', position)] = False
+        return True
+
+    def is_at_end_of_syllable(self, position: int) -> Tuple[bool, str]:
+        """Heuristically decide whether a syllable ends at vertex position (at least initially for Thai).
+
+        The checks below are tried in order; the first one that applies determines the result.
+        Returns a pair (is_at_end_of_syllable, rationale), where rationale is a short string
+        naming the check that fired.
+        """
+        # prev_char is the character two positions to the left of the vertex, i.e. the one before
+        # the character that ends at position.
+        prev_char = self.s[position-2] if position >= 2 else None
+        # char = self.s[position-1] if position >= 1 else None
+        next_char = self.s[position] if position < self.max_vertex else None
+        # If a tone mark directly follows, look at the character after the tone mark instead.
+        if self.uroman.dict_str[('tone-mark', next_char)]:
+            adj_position = position + 1
+            next_char = self.s[adj_position] if adj_position < self.max_vertex else None
+            # print('TONE-MARK', position, next_char)
+        else:
+            adj_position = position
+        next_char2 = self.s[adj_position + 1] if adj_position + 1 < self.max_vertex else None
+        if prev_char is None:
+            return False, 'start-of-string'
+        if not regex.search(r'(?:\pL|\pM)$', prev_char):  # start of token
+            return False, 'start-of-token'
+        if self.uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
+            return False, 'pre-post-vowel-on-left'
+        if self.uroman.dict_str[('syllable-info', next_char)] == 'written-pre-consonant-spoken-post-consonant':
+            return True, 'pre-post-vowel-on-right'
+        if adj_position >= self.max_vertex:  # end of string
+            return True, 'end-of-string'
+        # if not self.char_is_letter_or_vowel_sign(next_char):  # end of token
+        if not regex.match(r'(?:\pL|\pM)', next_char):  # end of token
+            return True, 'end-of-token'
+        # NOTE(review): position >= 2 is already guaranteed here (prev_char is not None).
+        if position > 0:
+            left_edge = self.best_left_neighbor_edge(position-1)
+            if left_edge and regex.search(r'[bcdfghjklmnpqrstvxz]$', left_edge.txt):
+                return False, 'consonant-to-the-left'
+        # Romanization of what follows: prefer the two-character span, fall back to one character, then "?".
+        next_char_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position,
+                                                                                       adj_position + 2,
+                                                                                       simple_search=True),
+                                       self.simple_top_romanization_candidate_for_span(adj_position,
+                                                                                       adj_position + 1,
+                                                                                       simple_search=True),
+                                       "?")
+        if not regex.match(r"[aeiou]", next_char_rom.lower()):  # followed by consonant
+            return True, f'not-followed-by-vowel {next_char_rom}'
+        if (next_char == '\u0E2D') and (next_char2 is not None):  # THAI CHARACTER O ANG
+            next_char2_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position+1,
+                                                                                            adj_position+2,
+                                                                                            simple_search=True),
+                                            "?")
+            if regex.match(r"[aeiou]", next_char2_rom.lower()):
+                return True, 'o-ang-followed-by-vowel'  # In that context Thai char. "o ang" is considered a consonant
+        return False, 'not-at-syllable-end-by-default'
+
+    def romanization_by_first_rule(self, s) -> str | None:
+        try:
+            return self.uroman.rom_rules[s][0]['t']
+        except IndexError:
+            return None
+
+    def expand_rom_with_special_chars(self, rom: str, start: int, end: int, **args) \
+            -> Tuple[str, int, int, str | None]:
+        """This method contains a number of special romanization heuristics that typically modify
+        an existing or preliminary edge based on context."""
+        orig_start = start
+        uroman = self.uroman
+        full_string = self.s
+        annot = None
+        if rom == '':
+            return rom, start, end, None
+        prev_char = (full_string[start-1] if start >= 1 else '')
+        first_char = full_string[start]
+        last_char = full_string[end-1]
+        next_char = (full_string[end] if end < len(full_string) else '')
+        # \u2820 is the Braille character indicating that the next letter is upper case
+        if (prev_char == '\u2820') and regex.match(r'[a-z]', rom):
+            return rom[0].upper() + rom[1:], start-1, end, 'rom exp'
+        # Normalize multi-upper case THessalonike -> Thessalonike, but don't change THESSALONIKE
+        if start+1 == end and rom.isupper() and next_char.islower():
+            ablation = args.get('ablation', '')     # VERBOSE
+            if not ('nocap' in ablation):
+                rom = rom.capitalize()
+        # Japanese small tsu (and Gurmukhi addak) used as consonant doubler:
+        if (prev_char and prev_char in 'っッ\u0A71') \
+                and (uroman.chr_script_name(prev_char) == uroman.chr_script_name(prev_char)) \
+                and (m_double_consonant := regex.match(r'(ch|[bcdfghjklmnpqrstwz])', rom)):
+            # return m_double_consonant.group(1).replace('ch', 't') + rom, start-1, end, 'rom exp'
+            # expansion might additional apply to the right
+            if prev_char in 'っッ':  # for Japanese, per Hepburn, use tch
+                rom = m_double_consonant.group(1).replace('ch', 't') + rom
+            else:
+                rom = m_double_consonant.group(1).replace('ch', 'c') + rom
+            start = start-1
+            first_char = full_string[start]
+            prev_char = (full_string[start-1] if start >= 1 else '')
+        # Thai
+        if uroman.chr_script_name(first_char) == 'Thai':
+            if (start+1 == end) and regex.match(r'[bcdfghjklmnpqrstvwxyz]+$', rom):
+                if uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
+                    for vowel_prefix_len in [1]:
+                        if vowel_prefix_len <= start:
+                            for vowel_suffix_len in [3, 2, 1]:
+                                if end + vowel_suffix_len <= len(full_string):
+                                    pattern = (full_string[start-vowel_prefix_len: start]
+                                               + '–'
+                                               + full_string[end:end+vowel_suffix_len])
+                                    if uroman.rom_rules[pattern]:
+                                        vowel_rom_rule = uroman.rom_rules[pattern][0]
+                                        vowel_rom = vowel_rom_rule['t']
+                                        # print(f" PATTERN {pattern} ({full_string[start:end]}/{rom}) {rom}{vowel_rom}")
+                                        return rom + vowel_rom, start-vowel_prefix_len, end+vowel_suffix_len, 'rom exp'
+            if (uroman.chr_script_name(prev_char) == 'Thai') \
+                    and (uroman.dict_str[('syllable-info', prev_char)]
+                         == 'written-pre-consonant-spoken-post-consonant') \
+                    and regex.match(r'[bcdfghjklmnpqrstvwxyz]', rom) \
+                    and (vowel_rom := self.romanization_by_first_rule(prev_char)):
+                return rom + vowel_rom, start-1, end, 'rom exp'
+            # THAI CHARACTER O ANG
+            if (first_char == '\u0E2D') and (end - start == 1):
+                prev_script = uroman.chr_script_name(prev_char)
+                next_script = uroman.chr_script_name(next_char)
+                prev_rom = self.find_rom_edge_path_backwards(0, start, 1, return_str=True)
+                next_rom = self.romanization_by_first_rule(next_char)
+                # if not recursive:
+                #     lc = uroman.romanize_string(full_string[:start], lcode=self.lcode, recursive=True)
+                #     rc = uroman.romanize_string(full_string[end:], lcode=self.lcode, recursive=True)
+                #     print('PP', start, end, prev_script, next_script, prev_rom, next_rom, '  LC:', lc[-40:],
+                #           '  RC:', rc[:40])
+                # delete THAI CHARACTER O ANG unless it is surrounded on both sides by a Thai consonant
+                if not ((prev_script == 'Thai') and (next_script == 'Thai')
+                        and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', prev_rom)
+                        and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', next_rom)):
+                    # if not recursive:
+                    #     print(f'* DELETE O ANG {first_char} {start}-{end}   LC: {lc[-40:]}  RC: {rc[:40]}')
+                    return '', start, end, 'rom del'
+        # Coptic: consonant + grace-accent = e + consonant
+        if next_char and (next_char == "\u0300") and (uroman.chr_script_name(last_char) == "Coptic")\
+                and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)):
+            rom = 'e' + rom
+            end = end+1
+            last_char = full_string[end - 1]
+            next_char = (full_string[end] if end < len(full_string) else '')
+            annot = 'rom exp'
+        # Japanese small y: ki + small ya = kya etc.
+        if (next_char and next_char in 'ゃゅょャュョ') \
+                and (uroman.chr_script_name(last_char) == uroman.chr_script_name(next_char)) \
+                and regex.search(r'([bcdfghjklmnpqrstvwxyz]i$)', rom) \
+                and (y_rom := self.romanization_by_first_rule(next_char)) \
+                and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)) \
+                and (not self.simple_top_romanization_candidate_for_span(start, end+1)):
+            rom = rom[:-1] + y_rom
+            end = end+1
+            last_char = full_string[end - 1]
+            next_char = (full_string[end] if end < len(full_string) else '')
+            annot = 'rom exp'
+        # Japanese vowel lengthener (U+30FC)
+        last_rom_char = last_chr(rom)
+        if (next_char == 'ー') \
+                and (uroman.chr_script_name(last_char) in ('Hiragana', 'Katakana')) \
+                and (last_rom_char in 'aeiou'):
+            return rom + last_rom_char, start, end+1, 'rom exp'
+        # Virama (in Indian languages)
+        if self.uroman.dict_bool[('is-virama', next_char)]:
+            return rom, start, end + 1, "rom exp"
+        if rom.startswith(' ') and ((start == 0) or (prev_char == ' ')):
+            rom = rom[1:]
+        if rom.endswith(' ') and ((end == len(full_string)+1) or (next_char == ' ')):
+            rom = rom[:-1]
+        return rom, start, end, annot
+
+    def prep_braille(self, **_args) -> None:
+        if self.contains_script['Braille']:
+            dots6 = '\u2820'  # characters in following word are upper case
+            all_caps = False
+            for i, c in enumerate(self.s):
+                if (i >= 1) and (self.s[i-1] == dots6) and (c == dots6):
+                    all_caps = True
+                elif all_caps:
+                    if c in '\u2800':  # Braille space
+                        all_caps = False
+                    else:
+                        self.props[('is-upper', i)] = True
+
+    def pick_tibetan_vowel_edge(self, **args) -> None:
+        if not self.contains_script['Tibetan']:
+            return None
+        verbose = bool(args.get('verbose'))
+        s = self.s
+        uroman = self.uroman
+        tibetan_syllable = []
+        tibetan_letter_positions = []
+        for start in range(self.max_vertex):
+            c = s[start]
+            if (uroman.chr_script_name(c) == 'Tibetan') and self.char_is_letter_or_vowel_sign(c):
+                tibetan_letter_positions.append(start)
+            else:
+                if tibetan_letter_positions:
+                    tibetan_syllable.append(tibetan_letter_positions)
+                    tibetan_letter_positions = []
+        if tibetan_letter_positions:
+            tibetan_syllable.append(tibetan_letter_positions)
+        for tibetan_letter_positions in tibetan_syllable:
+            vowel_pos = None
+            orig_txt = ''
+            roms = []
+            subjoined_letter_positions = []
+            first_letter_position = tibetan_letter_positions[0]
+            for i in tibetan_letter_positions:
+                c = s[i]
+                orig_txt += c
+                rom = first_non_none(self.simple_top_romanization_candidate_for_span(i, i+1), "?")
+                self.props[('edge-vowel', i)] = None
+                if self.char_is_vowel_sign(c) or (rom and regex.match(r"[aeiou]+$", rom)):
+                    vowel_pos = i
+                    self.props[('edge-vowel', i)] = True
+                    # delete any syllable initial ' before vowel
+                    if roms == ["'"]:
+                        self.props[('edge-delete', i-1)] = True
+                elif self.char_is_subjoined_letter(c):
+                    subjoined_letter_positions.append(i)
+                    if i > first_letter_position:
+                        if c == "\u0FB0":
+                            vowel_pos = i-1
+                            self.props[('edge-vowel', i-1)] = True
+                        else:
+                            self.props[('edge-vowel', i-1)] = False
+                    rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
+                elif c == "\u0F60":  # Tibetan letter -a (')
+                    self.props[('edge-vowel', i)] = False
+                    if i > first_letter_position:
+                        vowel_pos = i-1
+                        self.props[('edge-vowel', i-1)] = True
+                        if i == tibetan_letter_positions[-1]:
+                            self.props[('edge-delete', i)] = True
+                    if roms and not (roms[-1] in "aeiou"):
+                        rom = "a'"
+                    else:
+                        rom = "'"
+                else:
+                    rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
+                roms.append(rom)
+            if vowel_pos is not None:
+                for i in tibetan_letter_positions:
+                    if self.props.get(('edge-vowel', i)) is None:
+                        self.props[('edge-vowel', i)] = False
+            else:
+                best_cost, best_vowel_pos, best_pre, best_post = math.inf, None, None, None
+                n_letters = len(tibetan_letter_positions)
+                for i in tibetan_letter_positions:
+                    rel_pos = i - first_letter_position
+                    pre, post = ''.join(roms[:rel_pos+1]), ''.join(roms[rel_pos+1:])
+                    if self.props.get(('edge-vowel', i)) is False:
+                        cost = 20
+                        if cost < best_cost:
+                            best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
+                    elif n_letters == 1:
+                        cost = 0
+                        if cost < best_cost:
+                            best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
+                    elif n_letters == 2:
+                        cost = 0 if i == 0 else 0.1
+                        if cost < best_cost:
+                            best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
+                    else:
+                        good_suffix = regex.match(r"(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|"
+                                                  r"dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|"
+                                                  r"rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$", post)
+                        good_prefix = regex.match(r"'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|"
+                                                  r"ch|db|dby|dk|dm|dp|dpy|dr|"
+                                                  r"gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|"
+                                                  r"ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|"
+                                                  r"sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$", pre)
+                        subjoined_suffix = all([x in subjoined_letter_positions
+                                                for x in tibetan_letter_positions[rel_pos+2:]])
+                        # print('GOOD', good_suffix, good_prefix, subjoined_suffix, f'{pre}a{post}',
+                        #       subjoined_letter_positions, tibetan_letter_positions[rel_pos+2:])
+                        if good_suffix and good_prefix:
+                            cost = len(pre) * 0.1
+                        elif good_suffix:
+                            cost = len(pre)
+                        elif subjoined_suffix and good_prefix:
+                            cost = len(pre) * 0.3
+                        elif subjoined_suffix:
+                            cost = len(pre) * 0.5
+                        else:
+                            cost = math.inf
+                    if cost < best_cost:
+                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
+                if best_vowel_pos is not None:
+                    for i in tibetan_letter_positions:
+                        if self.props.get(('edge-vowel', i)) is None:
+                            value = (i == best_vowel_pos)
+                            self.props[('edge-vowel', i)] = value
+                if verbose:
+                    best_cost = best_cost if isinstance(best_cost, int) else round(best_cost, 2)
+                    sys.stderr.write(f'Tib. best cost: "{best_pre}a{best_post}"  o:{orig_txt}  c:{round(best_cost, 2)}'
+                                     f'   p:{best_vowel_pos} {tibetan_letter_positions}\n')
+
+    def add_default_abugida_vowel(self, rom: str, start: int, end: int, annotation: str = '') -> str:
+        """Adds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia."""
+        uroman = self.uroman
+        s = self.s
+        try:
+            first_s_char = s[start]
+            last_s_char = s[end-1]
+            script_name = uroman.chr_script_name(first_s_char)
+            script = self.uroman.scripts[script_name.lower()]
+            if not (abugida_default_vowels := script['abugida-default-vowels']):
+                return rom
+            key = (script, rom)
+            if key in uroman.abugida_cache:
+                base_rom, base_rom_plus_vowel, mod_rom = uroman.abugida_cache[key]
+                rom = mod_rom
+            else:
+                vowels_regex1 = '|'.join(abugida_default_vowels)   # e.g. 'a' or 'a|o'
+                vowels_regex2 = '|'.join(map(lambda x: x + '+', abugida_default_vowels))   # e.g. 'a+' or 'a+|o+'
+                if m := regex.match(fr'([cfghkmnqrstxy]?y)({vowels_regex2})-?$', rom):
+                    base_rom = m.group(1)
+                    base_rom_plus_vowel = base_rom + m.group(2)
+                elif m := regex.match(fr'([bcdfghjklmnpqrstvwxyz]+)({vowels_regex1})-?$', rom):
+                    base_rom = m.group(1)
+                    base_rom_plus_vowel = base_rom + m.group(2)
+                    if rom.endswith('-') and (start+1 == end) and rom[0].isalpha():
+                        rom = rom[:-1]
+                else:
+                    base_rom = rom
+                    base_rom_plus_vowel = base_rom + abugida_default_vowels[0]
+                if (not regex.match(r"[bcdfghjklmnpqrstvwxyz]+$", base_rom)
+                        and (not ((script_name == 'Tibetan') and (base_rom == "'")))):
+                    base_rom, base_rom_plus_vowel = None, None
+                uroman.abugida_cache[key] = (base_rom, base_rom_plus_vowel, rom)
+            if base_rom is None:
+                return rom
+            if 'tail' in annotation:
+                return rom
+            prev_s_char = s[start-1] if start >= 1 else ''
+            next_s_char = s[end] if len(s) > end else ''
+            next2_s_char = s[end+1] if len(s) > end+1 else ''
+            if script_name == 'Tibetan':
+                if self.props.get(('edge-delete', start)):
+                    return ''
+                elif self.props.get(('edge-vowel', start)):
+                    return base_rom_plus_vowel
+                else:
+                    return base_rom
+            if (next_s_char and ((base_rom in "bcdfghklmnpqrstvwz") or (base_rom in ["ng"]))
+                    and (next_s_char in "យ")):  # Khmer yo
+                return base_rom
+            if self.uroman.dict_bool[('is-vowel-sign', next_s_char)]:
+                return base_rom
+            if self.uroman.dict_bool[('is-medial-consonant-sign', next_s_char)]:
+                return base_rom
+            if self.char_is_subjoined_letter(next_s_char):
+                return base_rom
+            if self.uroman.char_is_nonspacing_mark(next_s_char) \
+                    and self.uroman.dict_bool[('is-vowel-sign', next2_s_char)]:
+                return base_rom
+            if self.uroman.dict_bool[('is-virama', next_s_char)]:
+                return base_rom
+            if self.uroman.char_is_nonspacing_mark(next_s_char) \
+                    and self.uroman.dict_bool[('is-virama', next2_s_char)]:
+                return base_rom
+            if self.uroman.dict_bool[('is-virama', prev_s_char)]:
+                return base_rom_plus_vowel
+            if self.is_at_start_of_word(start) and not regex.search('r[aeiou]', rom):
+                return base_rom_plus_vowel
+            # delete many final schwas from most Devanagari languages (except: Sanskrit)
+            if self.is_at_end_of_word(end):
+                if (script_name in ("Devanagari",)) and (self.lcode not in ('san',)):  # Sanskrit
+                    return rom
+                else:
+                    return base_rom_plus_vowel
+            if uroman.chr_script_name(prev_s_char) != script_name:
+                return base_rom_plus_vowel
+            if 'VOCALIC' in self.uroman.chr_name(last_s_char):
+                return base_rom
+            if uroman.chr_script_name(next_s_char) == script_name:
+                return base_rom_plus_vowel
+        except Exception:
+            return rom
+        else:
+            pass
+            # print('ABUGIDA', rom, start, script_name, script, abugida_default_vowels, prev_s_char, next_s_char)
+        return rom
+
+    def cand_is_valid(self, rom_rule: RomRule, start: int, end: int, rom: str) -> bool:
+        if rom is None:
+            return False
+        if rom_rule['dont-use-at-start-of-word'] and self.is_at_start_of_word(start):
+            return False
+        if rom_rule['use-only-at-start-of-word'] and not self.is_at_start_of_word(start):
+            return False
+        if rom_rule['dont-use-at-end-of-word'] and self.is_at_end_of_word(end):
+            return False
+        if rom_rule['use-only-at-end-of-word'] and not self.is_at_end_of_word(end):
+            return False
+        if rom_rule['use-only-for-whole-word'] \
+                and not (self.is_at_start_of_word(start) and self.is_at_end_of_word(end)):
+            return False
+        if (lcodes := rom_rule['lcodes']) and (self.lcode not in lcodes):
+            return False
+        return True
+
+    # @profile
+    def simple_sorted_romanization_candidates_for_span(self, start, end) -> List[str]:
+        s = self.s[start:end]
+        if not self.uroman.dict_bool[('s-prefix', s)]:
+            return []
+        rom_rule_candidates = []
+        for rom_rule in self.uroman.rom_rules[s]:
+            rom = rom_rule['t']
+            if self.cand_is_valid(rom_rule, start, end, rom):
+                rom_rule_candidates.append((rom_rule['n-restr'] or 0, rom_rule['t']))
+        rom_rule_candidates.sort(reverse=True)
+        return [x[1] for x in rom_rule_candidates]
+
+    def simple_top_romanization_candidate_for_span(self, start, end, simple_search: bool = False) -> str | None:
+        if (start < 0) or (end > self.max_vertex):
+            return None
+        span_range = (start, end)
+        if (cached_result := self.simple_top_rom_cache.get(span_range)) is not None:
+            return cached_result
+        best_cand, best_n_restr, best_rom_rule = None, None, None
+        for rom_rule in self.uroman.rom_rules[self.s[start:end]]:
+            if self.cand_is_valid(rom_rule, start, end, rom_rule['t']):
+                n_restr = rom_rule['n-restr'] or 0
+                if best_n_restr is None or (n_restr > best_n_restr):
+                    best_cand, best_n_restr, best_rom_rule = rom_rule['t'], n_restr, rom_rule
+        if simple_search:
+            return best_cand
+        if best_rom_rule:
+            t_at_end_of_syllable = best_rom_rule['t-at-end-of-syllable']
+            if t_at_end_of_syllable is not None:
+                is_at_end_of_syllable, rationale = self.is_at_end_of_syllable(end)
+                if is_at_end_of_syllable:
+                    best_cand = t_at_end_of_syllable
+                # print(f"   SIMPLE {start}-{end} {best_cand} ({best_rom_rule['t']},{t_at_end_of_syllable}) "
+                #       f"END:{is_at_end_of_syllable} ({rationale})")
+        self.simple_top_rom_cache[span_range] = best_cand
+        # if (best_rom_rule is not None) and ('cancel' in (prov := best_rom_rule['prov'])):
+        #     sys.stderr.write(f'   Cancel {self.s} ({start}-{end}) {prov} {self.s[start:end]}\n')
+        return best_cand
+
+    def decomp_rom(self, char_position: int) -> str | None:
+        """Input: decomposable character such as ﻼ or ½
+        Output: la or 1/2"""
+        full_string = self.s
+        char = full_string[char_position]
+        rom = None
+        if ud_decomp_s := ud.decomposition(char):
+            format_comps = []
+            other_comps = []
+            decomp_s = ''
+            # name = self.uroman.chr_name(char)
+            for ud_decomp_elem in ud_decomp_s.split():
+                if ud_decomp_elem.startswith("<"):
+                    format_comps.append(ud_decomp_elem)
+                else:
+                    try:
+                        norm_char = chr(int(ud_decomp_elem, 16))
+                    except ValueError:
+                        other_comps.append(ud_decomp_elem)
+                    else:
+                        decomp_s += norm_char
+            if (format_comps and (format_comps[0] not in ('<super>', '<sub>', '<noBreak>', '<compat>'))
+                    and (not other_comps) and decomp_s):
+                rom = self.uroman.romanize_string(decomp_s, self.lcode)
+            # make sure to add a space for 23½ -> 23 1/2
+            if rom and ud.numeric(char, None):
+                rom = rom.replace('⁄', '/')
+                if char_position >= 1 and ud.numeric(full_string[char_position-1], None):
+                    rom = ' ' + rom
+                if (char_position+1 < len(full_string)) and ud.numeric(full_string[char_position+1], None):
+                    rom += ' '
+        return rom
+
+    def add_romanization(self, **args):
+        """Adds a romanization edge to the romanization lattice."""
+        for start in range(self.max_vertex):
+            for end in range(start+1, self.max_vertex+1):
+                if not self.uroman.dict_bool[('s-prefix', self.s[start:end])]:
+                    break
+                if (rom := self.simple_top_romanization_candidate_for_span(start, end)) is not None:
+                    if self.contains_script['Braille'] and (start+1 == end):
+                        if self.props.get(('is-upper', start)):
+                            rom = rom.upper()
+                    edge_annotation = 'rom'
+                    if regex.match(r'\+(m|ng|n|h|r)', rom):
+                        rom, edge_annotation = rom[1:], 'rom tail'
+                    rom = self.add_default_abugida_vowel(rom, start, end, annotation=edge_annotation)
+                    # orig_rom, orig_start, orig_end = rom, start, end
+                    rom, start2, end2, exp_edge_annotation \
+                        = self.expand_rom_with_special_chars(rom, start, end, annotation=edge_annotation,
+                                                             recursive=args.get('recursive', False), **args)
+                    edge_annotation = exp_edge_annotation or edge_annotation
+                    # if (orig_rom, orig_start, orig_end) != (rom, start, end):
+                    #     print(f'EXP {s} {orig_rom} {orig_start}-{orig_end} -> {rom} {start}-{end}')
+                    # if rom != rom_orig: print('** Add ABUGIDA', rom, start, end, rom2)
+                    self.add_edge(Edge(start2, end2, rom, edge_annotation))
+            if start < len(self.s):
+                char = self.s[start]
+                cp = ord(char)
+                # Korean Hangul characters
+                if 0xAC00 <= cp <= 0xD7A3:
+                    if rom := self.uroman.unicode_hangul_romanization(char):
+                        self.add_edge(Edge(start, start+1, rom, 'rom'))
+                # character decomposition
+                if rom_decomp := self.decomp_rom(start):
+                    self.add_edge(Edge(start, start + 1, rom_decomp, 'rom decomp'))
+
+    @staticmethod
+    def update_edge_list(edges, new_edge, old_edges) -> List[NumEdge]:
+        new_edge_not_yet_added = True
+        result = []
+        for edge in edges:
+            if edge in old_edges:
+                edge.active = False
+                if new_edge_not_yet_added:
+                    result.append(new_edge)
+                    new_edge_not_yet_added = False
+            else:
+                result.append(edge)
+        if new_edge_not_yet_added:
+            result.append(new_edge)
+        return result
+
+    @staticmethod
+    def edge_is_digit(edge: Edge | None) -> bool:
+        return (isinstance(edge, NumEdge)
+                and (edge.value is not None)
+                and isinstance(edge.value, int)
+                and (edge.type == 'digit')
+                and (0 <= edge.value <= 9)
+                and (edge.end - edge.start == 1))
+
+    @staticmethod
+    def is_gap_null_edge(edge: Edge) -> bool:
+        return isinstance(edge, NumEdge) and (edge.orig_txt in ('零', '〇'))
+
+    @staticmethod
+    def braille_digit(char: str) -> str | None:
+        position = '\u281A\u2801\u2803\u2809\u2819\u2811\u280B\u281B\u2813\u280A'.find(char)  # Braille 0-9
+        return str(position) if position >= 0 else None
+
+    def add_braille_number(self, start: int, end: int, txt: str, **_args) -> None:
+        new_edge = NumEdge(start, end, txt, self.uroman)
+        new_edge.type = 'number'
+        self.add_edge(new_edge)
+
+    def add_braille_numbers(self, **_args):
+        if self.contains_script['Braille']:
+            s = self.s
+            num_s, start = '', None
+            for i in range(len(s)):
+                char = s[i]
+                if char == '\u283C':  # number mark
+                    if start is None:
+                        start = i
+                elif (start is not None) and (digit_s := self.braille_digit(char)):
+                    num_s += digit_s
+                elif (start is not None) and (char == '\u2832'):  # period
+                    num_s += '.'
+                elif (start is not None) and (char == '\u2802'):  # comma
+                    num_s += ','
+                elif isinstance(start, int):
+                    self.add_braille_number(start, i, num_s)
+                    num_s, start = '', None
+            if start is not None:
+                self.add_braille_number(start, len(s), num_s)
+
+    def add_numbers(self, uroman, **args):
+        """Adds numerical romanization edges (NumEdge) to the romanization lattice.
+        Steps, in the order executed below:
+          (0)  one NumEdge per character for which uroman.num_props[char] is truthy
+          (D1) runs of adjacent single digits, with at most one decimal period, e.g. 1234 or 3.14
+          (G1) product of a num_base-1 value and a following higher-base, non-large-power edge: 2*100=200
+          (G2) sum of adjacent non-large-power values: 200+30+4=234
+          (G3) product of a block and a following large-power edge: 234*1000=234000
+          (G4) sum of adjacent blocks: 234000+567=234567
+          (F1) separator prefix for a number edge that directly follows text ending in a digit
+          then deactivation of selected edges, and plain 'num' edges for remaining Unicode digit characters.
+        uroman is passed on to every NumEdge; its num_props and stats are also used here.
+        Of **args only 'verbose' is read: if truthy, the edges built at each step are printed."""
+        verbose = bool(args.get('verbose'))
+        s = self.s
+        num_edges = []
+        # (0) single-character number edges
+        for start in range(len(s)):
+            char = s[start]
+            if uroman.num_props[char]:
+                new_edge = NumEdge(start, start + 1, char, uroman)
+                num_edges.append(new_edge)
+                if verbose:
+                    print('NumEdge', new_edge)
+                self.add_edge(new_edge)
+        # Note for all combination loops below: 'for edge in num_edges' keeps iterating over the list object
+        # it started with, while the name num_edges is rebound to the list returned by update_edge_list.
+        # update_edge_list marks the replaced edges as inactive, so the edge.active tests skip them later on.
+        # D1 sequence of digits 1234
+        for edge in num_edges:
+            if self.edge_is_digit(edge) and edge.active:  # and (edge.value != 0):
+                n_decimal_points = 0
+                n_decimals = None  # number of digits after the decimal period; None while no period was seen
+                new_value_s = str(edge.value)
+                sub_edges = [edge]
+                prev_edge = edge
+                while True:
+                    right_edge = self.best_right_neighbor_edge(prev_edge.end)
+                    if self.edge_is_digit(right_edge):
+                        sub_edges.append(right_edge)
+                        new_value_s += str(right_edge.value)
+                        if n_decimals is not None:
+                            n_decimals += 1
+                        prev_edge = right_edge
+                    # a period is accepted once, and only if a digit edge follows directly after it
+                    elif ((prev_edge.end < len(s)) and (s[prev_edge.end] == '.') and (n_decimal_points == 0)
+                            and (right_edge2 := self.best_right_neighbor_edge(prev_edge.end + 1))
+                            and self.edge_is_digit(right_edge2)):
+                        if right_edge is None:
+                            # no edge covers the period yet: create one so that it can be listed as a sub-edge
+                            right_edge = Edge(prev_edge.end, prev_edge.end+1, s[prev_edge.end],
+                                              'decimal period')
+                            self.add_edge(right_edge)
+                        sub_edges.append(right_edge)
+                        sub_edges.append(right_edge2)
+                        new_value_s += '.' + str(right_edge2.value)
+                        n_decimal_points += 1
+                        n_decimals = 1
+                        prev_edge = right_edge2
+                    else:
+                        break
+                if len(sub_edges) >= 2:
+                    new_value = float(new_value_s) if '.' in new_value_s else int(new_value_s)
+                    new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
+                    new_edge.update(value=new_value, value_s=new_value_s, n_decimals=n_decimals, num_base=1, 
+                                    e_type='D1', script=sub_edges[-1].script)
+                    self.add_edge(new_edge)
+                    num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
+                    if verbose:
+                        print(new_edge.type, new_edge)
+        # G1 combine (*) "single digits" 2*100=200, 3*10= 30
+        for edge in num_edges:
+            if (isinstance(edge, NumEdge) and edge.active and (edge.num_base == 1)
+                    and isinstance(edge.value, int) and (edge.value >= 1)):
+                right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
+                if (right_edge
+                        and isinstance(right_edge, NumEdge)
+                        and right_edge.active
+                        and isinstance(right_edge.value, int)
+                        and (right_edge.num_base > 1)
+                        and (not right_edge.is_large_power)):
+                    new_value = edge.value * right_edge.value
+                    new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
+                    new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G1',
+                                    orig_txt=edge.orig_txt + right_edge.orig_txt,
+                                    script=right_edge.script)
+                    self.add_edge(new_edge)
+                    num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
+                    if verbose:
+                        print(new_edge.type, new_edge)
+        # G2 combine (+) G1 "single digits" 200+30+4=234 (within larger blocks of 1000, 1000000)
+        for edge in num_edges:
+            if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int) and not edge.is_large_power:
+                sub_edges = [edge]
+                prev_edge = edge
+                # prev_non_edge: the most recent sub-edge that is not a gap-null edge ('零', '〇');
+                # its num_base must exceed both the value and the num_base of the next edge to be added
+                prev_non_edge = edge  # None if (edge.orig_txt in '零') else prev_edge
+                while (prev_edge
+                       and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
+                       and isinstance(right_edge, NumEdge)
+                       and right_edge.active
+                       and isinstance(right_edge.value, int)
+                       and (not right_edge.is_large_power)
+                       and (self.is_gap_null_edge(prev_non_edge)
+                            or ((prev_non_edge.num_base > right_edge.value)
+                                and (prev_non_edge.num_base > right_edge.num_base)))):
+                    sub_edges.append(right_edge)
+                    prev_edge = right_edge
+                    if not self.is_gap_null_edge(right_edge):
+                        prev_non_edge = right_edge
+                if len(sub_edges) >= 2:
+                    new_value = sum([e.value for e in sub_edges])
+                    new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
+
+                    new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G2',
+                                    orig_txt=''.join([e.orig_txt for e in sub_edges]),
+                                    script=sub_edges[-1].script)
+                    self.add_edge(new_edge)
+                    num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
+                    # NOTE(review): the type is set explicitly here although e_type='G2' was already passed
+                    # to update() above; possibly redundant - confirm against NumEdge.update
+                    new_edge.type = 'G2'
+                    if verbose:
+                        print(new_edge.type, new_edge)
+        # G3 combine (*) G2 blocks with large powers, e.g. 234*1000 = 234000
+        for edge in num_edges:
+            if (isinstance(edge, NumEdge) and edge.active and (not edge.is_large_power)
+                    and (isinstance(edge.value, int) or isinstance(edge.value, float))):
+                right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
+                if (right_edge
+                        and isinstance(right_edge, NumEdge)
+                        and right_edge.active
+                        and isinstance(right_edge.value, int)
+                        and (right_edge.num_base > 1)
+                        and right_edge.is_large_power):
+                    # the left value may be a float (from D1); round the product and
+                    # turn it back into an int when it has no fractional part
+                    new_value = round(edge.value * right_edge.value, 5)
+                    if isinstance(new_value, float) and new_value.is_integer():
+                        new_value = int(new_value)
+                    new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
+                    new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G3',
+                                    orig_txt=edge.orig_txt + right_edge.orig_txt,
+                                    script=right_edge.script)
+                    self.add_edge(new_edge)
+                    num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
+                    if verbose:
+                        print(new_edge.type, new_edge)
+        # G4 combine (+) G3 blocks 234000+567=234567
+        for edge in num_edges:
+            if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int):
+                sub_edges = [edge]
+                while ((prev_edge := sub_edges[-1])
+                       and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
+                       and isinstance(right_edge, NumEdge)
+                       and right_edge.active
+                       and isinstance(right_edge.value, int)
+                       and (prev_edge.num_base > right_edge.value)
+                       and (prev_edge.num_base > right_edge.num_base)):
+                    # CJK only: a single-character value 1-9 right after a power-of-ten base >= 1000 is
+                    # rescaled to digit * (base // 10); right_edge is modified in place and typed 'G4tag'.
+                    # Presumably this covers abbreviated forms where the trailing digit stands for the next
+                    # lower power of ten - TODO confirm.
+                    if ((prev_edge.script == 'CJK')
+                            and (prev_edge.num_base >= 1000)
+                            and ('tag' not in prev_edge.type)
+                            and regex.match('10+$', str(prev_edge.num_base))
+                            and (1 <= right_edge.value <= 9)
+                            and (right_edge.start + 1 == right_edge.end)):
+                        new_num_base = prev_edge.num_base // 10
+                        new_value = new_num_base * right_edge.value
+                        # print('DIGIT TAG', prev_edge, right_edge, new_value)
+                        right_edge.value = new_value
+                        right_edge.num_base = new_num_base
+                        right_edge.type = 'G4tag'
+                    sub_edges.append(right_edge)
+                if len(sub_edges) >= 2:
+                    new_value = sum([e.value for e in sub_edges])
+                    new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
+                    new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G4',
+                                    orig_txt=''.join([e.orig_txt for e in sub_edges]),
+                                    script=sub_edges[-1].script)
+                    self.add_edge(new_edge)
+                    num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
+                    if verbose:
+                        print(new_edge.type, new_edge)
+        # F1
+        for edge in num_edges:
+            # cushion numbers that directly follow text ending in a digit with a separator:
+            # a space before a fraction (e.g. 23½ -> 23 1/2), a middle dot '·' before any other number
+            if isinstance(edge, NumEdge) and regex.match(r'\d', edge.txt):
+                left_edge = self.best_left_neighbor_edge(edge.start)
+                if left_edge and regex.search(r'\d$', left_edge.txt):
+                    if edge.fraction:
+                        sep = ' '
+                    else:
+                        sep = '·'
+                    edge.txt = sep + edge.txt
+
+        # Deactivate single-character edges with a value above 1000 and edges for certain original texts.
+        # Note that "in '兩參参伍陆陸什'" is a substring test on a str, whereas "in ('京兆', )" is tuple membership.
+        for edge in num_edges:
+            if (isinstance(edge, NumEdge) and edge.active and (edge.value is not None)
+                    and (((edge.value > 1000) and (edge.start + 1 == edge.end))
+                         or (edge.orig_txt in '兩參参伍陆陸什')
+                         or (edge.orig_txt in ('京兆', )))):
+                edge.active = False
+        if verbose:  # or (num_edges and any([e.type in ['G1', 'G2', 'G3', 'G4'] for e in num_edges])):
+            if num_edges:
+                print('actives:')
+            for num_edge in num_edges:
+                print(num_edge)
+        # Fallback for characters whose best single-character edge is not a NumEdge: if ud_numeric yields a
+        # value, add a plain 'num' edge for digit characters 0-9; count all other numeric characters in stats.
+        for start in range(len(s)):
+            start_char = s[start]
+            if (best_edge := self.best_edge_in_span(start, start+1)) and isinstance(best_edge, NumEdge):
+                continue
+            if (num := ud_numeric(start_char)) is not None:
+                name = self.uroman.chr_name(start_char)
+                if ("DIGIT" in name) and isinstance(num, int) and (0 <= num <= 9):
+                    # if start_char not in '0123456789': print('DIGIT', s[start], num, name)
+                    self.add_edge(Edge(start, start + 1, str(num), 'num'))
+                else:
+                    uroman.stats[('*NUM', start_char, num)] += 1
+
+    def add_rom_fall_back_singles(self, **_args):
+        """For characters in the original string not covered by romanizations and numbers,
+        add a fallback edge based on type, romanization of single char, or original char."""
+        for start in range(self.max_vertex):
+            end = start+1
+            orig_char = self.s[start]
+            if not self.lattice[(start, end)]:
+                rom, edge_annotation = orig_char, 'orig'
+                if self.uroman.char_is_nonspacing_mark(rom):
+                    rom, edge_annotation = '', 'Mn'
+                elif self.uroman.char_is_format_char(rom):  # e.g. zero-width non-joiner, zero-width joiner
+                    rom, edge_annotation = '', 'Cf'
+                elif ud.category(orig_char) == 'Co':
+                    rom, edge_annotation = '', 'Co'
+                elif rom == ' ':
+                    edge_annotation = 'orig'
+                # elif self.uroman.char_is_space_separator(rom):
+                #     rom, edge_annotation = ' ', 'Zs'
+                elif (rom2 := self.simple_top_romanization_candidate_for_span(start, end)) is not None:
+                    rom = rom2
+                    if regex.match(r'\+(m|ng|n|h|r)', rom):
+                        rom = rom[1:]
+                    edge_annotation = 'rom single'
+                # else the original values still hold: rom, edge_annotation = orig_char, 'orig'
+                self.add_edge(Edge(start, end, rom, edge_annotation))
+
+    @staticmethod
+    def add_new_edge(old_edges: List[Edge], start: int, end: int, new_rom: str, new_type: str, position: int | None,
+                     old_edge_dict: dict)\
+            -> None:
+        if (start, end, new_rom) not in old_edge_dict:
+            new_edge = Edge(start, end, new_rom, new_type)
+            if position is None:
+                old_edges.append(new_edge)
+            else:
+                old_edges.insert(position + 1, new_edge)
+            old_edge_dict[(start, end, new_rom)] = new_edge
+            # print(f'  ALT {start}-{end} {new_rom}')
+
+    def add_alternatives(self, old_edges: List[Edge]) -> None:
+        old_edge_dict = {}
+        for old_edge in old_edges:
+            old_edge_dict[(old_edge.start, old_edge.end, old_edge.txt)] = old_edge
+        for position, old_edge in enumerate(old_edges):
+            if old_edge.type.startswith('rom-alt'):
+                continue   # not old
+            start, end = old_edge.start, old_edge.end
+            orig_s = self.s[start:end]
+            old_rom = old_edge.txt
+            # self.lattice[(start, end)]:
+            for rom_rule in self.uroman.rom_rules[orig_s]:
+                rom_t = rom_rule['t']
+                if self.cand_is_valid(rom_rule, start, end, rom_t):
+                    rom_alts = rom_rule['t-alts']
+                    rom_eosyl = rom_rule['t-at-end-of-syllable']
+                    if (rom_t == old_rom) and rom_alts:
+                        for rom_alt in rom_alts:
+                            self.add_new_edge(old_edges, start, end, rom_alt, 'rom-alt', position,
+                                              old_edge_dict)
+                    if (rom_t == old_rom) and rom_eosyl:
+                        self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt2', position, old_edge_dict)
+                    if rom_eosyl == old_rom:
+                        self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt3', position, old_edge_dict)
+
+    def all_edges(self, start: int, end: int) -> List[Edge]:
+        result = []
+        for start2 in range(start, end):
+            for end2 in sorted(list(self.lattice[(start2, 'right')]), reverse=True):
+                if end2 <= end:
+                    result.extend(self.lattice[(start2, end2)])
+                else:
+                    break
+        return result
+
+    def best_edge_in_span(self, start: int, end: int, skip_num_edge: bool = False) -> Edge | None:
+        edges = self.lattice[(start, end)]
+        # if len(edges) >= 2: print('Multi edge', start2, end2, self.s[start2:end2], edges)
+        decomp_edge, other_edge, rom_edge = None, None, None
+        for edge in edges:
+            if isinstance(edge, NumEdge):
+                if skip_num_edge:
+                    continue
+                if edge.active:
+                    return edge
+            if edge.type.startswith('rom decomp'):
+                if decomp_edge is None:
+                    decomp_edge = edge  # plan C
+            elif regex.match(r'(?:rom|num)', edge.type):
+                if rom_edge is None:
+                    rom_edge = edge  # plan B
+            elif other_edge is None:
+                other_edge = edge  # plan D
+        return rom_edge or decomp_edge or other_edge
+
+    def best_right_neighbor_edge(self, start: int, skip_num_edge: bool = False) -> Edge | None:
+        for end in sorted(list(self.lattice[(start, 'right')]), reverse=True):
+            if best_edge := self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge):
+                return best_edge
+        return None
+
+    def best_left_neighbor_edge(self, end: int, skip_num_edge: bool = False) -> Edge | None:
+        for start in sorted(list(self.lattice[(end, 'left')])):
+            if best_edge := self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge):
+                return best_edge
+        return None
+
+    def best_rom_edge_path(self, start: int, end: int, skip_num_edge: bool = False) -> List[Edge]:
+        """Finds the best romanization edge path through the romanization lattice, including
+        non-romanized pieces such as ASCII and non-ASCII punctuation."""
+        result = []
+        start2 = start
+        while start2 < end:
+            if best_edge := self.best_right_neighbor_edge(start2, skip_num_edge=skip_num_edge):
+                result.append(best_edge)
+                start2 = best_edge.end
+            else:  # should not happen
+                start2 += 1
+        return result
+
+    def find_rom_edge_path_backwards(self, start: int, end: int, min_char: int | None = None,
+                                     return_str: bool = False, skip_num_edge: bool = False) -> List[Edge] | str:
+        """Finds a partial best path on the left from a start position to provide left contexts for
+        romanization rules. Can return a string or a list of edges. Is typically used for a short context,
+        as specified by min_char."""
+        result_edges = []
+        rom = ''
+        end2 = end
+        while start < end2:
+            old_end2 = end2
+            if new_edge := self.best_left_neighbor_edge(end2, skip_num_edge=skip_num_edge):
+                result_edges = [new_edge] + result_edges
+                rom = new_edge.txt + rom
+                end2 = new_edge.start
+            if min_char and len(rom) >= min_char:
+                break
+            if old_end2 >= end2:
+                end2 -= 1
+        if return_str:
+            return rom
+        else:
+            return result_edges
+
+    @staticmethod
+    def edge_path_to_surf(edges) -> str:
+        result = ''
+        for edge in edges:
+            result += edge.txt
+        return result
+
+
+# @timer
+def main():
+    """This function provides a user interface, either using argparse for a command line interface,
+    or providing direct function calls.
+    First, a uroman object will have to created, loading uroman data (directory must be provided,
+    listed as default). This only needs to be done once.
+    After that you can romanize from file to file, or just romanize a string."""
+
+    # Compute data_dir based on the location of this executable script.
+    src_dir = os.path.dirname(os.path.realpath(__file__))
+    root_dir = os.path.dirname(src_dir)
+    data_dir = os.path.join(root_dir, "data")
+    # print(src_dir, root_dir, data)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('direct_input', nargs='*', type=str)
+    parser.add_argument('--data_dir', type=Path, default=data_dir, help='uroman resource dir')
+    parser.add_argument('-i', '--input_filename', type=str, help='default: sys.stdin')
+    parser.add_argument('-o', '--output_filename', type=str, help='default: sys.stdout')
+    parser.add_argument('-l', '--lcode', type=str, default=None,
+                        help='ISO 639-3 language code, e.g. eng')
+    # parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, help:'alt: RomFormat.EDGES')
+    parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR,
+                        choices=list(RomFormat), help="Output format of romanization. 'edges' provides offsets")
+    # The remaining arguments are mostly for development and test
+    parser.add_argument('--max_lines', type=int, default=None, help='limit uroman to first n lines')
+    parser.add_argument('--load_log', action='count', default=0, help='report load stats')
+    parser.add_argument('--test', action='count', default=0, help='perform/display a few tests')
+    parser.add_argument('-v', '--verbose', action='count', default=0)
+    parser.add_argument('--rebuild_ud_props', action='count', default=0,
+                        help='rebuild UnicodeDataProps files (for development mode only)')
+    parser.add_argument('--rebuild_num_props', action='count', default=0,
+                        help='rebuild NumProps file (for development mode only)')
+    parser.add_argument('--no_caching', action='count', default=0, help='for development mode: speed')
+    parser.add_argument('--silent', action='count', default=0, help='suppress ... progress')
+    parser.add_argument('-a', '--ablation', type=str, default='', help='for development mode: nocap')
+    parser.add_argument('--stats', action='count', default=0, help='for development mode: numbers')
+    parser.add_argument('--ignore_args', action='count', default=0, help='for usage illustration only')
+    parser.add_argument(PROFILE_FLAG, type=argparse.FileType('w', encoding='utf-8', errors='ignore'),
+                        default=None, metavar='PROFILE-FILENAME', help='(optional output for performance analysis)')
+    args = parser.parse_args()
+    # copy selected (minor) args from argparse.Namespace to dict
+    args_dict = {'rom_format': args.rom_format, 'load_log': args.load_log, 'test': args.test, 'stats': args.stats,
+                 'no_caching': args.no_caching, 'max_lines': args.max_lines, 'verbose': args.verbose,
+                 'rebuild_ud_props': args.rebuild_ud_props, 'rebuild_num_props': args.rebuild_num_props,
+                 'ablation': args.ablation, 'silent': args.silent}
+    pr = None
+    if args.profile:
+        gc.enable()
+        gc.set_debug(gc.DEBUG_STATS)
+        gc.set_debug(gc.DEBUG_LEAK)
+        pr = cProfile.Profile()
+        pr.enable()
+    '''Sample calls:
+uroman.py --help
+uroman.py -i ../test/multi-script.txt -o ../test/multi-script-out2.txt
+uroman.py  < ../test/multi-script.txt  > ../test/multi-script-out2.txt
+uroman.py Игорь
+uroman.py Игорь --lcode ukr
+uroman.py ألاسكا 서울 Καλιφόρνια
+uroman.py ちょっとまってください -f edges
+uroman.py "महात्मा गांधी" -f lattice
+uroman.py สวัสดี --load_log
+uroman.py --test
+uroman.py --ignore_args
+uroman.py Բարեւ -o ../test/tmp-out.txt -f edges
+# In double input cases such as in the line below,
+# the input-file's romanization is sent to stdout, while the direct-input romanization is sent to stderr
+uroman.py ⴰⵣⵓⵍ -i ../test/multi-script.txt > ../test/multi-script-out2.txt
+    '''
+
+    if args.ignore_args:
+        # minimal calls
+        uroman = Uroman(args.data_dir)
+        s, s2, s3, s4 = 'Игорь', 'ちょっとまってください', 'ka‍n‍ne', 'महात्मा गांधी'
+        print(s, uroman.romanize_string(s))
+        print(s, uroman.romanize_string(s, lcode='ukr'))
+        print(s2, Edge.json_str(uroman.romanize_string(s2, rom_format=RomFormat.EDGES)))
+        print(s3, Edge.json_str(uroman.romanize_string(s3, rom_format=RomFormat.EDGES)))
+        print(s4, Edge.json_str(uroman.romanize_string(s4, rom_format=RomFormat.LATTICE)))
+        # Note that ../test/multi-script.txt has several lines starting with ::lcode eng etc.
+        # This allows users to select specific language codes to specific lines, overwriting the overall --lcodes
+        uroman.romanize_file(input_filename='../test/multi-script.txt',
+                             output_filename='../test/multi-script-out3.txt')
+    else:
+        # build a Uroman object (once for many applications and different scripts and languages)
+        uroman = Uroman(args.data_dir, load_log=args.load_log, rebuild_ud_props=args.rebuild_ud_props,
+                        rebuild_num_props=args.rebuild_num_props)
+        romanize_file_p = (args.input_filename or args.output_filename
+                           or not (args.direct_input or args.test or args.ignore_args
+                                   or args.rebuild_ud_props or args.rebuild_num_props))
+        # Romanize any positional arguments, interpreted as strings to be romanized.
+        for s in args.direct_input:
+            result = uroman.romanize_string(s.rstrip(), lcode=args.lcode, **args_dict)
+            result_json = Edge.json_str(result)
+            if romanize_file_p:
+                # input from both file/stdin (to file/stdout) and direct-input (to stderr)
+                if args.input_filename:
+                    sys.stderr.write(result_json + '\n')
+                # input from direct-input (but not from file/stdin) to stdout
+                # else pass
+            # no file/stdin or file/stdout, so we write romanization of direct-input to stdout
+            else:
+                print(result_json)
+        # If provided, apply romanization to an entire file.
+        if romanize_file_p:
+            uroman.romanize_file(args.input_filename, args.output_filename, lcode=args.lcode,
+                                 direct_input=args.direct_input, **args_dict)
+        if args.test:
+            uroman.test_output_of_selected_scripts_and_rom_rules()
+            uroman.test_romanization()
+        if uroman.stats and args.stats:
+            stats100 = {k: uroman.stats[k] for k in list(dict(uroman.stats))[:100]}
+            sys.stderr.write(f'Stats: {stats100} ...\n')
+    if args.profile:
+        if pr:
+            pr.disable()
+            ps = pstats.Stats(pr, stream=args.profile).sort_stats(pstats.SortKey.TIME)
+            ps.print_stats()
+        print(gc.get_stats())
+
+
+# Run the command line interface only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()