# -*- coding: utf-8 -*-
import re
import unicodedata

# List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature
# MKB removed the following elements from the list:
#   - et      U+1F670
#   - ſs, ſz  ẞ, ß  U+00DF
# Additional notes:
# * Some classes of characters were listed in the original utf8 fixes but I'm
#   not sure they don't belong elsewhere (end-user processing). In those cases,
#   a pass through unidecode should normalize them to proper ascii. They are
#   listed here with reasoning:
#     - Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf
#       r'[\u0300-\u036F]': ''
#     - Ditch chars that sometimes (incorrectly?) appear as combining
#       diacritics: r'(?:\xa8|[\u02C0-\u02DF])': ''
# * Should we run ftfy?
# Tab-separated table of ligatures: ascii form(s), ligature char(s), Unicode
# codepoint(s), display. Only the first two columns are parsed below; the
# separators MUST be tabs (written as \t escapes so they survive editing).
ligature_table = """
AA, aa\t\uA732, \uA733\tU+A732, U+A733\t\uA732 \uA733
AE, ae\t\u00C6, \u00E6\tU+00C6, U+00E6\t\u00C6 \u00E6
AO, ao\t\uA734, \uA735\tU+A734, U+A735\t\uA734 \uA735
AU, au\t\uA736, \uA737\tU+A736, U+A737\t\uA736 \uA737
AV, av\t\uA738, \uA739\tU+A738, U+A739\t\uA738 \uA739
AV, av\t\uA73A, \uA73B\tU+A73A, U+A73B\t\uA73A \uA73B
AY, ay\t\uA73C, \uA73D\tU+A73C, U+A73D\t\uA73C \uA73D
ff\t\uFB00\tU+FB00\t\uFB00
ffi\t\uFB03\tU+FB03\t\uFB03
ffl\t\uFB04\tU+FB04\t\uFB04
fi\t\uFB01\tU+FB01\t\uFB01
fl\t\uFB02\tU+FB02\t\uFB02
OE, oe\t\u0152, \u0153\tU+0152, U+0153\t\u0152 \u0153
OO, oo\t\uA74E, \uA74F\tU+A74E, U+A74F\t\uA74E \uA74F
st\t\uFB06\tU+FB06\t\uFB06
\u017Ft\t\uFB05\tU+FB05\t\uFB05
TZ, tz\t\uA728, \uA729\tU+A728, U+A729\t\uA728 \uA729
ue\t\u1D6B\tU+1D6B\t\u1D6B
VY, vy\t\uA760, \uA761\tU+A760, U+A761\t\uA760 \uA761
db\t\u0238\tU+0238\t\u0238
dz\t\u02A3\tU+02A3\t\u02A3
d\u0291\t\u02A5\tU+02A5\t\u02A5
d\u0292\t\u02A4\tU+02A4\t\u02A4
f\u014B\t\u02A9\tU+02A9\t\u02A9
IJ, ij\t\u0132, \u0133\tU+0132, U+0133\t\u0132 \u0133
ls\t\u02AA\tU+02AA\t\u02AA
lz\t\u02AB\tU+02AB\t\u02AB
l\u0292\t\u026E\tU+026E\t\u026E
qp\t\u0239\tU+0239\t\u0239
t\u0255\t\u02A8\tU+02A8\t\u02A8
ts\t\u02A6\tU+02A6\t\u02A6
t\u0283\t\u02A7\tU+02A7\t\u02A7
ui\t\uAB50\tU+AB50\t\uAB50
ui\t\uAB51\tU+AB51\t\uAB51
"""

# Map each ligature character to its plain-ascii expansion. The ascii side is
# NFKC-normalized so entries like 'ſt' collapse to 'st'.
unicode_mapping = {}
for row in ligature_table.split('\n'):
    # Data rows have >= 3 tab-separated columns; skip blanks / malformed rows.
    if row.count('\t') <= 1:
        continue
    ascii_col, char_col = row.split('\t')[:2]
    # Columns may hold comma-separated pairs (uppercase, lowercase).
    for ascii_form, ligature in zip(ascii_col.split(','), char_col.split(',')):
        unicode_mapping[ligature.strip()] = unicodedata.normalize(
            'NFKC', ascii_form.strip()
        )

# Manual additions: regex patterns (not bare characters) mapping typographic
# punctuation to ascii equivalents.
unicode_mapping.update({
    # 'ẞ, ß': careful, some use this for \beta
    r'(\B)\u00DF': r'\1ss',
    # Additions (manual normalization that we feel is important)
    # unicode space u'\xa0' (not \x{0c} = ^L keep!)
    '\xa0': ' ',
    # single + double quotes, dash, and asterisk
    r'[\u2018\u2019]': r"'",
    r'[\u201C\u201D]': r'"',
    r'[\xad\u2014]': r'-',
    r'\xb7': r'*'
})
def fix_unicode(txt: str) -> str:
    """
    Given UTF-8 encoded text, remove typographical ligatures (normalize to true
    non-display character set) and do a general normalization of the unicode
    so that possible redundant characters are simplified to a single set.

    Parameters
    ----------
    txt : str
        Input text as a decoded unicode string (not raw bytes).

    Returns
    -------
    str
        Text with ligatures expanded, typographic punctuation replaced, and
        NFKC normalization applied.
    """
    for pattern, replacement in unicode_mapping.items():
        # Keys are regex patterns (some carry capture groups, e.g. the ß→ss
        # rule), so regex substitution is required rather than str.replace.
        txt = re.sub(pattern, replacement, txt)
    # Final NFKC pass collapses any remaining compatibility characters.
    return unicodedata.normalize('NFKC', txt)