nb-wav2vec2-1b-bokmaal-v2 / cardinal_numbers.py
versae's picture
NB-Wav2Vec2-1B-Bokmaal-v2
c7f7d90
raw
history blame
27.3 kB
#!/usr/bin/env python
# coding: utf-8
"""From https://github.com/peresolb/number-conversion/"""
import sys
import os
import nltk
# Dict for basic primitive numbers: 1-10
b = {
1: "én",
2: "to",
3: "tre",
4: "fire",
5: "fem",
6: "seks",
7: "sju",
8: "åtte",
9: "ni",
10: "ti",
}
b_nn = {
1: "ein",
2: "to",
3: "tre",
4: "fire",
5: "fem",
6: "seks",
7: "sju",
8: "åtte",
9: "ni",
10: "ti",
}
# Dict for teen primitive numbers: 11-19
t = {
11: "elleve",
12: "tolv",
13: "tretten",
14: "fjorten",
15: "femten",
16: "seksten",
17: "sytten",
18: "atten",
19: "nitten",
}
# Dict for two digit primitive numbers: 20-90
do = {
20: "tjue",
30: "tretti",
40: "førti",
50: "femti",
60: "seksti",
70: "sytti",
80: "åtti",
90: "nitti",
}
# Dict for the 3 digit primitive number: 100
doo = {100: "hundre"}
# Dict for the 4 digit primitive number: 1000
dooo = {1000: "tusen"}
# Reverser function for primitive number dicts
def _revdict(numberdict):
newdict = {}
for k, v in numberdict.items():
newdict[v] = k
return newdict
# The reverse of the primitive number dicts, where strings are keys and integers are values,
# are name of original dict underscore 'r'
b_r, b_nn_r, t_r, do_r, doo_r, dooo_r = (
_revdict(b),
_revdict(b_nn),
_revdict(t),
_revdict(do),
_revdict(doo),
_revdict(dooo),
)
def _oneten(nr, reverse=False, nn=False):
"""Function taking an int from 1-10 and returning the corresponding word,
or, if reverse=True, taking a numberword from 1-10 and returning the digit"""
if reverse == False:
if not type(nr) is int:
return None
if nr <= 10:
if nn == False:
return b[nr]
else:
return b_nn[nr]
else:
if not type(nr) is str:
return None
if nn == False:
if nr in b_r.keys():
return b_r[nr]
else:
if nr in b_nn_r.keys():
return b_nn_r[nr]
def _onedig(nr, reverse=False, nn=False):
if reverse == False:
if not _oneten(nr) == "ti":
if nn == False:
return _oneten(nr)
else:
return _oneten(nr, nn=True)
if reverse == True:
if not _oneten(nr, reverse=True) == 10:
if nn == False:
return _oneten(nr, reverse=True)
else:
return _oneten(nr, reverse=True, nn=True)
def _teen(nr, reverse=False):
"""Function taking a primitive two-digit int in the teen range and returning the
corresponding word, or, if reverse=True, the corresponding number word"""
if reverse == False:
if not type(nr) is int:
return None
if nr in t.keys():
return t[nr]
else:
if not type(nr) is str:
return None
if nr in t_r.keys():
return t_r[nr]
def _twodig(nr, reverse=False):
"""Function taking a primitive two-digit int in the range 20-90 and returning
the corresponding word, or, if reverse=True, the corresponding number word"""
if reverse == False:
if not type(nr) is int:
return None
if nr in do.keys():
return do[nr]
else:
if not type(nr) is str:
return None
if nr in do_r.keys():
return do_r[nr]
def _numparser(numword, nn=False):
"""Parse word to see if they start with a wd in firstnumwords.
If yes, return firstnumword and second part"""
if not type(numword) is str:
return None
firstpart = ""
scndpart = ""
firstnumwords = list(do_r.keys())
for s in firstnumwords:
if numword.startswith(s):
slength = len(s)
firstpart = s
scndpart = numword[slength:]
if nn == False:
if (
scndpart in b_r.keys() and b_r[scndpart] < 10
): # Only return if second part is dig below 10
return (firstpart, scndpart)
else:
if (
scndpart in b_nn_r.keys() and b_nn_r[scndpart] < 10
): # Only return if second part is dig below 10
return (firstpart, scndpart)
def _one_to_nineteen(nr, reverse=False, nn=False):
"""Function taking a primitive two-digit int in the range 1-19
and returning the corresponding word, or, if reverse=True, the corresponding number word"""
if reverse == False:
if not type(nr) is int:
return None
if nr < 11:
if nn == False:
return _oneten(nr)
else:
return _oneten(nr, nn=True)
elif nr < 20:
return _teen(nr)
else:
if not type(nr) is str:
return None
if nn == False:
if type(_oneten(nr, reverse=True)) is int:
return _oneten(nr, reverse=True)
elif type(_teen(nr, reverse=True)):
return _teen(nr, reverse=True)
else:
if type(_oneten(nr, reverse=True, nn=True)) is int:
return _oneten(nr, reverse=True, nn=True)
elif type(_teen(nr, reverse=True)):
return _teen(nr, reverse=True)
def _one_to_nn(nr, reverse=False, nn=False):
"""Function taking an int in the range 1-99 and returning the corresponding word. Reverse as before"""
if reverse == False:
if not type(nr) is int:
return None
if nr > 0:
if nr < 20:
if nn == False:
return _one_to_nineteen(nr)
else:
return _one_to_nineteen(nr, nn=True)
elif nr < 100:
if nr in do.keys():
return _twodig(nr)
else:
nrstring = str(nr)
frstdig = int(nrstring[0]) * 10
scndig = int(nrstring[1])
frstwd = _twodig(frstdig)
if nn == False:
scnwd = _onedig(scndig)
else:
scnwd = _onedig(scndig, nn=True)
nrwd = frstwd + scnwd
return nrwd
else:
if not type(nr) is str:
return None
if nn == False:
if type(_one_to_nineteen(nr, reverse=True)) is int:
return _one_to_nineteen(nr, reverse=True)
elif type(_twodig(nr, reverse=True)) is int:
return _twodig(nr, reverse=True)
else:
if _numparser(nr) == None:
return None
parsed = _numparser(nr)
first = _twodig(parsed[0], reverse=True)
second = _one_to_nineteen(parsed[1], reverse=True)
return first + second
else:
if type(_one_to_nineteen(nr, reverse=True, nn=True)) is int:
return _one_to_nineteen(nr, reverse=True, nn=True)
elif type(_twodig(nr, reverse=True)) is int:
return _twodig(nr, reverse=True)
else:
if _numparser(nr, nn=True) == None:
return None
parsed = _numparser(nr, nn=True)
first = _twodig(parsed[0], reverse=True)
second = _one_to_nineteen(parsed[1], reverse=True, nn=True)
return first + second
def _one_to_nnn(nr, reverse=False, nn=False):
"""Function taking an int in the range 1-999 and returning the corresponding word. Reverse as before"""
if reverse == False:
if not type(nr) is int:
return None
if nr == 0:
return None
if nr < 100: # 1-99
if nn == False:
return _one_to_nn(nr)
else:
return _one_to_nn(nr, nn=True)
elif nr < 1000:
if nr == 100: # 100
return doo[100]
else:
nrstring = str(nr) # 435 181
frstdig = int(nrstring[0]) # 4 1
scndig = int(nrstring[1]) # 3 8
thrdig = int(nrstring[2]) # 5 1
scthdig = int(nrstring[1:]) # 35 81
if nn == False:
frstwd = _onedig(frstdig) # fire
else:
frstwd = _onedig(frstdig, nn=True)
nrwd = ""
if scndig == 0: # 405 or 400
if thrdig == 0: # 400
nrwd = "%s %s" % (frstwd, doo[100]) # fire hundre
else: # 405
if nn == False:
thrdwd = _one_to_nn(thrdig) # fem
else:
thrdwd = _one_to_nn(thrdig, nn=True)
if frstdig != 1:
nrwd = "%s %s og %s" % (
frstwd,
doo[100],
thrdwd,
) # fire hundre og fem
else:
nrwd = "%s og %s" % (doo[100], thrdwd) # hundre og fem
else: # 435
scthwd = ""
if nn == False:
scthwd = _one_to_nn(scthdig) # trettifem
else:
scthwd = _one_to_nn(scthdig, nn=True)
if frstdig != 1:
nrwd = "%s %s og %s" % (frstwd, doo[100], scthwd)
else:
nrwd = "%s og %s" % (doo[100], scthwd) # hundre og trettifem
return nrwd
else:
if not type(nr) is str:
return None
if type(doo_r.get(nr, None)) is int: # hundre - 100
return doo_r[nr]
elif (
len(nr.split(" ")) == 1
and type(_one_to_nn(nr, reverse=True)) is int
and nn == False
):
return _one_to_nn(nr, reverse=True) # 44
elif (
len(nr.split(" ")) == 1
and type(_one_to_nn(nr, reverse=True, nn=True)) is int
and nn == True
):
return _one_to_nn(nr, reverse=True, nn=True) # 44
elif len(nr.split(" ")) == 2: # to hundre
splitwords = nr.split(" ")
if nn == False and nr == "ett hundre":
return 100
elif nn == True and nr == "eitt hundre":
return 100
elif (
type(_one_to_nn(splitwords[0], reverse=True)) is int
and splitwords[1] == "hundre"
):
return _one_to_nn(splitwords[0], reverse=True) * 100
elif len(nr.split(" ")) == 3: # hundre og tre
splitwords = nr.split(" ")
if splitwords[0] == "hundre" and splitwords[1] == "og":
if nn == False:
if type(_one_to_nn(splitwords[2], reverse=True)) is int:
return 100 + _one_to_nn(splitwords[2], reverse=True)
else:
if type(_one_to_nn(splitwords[2], reverse=True, nn=True)) is int:
return 100 + _one_to_nn(splitwords[2], reverse=True, nn=True)
else:
return None
elif len(nr.split(" ")) == 4: # ett hundre og trettifire, fire hundre og åtte
splitwords = nr.split(" ")
if nn == False:
if (
splitwords[0] == "ett"
and splitwords[1] == "hundre"
and splitwords[2] == "og"
and type(_one_to_nn(splitwords[3], reverse=True)) is int
): # ett hundre og trettifire
return 100 + _one_to_nn(splitwords[3], reverse=True)
elif (
type(_one_to_nn(splitwords[0], reverse=True)) is int
and _one_to_nn(splitwords[0], reverse=True) < 10
and splitwords[1] == "hundre"
and splitwords[2] == "og"
and type(_one_to_nn(splitwords[3], reverse=True)) is int
): # fire hundre og trettifire
hundreds = _one_to_nn(splitwords[0], reverse=True) * 100
tens = _one_to_nn(splitwords[3], reverse=True)
return hundreds + tens
else:
return None
else:
if (
splitwords[0] == "eitt"
and splitwords[1] == "hundre"
and splitwords[2] == "og"
and type(_one_to_nn(splitwords[3], reverse=True, nn=True)) is int
): # eit hundre og trettifire
return 100 + _one_to_nn(splitwords[3], reverse=True, nn=True)
elif (
type(_one_to_nn(splitwords[0], reverse=True, nn=True)) is int
and _one_to_nn(splitwords[0], reverse=True, nn=True) < 10
and splitwords[1] == "hundre"
and splitwords[2] == "og"
and type(_one_to_nn(splitwords[3], reverse=True, nn=True)) is int
): # fire hundre og trettifire
hundreds = _one_to_nn(splitwords[0], reverse=True, nn=True) * 100
tens = _one_to_nn(splitwords[3], reverse=True, nn=True)
return hundreds + tens
else:
return None
def _high_hundred(nr, nn=False):
"""In Norwegian, as in English, it is possible to express the numbers 1100-1999 with hundreds,
e.g. "tolv hundre og nittiåtte", /twelve hundred and ninety-eight/. We want to be able to convert
these to integers. However, we don't need to produce them, so this algoritm only goes from strings to integers"""
if not type(nr) is str:
return None
if len(nr.split(" ")) > 1:
frstwd = nr.split(" ")[0]
if not type(_teen(frstwd, reverse=True)) is int:
return None
frstdig = _teen(frstwd, reverse=True)
if len(nr.split(" ")) == 2 and nr.split(" ")[1] == "hundre": # femten hundre
return frstdig * 100
elif (
len(nr.split(" ")) == 4
and nr.split(" ")[1] == "hundre"
and nr.split(" ")[2] == "og"
):
if (
nn == False and type(_one_to_nn(nr.split(" ")[3], reverse=True)) is int
): # femten hundre og førtito
lastdigs = _one_to_nn(nr.split(" ")[3], reverse=True)
return (frstdig * 100) + lastdigs
elif (
nn == True
and type(_one_to_nn(nr.split(" ")[3], reverse=True, nn=True)) is int
): # femten hundre og førtito
lastdigs = _one_to_nn(nr.split(" ")[3], reverse=True, nn=True)
return (frstdig * 100) + lastdigs
def _one_to_nnnnnn(nr, reverse=False, nn=False):
"""Function taking an int in the range 1-999999 and returning the corresponding word. Reverse as before"""
if reverse == False:
if not type(nr) is int:
return None
if nr == 0:
return None
if nr < 1000: # 1-999
if nn == False:
return _one_to_nnn(nr)
else:
return _one_to_nnn(nr, nn=True)
elif nr < 1000000: # 1000-999999
if nr == 1000: # 1000
if nn == False:
return "ett tusen"
else:
return "eitt tusen"
else:
nrstring = str(nr) # Starting with last three digits. e.g. 23[456]
ultdig = int(nrstring[-1]) # 6
penultdig = int(nrstring[-2]) # 5
antepenultdig = int(nrstring[-3]) # 4
ult_and_penultdig = int(nrstring[-2:]) # 56
ult_penult_antepenultdig = int(nrstring[-3:]) # 456
tailstring = ""
if antepenultdig == 0: # 012, 002, 000
if penultdig == 0: # 000, 002
if ultdig == 0: # 000
tailstring = "tusen"
else: # 002
if nn == False:
ultstring = _one_to_nnn(ultdig)
tailstring = "tusen og %s" % ultstring # tusen og to
else:
ultstring = _one_to_nnn(ultdig, nn=True)
tailstring = "tusen og %s" % ultstring # tusen og to
else: # 012
if nn == False:
ult_and_penultstring = _one_to_nnn(ult_and_penultdig)
tailstring = (
"tusen og %s" % ult_and_penultstring
) # tusen og tolv
else:
ult_and_penultstring = _one_to_nnn(
ult_and_penultdig, nn=True
)
tailstring = (
"tusen og %s" % ult_and_penultstring
) # tusen og tolv
else: # 456
if nn == False:
ult_penult_antepenultstring = _one_to_nnn(
ult_penult_antepenultdig
)
if str(ult_penult_antepenultdig)[0] == "1":
tailstring = (
"tusen ett %s" % ult_penult_antepenultstring
) # tusen ett hundre
else:
tailstring = (
"tusen %s" % ult_penult_antepenultstring
) # tusen fire hundre og femtiseks
else:
ult_penult_antepenultstring = _one_to_nnn(
ult_penult_antepenultdig, nn=True
)
if str(ult_penult_antepenultdig)[0] == "1":
tailstring = (
"tusen eitt %s" % ult_penult_antepenultstring
) # tusen ett hundre
else:
tailstring = (
"tusen %s" % ult_penult_antepenultstring
) # tusen fire hundre og femtiseks
startdigs = int(
nrstring[:-3]
) # startstring can consist of the 1, 2 or 3 first digits
startstring = ""
if startdigs == 1: # 1001 starts with "ett"
if nn == False:
startstring = "ett"
else:
startstring = "eitt"
elif startdigs > 99 and startdigs < 200: # 155555 starts with ett
if nn == False:
startnumstring = _one_to_nnn(startdigs)
startstring = "ett %s" % startnumstring
else:
startnumstring = _one_to_nnn(startdigs, nn=True)
startstring = "eitt %s" % startnumstring
else: # the remaining numbers are purely compositional
if nn == False:
startstring = _one_to_nnn(startdigs)
else:
startstring = _one_to_nnn(startdigs, nn=True)
numstring = "%s %s" % (startstring, tailstring)
return numstring
else:
if not type(nr) is str:
return None
if type(_one_to_nnn(nr, reverse=True)) is int and nn == False:
return _one_to_nnn(nr, reverse=True) # 444
elif type(_one_to_nnn(nr, reverse=True, nn=True)) is int and nn == True:
return _one_to_nnn(nr, reverse=True, nn=True) # 444
elif nr == "tusen": # tusen - 1000
return 1000
elif (
len(nr.split(" ")) > 1 and nr.split(" ")[-1] == "tusen"
): # ett tusen, ett hundre tusen etc.
wdlist = nr.split(" ")
firstphrase = " ".join(wdlist[:-1])
if type(_one_to_nnn(firstphrase, reverse=True)) is int and nn == False:
firstdig = _one_to_nnn(firstphrase, reverse=True)
return firstdig * 1000
elif (
type(_one_to_nnn(firstphrase, reverse=True, nn=True)) is int
and nn == True
):
firstdig = _one_to_nnn(firstphrase, reverse=True, nn=True)
return firstdig * 1000
elif (
len(nr.split(" ")) == 2 and nr.split(" ")[0] == "ett" and nn == False
): # ett tusen
return 1000
elif len(nr.split(" ")) == 2 and nr.split(" ")[0] == "eitt" and nn == True:
return 1000
else: # misspellings should not result in return value
return None
else:
if len(nr.split(" ")) > 1: # all other numbers should contain spaces
numwordlist = nr.split(" ")
if (
"tusen" in numwordlist
): # Find last part of numphrase, which starts with "tusen"
tusenind = numwordlist.index("tusen") # find index of "tusen"
lastwords = numwordlist[tusenind:] # words from 'tusen'
firstwords = numwordlist[:tusenind] # words until 'tusen'
lastdigs = 0
if len(lastwords) == 3:
if lastwords[1] == "og": # 'tusen og fire' 'tusen og førtifire'
lastword = lastwords[-1]
if nn == False:
lastdigs = _one_to_nnn(lastword, reverse=True)
elif nn == True:
lastdigs = _one_to_nnn(lastword, reverse=True, nn=True)
elif (
nn == False
and type(_one_to_nnn(" ".join(lastwords[1:]), reverse=True))
is int
and lastwords[2] == "hundre"
): # tusen to hundre
hundredphrase = " ".join(lastwords[1:])
lastdigs = _one_to_nnn(hundredphrase, reverse=True)
elif (
nn == True
and type(
_one_to_nnn(
" ".join(lastwords[1:]), reverse=True, nn=True
)
)
is int
and lastwords[2] == "hundre"
): # tusen to hundre
hundredphrase = " ".join(lastwords[1:])
lastdigs = _one_to_nnn(hundredphrase, reverse=True, nn=True)
else: # misspellings should not result in return value
return None
elif (
len(lastwords) == 5 and lastwords[2] == "hundre"
): # 'tusen fire hundre og fem'
hundredphrase = " ".join(lastwords[1:])
if nn == False:
lastdigs = _one_to_nnn(hundredphrase, reverse=True)
else:
lastdigs = _one_to_nnn(hundredphrase, reverse=True, nn=True)
else: # misspellings should not result in return value
return None
firstdigs = 0
firstphrase = " ".join(firstwords)
if len(firstwords) == 0: # as in 'tusen og tretti'
firstdigs = 1000
elif (
len(firstwords) == 1 and firstwords[0] == "ett" and nn == False
):
firstdigs = 1000
elif (
len(firstwords) == 1 and firstwords[0] == "eitt" and nn == True
):
firstdigs = 1000
elif (
type(_one_to_nnn(firstphrase, reverse=True)) is int
and nn == False
):
firstdigs = _one_to_nnn(firstphrase, reverse=True) * 1000
elif (
type(_one_to_nnn(firstphrase, reverse=True, nn=True)) is int
and nn == True
):
firstdigs = (
_one_to_nnn(firstphrase, reverse=True, nn=True) * 1000
)
else: # misspellings should not result in return value
return None
if type(firstdigs) is int and type(lastdigs) is int:
return firstdigs + lastdigs
def convert_nums(nr, reverse=False, nn=False):
"""Functions for converting numbers. Only works for numbers in range 0-999999 for now"""
if reverse == False:
if type(nr) is int:
returnstring = ""
if nr == 0:
returnstring = "null"
elif nr < 1000000:
if nn == False:
returnstring = _one_to_nnnnnn(nr)
else:
returnstring = _one_to_nnnnnn(nr, nn=True)
else:
return None
return returnstring
else:
if type(nr) is str:
returnint = 0
if nr == "null":
returnint = 0
elif nn == False and type(_one_to_nnnnnn(nr, reverse=True)) is int:
returnint = _one_to_nnnnnn(nr, reverse=True)
elif nn == True and type(_one_to_nnnnnn(nr, reverse=True, nn=True)) is int:
returnint = _one_to_nnnnnn(nr, reverse=True, nn=True)
elif nn == False and type(_high_hundred(nr)) is int:
returnint = _high_hundred(nr)
elif nn == True and type(_high_hundred(nr, nn=True)) is int:
returnint = _high_hundred(nr, nn=True)
else:
return None
return returnint
if __name__ == "__main__":
# testing
mydigit = 243564
mynumstring = "hundre og atten tusen fire hundre og trettién"
mydigit_nn = 34381
mynumstring_nn = "tre hundre og førtiein"
print(
"Digit conversion bm: %s\nString conversion bm: %s"
% (convert_nums(mydigit), convert_nums(mynumstring, reverse=True))
)
print(
"Digit conversion nn: %s\nString conversion nn: %s"
% (
convert_nums(mydigit_nn, nn=True),
convert_nums(mynumstring_nn, nn=True, reverse=True),
)
)