Initial model
Browse files- README.md +220 -0
- malromur_test.csv +0 -0
- normalizer.py +139 -0
- num2words/__init__.py +50 -0
- num2words/base.py +306 -0
- num2words/compat.py +29 -0
- num2words/currency.py +50 -0
- num2words/lang_EU.py +93 -0
- num2words/lang_IS.py +128 -0
- num2words/utils.py +35 -0
README.md
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: is
|
3 |
+
datasets:
|
4 |
+
- malromur
|
5 |
+
tags:
|
6 |
+
- audio
|
7 |
+
- automatic-speech-recognition
|
8 |
+
- speech
|
9 |
+
- xlsr-fine-tuning-week
|
10 |
+
license: apache-2.0
|
11 |
+
widget:
|
12 |
+
- label: Malromur sample 11
|
13 |
+
src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/resolve/main/sample11.flac
|
14 |
+
- label: Malromur sample 74
|
15 |
+
src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/resolve/main/sample74.flac
|
16 |
+
model-index:
|
17 |
+
- name: XLSR Wav2Vec2 Icelandic by Mehrdad Farahani
|
18 |
+
results:
|
19 |
+
- task:
|
20 |
+
name: Speech Recognition
|
21 |
+
type: automatic-speech-recognition
|
22 |
+
dataset:
|
23 |
+
name: Malromur is
|
24 |
+
type: malromur
|
25 |
+
args: is
|
26 |
+
metrics:
|
27 |
+
- name: Test WER
|
28 |
+
type: wer
|
29 |
+
value: 12.00
|
30 |
+
|
31 |
+
---
|
32 |
+
|
33 |
+
# Wav2Vec2-Large-XLSR-53-Icelandic
|
34 |
+
|
35 |
+
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Icelandic using [Malromur](https://clarin.is/en/resources/malromur/). When using this model, make sure that your speech input is sampled at 16kHz.
|
36 |
+
|
37 |
+
## Usage
|
38 |
+
The model can be used directly (without a language model) as follows:
|
39 |
+
|
40 |
+
**Requirements**
|
41 |
+
```bash
|
42 |
+
# requirement packages
|
43 |
+
!pip install git+https://github.com/huggingface/datasets.git
|
44 |
+
!pip install git+https://github.com/huggingface/transformers.git
|
45 |
+
!pip install torchaudio
|
46 |
+
!pip install librosa
|
47 |
+
!pip install jiwer
|
48 |
+
!pip install num2words
|
49 |
+
```
|
50 |
+
|
51 |
+
**Normalizer**
|
52 |
+
```bash
|
53 |
+
|
54 |
+
# num2word packages
|
55 |
+
# Original source: https://github.com/savoirfairelinux/num2words
|
56 |
+
!mkdir -p ./num2words
|
57 |
+
!wget -O num2words/__init__.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/__init__.py
|
58 |
+
!wget -O num2words/base.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/base.py
|
59 |
+
!wget -O num2words/compat.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/compat.py
|
60 |
+
!wget -O num2words/currency.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/currency.py
|
61 |
+
!wget -O num2words/lang_EU.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/lang_EU.py
|
62 |
+
!wget -O num2words/lang_IS.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/lang_IS.py
|
63 |
+
!wget -O num2words/utils.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/utils.py
|
64 |
+
|
65 |
+
# Malromur_test selected based on gender and age
|
66 |
+
!wget -O malromur_test.csv https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/malromur_test.csv
|
67 |
+
|
68 |
+
# Normalizer
|
69 |
+
!wget -O normalizer.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/normalizer.py
|
70 |
+
```
|
71 |
+
|
72 |
+
**Prediction**
|
73 |
+
```python
|
74 |
+
import librosa
|
75 |
+
import torch
|
76 |
+
import torchaudio
|
77 |
+
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
78 |
+
from datasets import load_dataset
|
79 |
+
|
80 |
+
import numpy as np
|
81 |
+
import re
|
82 |
+
import string
|
83 |
+
|
84 |
+
import IPython.display as ipd
|
85 |
+
|
86 |
+
from normalizer import Normalizer
|
87 |
+
|
88 |
+
normalizer = Normalizer(lang="is")
|
89 |
+
|
90 |
+
|
91 |
+
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["path"]`` and store it resampled to 16 kHz.

    Args:
        batch: A dataset row; it must contain a ``"path"`` entry that
            ``torchaudio.load`` can read.

    Returns:
        The same ``batch`` with the resampled waveform stored under
        ``"speech"``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    # Pass the rates by keyword: librosa >= 0.10 no longer accepts them
    # positionally, and older releases use the same keyword names.
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=16_000,
    )

    batch["speech"] = speech_array
    return batch
|
98 |
+
|
99 |
+
|
100 |
+
def predict(batch):
    """Transcribe ``batch["speech"]`` and store the text under ``"predicted"``.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        batch: A dataset row holding a 16 kHz waveform under ``"speech"``.

    Returns:
        The same ``batch`` with the decoded transcription added.
    """
    encoded = processor(
        batch["speech"],
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True,
    )
    inputs = encoded.input_values.to(device)
    mask = encoded.attention_mask.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(inputs, attention_mask=mask)

    # Keep the highest-scoring token id at every position.
    token_ids = torch.argmax(outputs.logits, dim=-1)

    # A single example is processed per call, hence the first decoded string.
    batch["predicted"] = processor.batch_decode(token_ids)[0]
    return batch
|
113 |
+
|
114 |
+
|
115 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
116 |
+
processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic")
|
117 |
+
model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic").to(device)
|
118 |
+
|
119 |
+
dataset = load_dataset("csv", data_files={"test": "./malromur_test.csv"})["test"]
|
120 |
+
dataset = dataset.map(
|
121 |
+
normalizer,
|
122 |
+
fn_kwargs={"remove_extra_space": True},
|
123 |
+
remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
|
124 |
+
)
|
125 |
+
|
126 |
+
dataset = dataset.map(speech_file_to_array_fn)
|
127 |
+
result = dataset.map(predict)
|
128 |
+
|
129 |
+
max_items = np.random.randint(0, len(result), 20).tolist()
|
130 |
+
for i in max_items:
|
131 |
+
reference, predicted = result["sentence"][i], result["predicted"][i]
|
132 |
+
print("reference:", reference)
|
133 |
+
print("predicted:", predicted)
|
134 |
+
print('---')
|
135 |
+
```
|
136 |
+
|
137 |
+
**Output:**
|
138 |
+
```text
|
139 |
+
SOON
|
140 |
+
```
|
141 |
+
|
142 |
+
|
143 |
+
## Evaluation
|
144 |
+
|
145 |
+
The model can be evaluated as follows on the Malromur test split (`malromur_test.csv`).
|
146 |
+
|
147 |
+
```python
|
148 |
+
import librosa
|
149 |
+
import torch
|
150 |
+
import torchaudio
|
151 |
+
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
152 |
+
from datasets import load_dataset, load_metric
|
153 |
+
|
154 |
+
import numpy as np
|
155 |
+
import re
|
156 |
+
import string
|
157 |
+
|
158 |
+
from normalizer import Normalizer
|
159 |
+
|
160 |
+
normalizer = Normalizer(lang="is")
|
161 |
+
|
162 |
+
|
163 |
+
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["path"]`` and store it resampled to 16 kHz.

    Args:
        batch: A dataset row; it must contain a ``"path"`` entry that
            ``torchaudio.load`` can read.

    Returns:
        The same ``batch`` with the resampled waveform stored under
        ``"speech"``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    # Pass the rates by keyword: librosa >= 0.10 no longer accepts them
    # positionally, and older releases use the same keyword names.
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=16_000,
    )

    batch["speech"] = speech_array
    return batch
|
170 |
+
|
171 |
+
|
172 |
+
def predict(batch):
    """Transcribe ``batch["speech"]`` and store the text under ``"predicted"``.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        batch: A dataset row holding a 16 kHz waveform under ``"speech"``.

    Returns:
        The same ``batch`` with the decoded transcription added.
    """
    encoded = processor(
        batch["speech"],
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True,
    )
    inputs = encoded.input_values.to(device)
    mask = encoded.attention_mask.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(inputs, attention_mask=mask)

    # Keep the highest-scoring token id at every position.
    token_ids = torch.argmax(outputs.logits, dim=-1)

    # A single example is processed per call, hence the first decoded string.
    batch["predicted"] = processor.batch_decode(token_ids)[0]
    return batch
|
185 |
+
|
186 |
+
|
187 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
188 |
+
processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic")
|
189 |
+
model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic").to(device)
|
190 |
+
|
191 |
+
dataset = load_dataset("csv", data_files={"test": "./malromur_test.csv"})["test"]
|
192 |
+
dataset = dataset.map(
|
193 |
+
normalizer,
|
194 |
+
fn_kwargs={"remove_extra_space": True},
|
195 |
+
remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
|
196 |
+
)
|
197 |
+
|
198 |
+
dataset = dataset.map(speech_file_to_array_fn)
|
199 |
+
result = dataset.map(predict)
|
200 |
+
|
201 |
+
wer = load_metric("wer")
|
202 |
+
|
203 |
+
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
|
204 |
+
```
|
205 |
+
]
|
206 |
+
|
207 |
+
**Test Result**:
|
208 |
+
- WER: 12.00%
|
209 |
+
|
210 |
+
|
211 |
+
## Training & Report
|
212 |
+
The Malromur `train`, `validation` datasets were used for training.
|
213 |
+
|
214 |
+
You can see the training states [here](#)
|
215 |
+
|
216 |
+
The script used for training can be found [here](#)
|
217 |
+
|
218 |
+
|
219 |
+
## Questions?
|
220 |
+
Post a Github issue on the [Wav2Vec](https://github.com/m3hrdadfi/wav2vec) repo.
|
malromur_test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
normalizer.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import sys
|
3 |
+
import textwrap
|
4 |
+
from typing import Any, Dict, Optional
|
5 |
+
from num2words import num2words
|
6 |
+
|
7 |
+
|
8 |
+
class Normalizer:
    """Normalize transcript text before training or scoring.

    Calling an instance on a batch dictionary runs these steps on one text
    field: strip, optional lowercasing, dictionary-based replacement,
    whitelist filtering and (when ``lang`` is set) spelling out integers.
    """

    # Regex for a run of characters to keep; anything else acts as a separator.
    _whitelist = r"[0-9a-zádðéíóúýþæö]+"
    # Default replacement table (empty). Instances receive their own copy.
    _dictionary = {}
    _text_key_name: str = "sentence"
    _do_lowercase: bool = True

    def __init__(
        self,
        whitelist: str = None,
        dictionary: Dict[str, str] = None,
        lang: str = None
    ) -> None:
        """Create a normalizer.

        Args:
            whitelist (str, optional): Regex matching the character runs to
                keep. Falls back to the class default when empty or not a
                string.
            dictionary (Dict[str, str], optional): Substring -> replacement
                table used by :meth:`chars_to_map`. Falls back to an empty
                table when empty or not a dict.
            lang (str, optional): Language code handed to ``num2words``.
                When falsy, numbers are left as digits.
        """
        self.text_key_name = self._text_key_name
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        # Copy the class default so mutating one instance's table cannot leak
        # into every other instance through the shared class attribute.
        self.dictionary = (
            dictionary if dictionary and isinstance(dictionary, dict) else dict(self._dictionary)
        )
        self.do_lowercase = self._do_lowercase
        self.lang = lang

    def chars_to_map(self, sentence: str) -> str:
        """Replace every dictionary key found in ``sentence`` by its value.

        Args:
            sentence (str): A piece of text.

        Returns:
            str: The text with replacements applied, or ``sentence`` itself
            when the dictionary is empty.
        """
        if not self.dictionary:
            return sentence

        # Regex alternation takes the first alternative that matches, so the
        # longest keys go first; otherwise a key that is a prefix of another
        # (e.g. "a" vs "ab") would make the longer key unreachable.
        keys = sorted(self.dictionary, key=len, reverse=True)
        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keep only whitelisted character runs, joined by single spaces.

        Args:
            sentence (str): A piece of text.

        Returns:
            str: The whitelist matches (case-insensitive) joined with ``" "``.

        Raises:
            Exception: Whatever ``re.findall`` raises (for instance on an
                invalid whitelist pattern); a diagnostic is printed first.
        """
        try:
            tokenized = re.findall(self.whitelist, sentence, re.IGNORECASE)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str:
        """Spell out whitespace-separated integers using ``num2words``.

        Extra positional and keyword arguments are accepted and ignored so
        that callers can forward their own options unchanged.

        Args:
            sentence (str): A piece of text.

        Returns:
            str: ``sentence`` unchanged when no language is configured;
            otherwise the words re-joined with single spaces, with each
            integer token replaced by its spelled-out form.
        """
        if not self.lang:
            return sentence

        words = []
        for word in sentence.split():
            try:
                number = int(word)
            except ValueError:
                # Not an integer token: keep the word as it is.
                words.append(word)
                continue

            try:
                words.append(str(num2words(number, lang=self.lang)))
            except Exception:
                # Best effort: if conversion fails (unsupported language,
                # number too large, ...) keep the parsed integer as digits.
                # Only Exception is caught, so KeyboardInterrupt and
                # SystemExit still propagate.
                words.append(str(number))

        return " ".join(words)

    def __call__(
        self,
        batch: Dict,
        return_dict: bool = True,
        do_lastspace_removing: bool = False,
        text_key_name: Optional[str] = None,
        do_lowercase: Optional[bool] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        """Normalize the text field of ``batch``.

        Args:
            batch (Dict): A batch of input.
            return_dict (bool, optional): Return the updated batch when True,
                or only the normalized text when False. Defaults to True.
            do_lastspace_removing (bool, optional): When True, return the text
                without a trailing space; when False, a single space is
                appended to the normalized text. Defaults to False.
            text_key_name (str, optional): The key name of text in the batch
                input. Defaults to the instance's ``text_key_name``.
            do_lowercase (bool, optional): Whether to lowercase the text.
                Defaults to the instance's ``do_lowercase`` when not a bool.
            *args, **kwargs: Forwarded to :meth:`text_level_normalizer`.

        Returns:
            The batch with its text field replaced (the input dictionary is
            modified in place), or the normalized string when ``return_dict``
            is False.

        Raises:
            KeyError: If ``text_key_name`` is not a key of ``batch``.
        """

        text_key_name = text_key_name if text_key_name else self.text_key_name
        do_lowercase = do_lowercase if isinstance(do_lowercase, bool) else self.do_lowercase

        if text_key_name not in batch:
            raise KeyError(
                textwrap.dedent(
                    f"""
                    keyname {text_key_name} not existed in the batch dictionary,
                    the batch dictionary consists of the following keys {list(batch.keys())},
                    you can easily add a new keyname by passing the `text_key_name` into Normalizer.
                    """
                )
            )

        text = batch[text_key_name].strip()

        if do_lowercase:
            text = text.lower()

        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text, *args, **kwargs)

        text = text.strip()
        if not do_lastspace_removing:
            text = text + " "

        if not return_dict:
            return text

        batch[text_key_name] = text
        return batch
|
num2words/__init__.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
3 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
4 |
+
|
5 |
+
# This library is free software; you can redistribute it and/or
|
6 |
+
# modify it under the terms of the GNU Lesser General Public
|
7 |
+
# License as published by the Free Software Foundation; either
|
8 |
+
# version 2.1 of the License, or (at your option) any later version.
|
9 |
+
# This library is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12 |
+
# Lesser General Public License for more details.
|
13 |
+
# You should have received a copy of the GNU Lesser General Public
|
14 |
+
# License along with this library; if not, write to the Free Software
|
15 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
16 |
+
# MA 02110-1301 USA
|
17 |
+
|
18 |
+
from __future__ import unicode_literals
|
19 |
+
|
20 |
+
from . import (
|
21 |
+
lang_IS
|
22 |
+
)
|
23 |
+
|
24 |
+
CONVERTER_CLASSES = {
|
25 |
+
'is': lang_IS.Num2Word_IS()
|
26 |
+
}
|
27 |
+
|
28 |
+
CONVERTES_TYPES = ['cardinal', 'ordinal', 'ordinal_num', 'year', 'currency']
|
29 |
+
|
30 |
+
|
31 |
+
def num2words(number, ordinal=False, lang='en', to='cardinal', **kwargs):
    """Spell out ``number`` using the converter registered for ``lang``.

    Args:
        number: The value to convert. A string is first parsed by the
            converter's ``str_to_number``.
        ordinal (bool): Backwards-compatible switch; when true the
            converter's ``to_ordinal`` is used and ``to`` is ignored.
        lang (str): Language code. The full code is tried first, then its
            first two characters.
        to (str): Conversion type; must be listed in ``CONVERTES_TYPES``.
        **kwargs: Passed through to the selected ``to_<type>`` method.

    Returns:
        Whatever the selected converter method returns.

    Raises:
        NotImplementedError: If no converter is registered for ``lang`` or
            ``to`` is not a known conversion type.
    """
    # Exact language code first, then the two-letter prefix.
    code = lang if lang in CONVERTER_CLASSES else lang[:2]
    if code not in CONVERTER_CLASSES:
        raise NotImplementedError()
    converter = CONVERTER_CLASSES[code]

    value = converter.str_to_number(number) if isinstance(number, str) else number

    # backwards compatible
    if ordinal:
        return converter.to_ordinal(value)

    if to not in CONVERTES_TYPES:
        raise NotImplementedError()

    method = getattr(converter, 'to_' + to)
    return method(value, **kwargs)
|
num2words/base.py
ADDED
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
3 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
4 |
+
|
5 |
+
# This library is free software; you can redistribute it and/or
|
6 |
+
# modify it under the terms of the GNU Lesser General Public
|
7 |
+
# License as published by the Free Software Foundation; either
|
8 |
+
# version 2.1 of the License, or (at your option) any later version.
|
9 |
+
# This library is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12 |
+
# Lesser General Public License for more details.
|
13 |
+
# You should have received a copy of the GNU Lesser General Public
|
14 |
+
# License along with this library; if not, write to the Free Software
|
15 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
16 |
+
# MA 02110-1301 USA
|
17 |
+
|
18 |
+
from __future__ import unicode_literals
|
19 |
+
|
20 |
+
import math
|
21 |
+
from collections import OrderedDict
|
22 |
+
from decimal import Decimal
|
23 |
+
|
24 |
+
from .compat import to_s
|
25 |
+
from .currency import parse_currency_parts, prefix_currency
|
26 |
+
|
27 |
+
|
28 |
+
class Num2Word_Base(object):
    """Shared machinery for the language-specific number-to-words converters.

    A subclass supplies its vocabulary in ``setup`` (the ``high_numwords``,
    ``mid_numwords`` and ``low_numwords`` attributes), implements
    ``set_high_numwords`` and ``merge``, and may override the ``to_*``
    methods.
    """

    # Currency code -> (unit forms, subunit forms); filled in by subclasses.
    CURRENCY_FORMS = {}
    # Currency code -> adjective placed in front of the unit forms.
    CURRENCY_ADJECTIVES = {}

    def __init__(self):
        """Set defaults, run ``setup`` and build the number-word table."""
        self.is_title = False
        self.precision = 2
        self.exclude_title = []
        self.negword = "(-) "
        self.pointword = "(.)"
        self.errmsg_nonnum = "type(%s) not in [long, int, float]"
        self.errmsg_floatord = "Cannot treat float %s as ordinal."
        self.errmsg_negord = "Cannot treat negative num %s as ordinal."
        self.errmsg_toobig = "abs(%s) must be less than %s."

        self.setup()

        # Build the cards only when setup() defined at least one word list.
        if any(hasattr(self, field) for field in
               ['high_numwords', 'mid_numwords', 'low_numwords']):
            self.cards = OrderedDict()
            self.set_numwords()
            # The first key inserted is the largest named unit.
            self.MAXVAL = 1000 * list(self.cards.keys())[0]

    def set_numwords(self):
        """Fill ``self.cards`` from the high, mid and low word lists, in that order."""
        self.set_high_numwords(self.high_numwords)
        self.set_mid_numwords(self.mid_numwords)
        self.set_low_numwords(self.low_numwords)

    def set_high_numwords(self, *args):
        """Register the largest number words; must be provided by subclasses."""
        raise NotImplementedError

    def set_mid_numwords(self, mid):
        """Register ``(value, word)`` pairs from ``mid`` in ``self.cards``."""
        for key, val in mid:
            self.cards[key] = val

    def set_low_numwords(self, numwords):
        """Register ``numwords`` under descending keys ``len - 1 .. 0``."""
        for word, n in zip(numwords, range(len(numwords) - 1, -1, -1)):
            self.cards[n] = word

    def splitnum(self, value):
        """Decompose ``value`` into a nested list of ``(word, number)`` tuples.

        The first card key (in insertion order) that does not exceed
        ``value`` is used as the unit; the quotient and the remainder are
        split recursively. Returns ``None`` when no key qualifies.
        """
        for elem in self.cards:
            if elem > value:
                continue

            out = []
            if value == 0:
                div, mod = 1, 0
            else:
                div, mod = divmod(value, elem)

            if div == 1:
                out.append((self.cards[1], 1))
            else:
                if div == value:  # The system tallies, eg Roman Numerals
                    return [(div * self.cards[elem], div * elem)]
                out.append(self.splitnum(div))

            out.append((self.cards[elem], elem))

            if mod:
                out.append(self.splitnum(mod))

            return out

    def parse_minus(self, num_str):
        """Detach minus and return it as symbol with new num_str."""
        if num_str.startswith('-'):
            # Extra spacing to compensate if there is no minus.
            return '%s ' % self.negword, num_str[1:]
        return '', num_str

    def str_to_number(self, value):
        """Parse a numeric string into a ``Decimal``."""
        return Decimal(value)

    def to_cardinal(self, value):
        """Return ``value`` spelled out as a cardinal number.

        Values that are not integral (or cannot be converted with ``int``)
        are delegated to :meth:`to_cardinal_float`.

        Raises:
            OverflowError: If ``abs(value)`` is not below ``self.MAXVAL``.
        """
        # An explicit comparison is used instead of ``assert`` so that the
        # float fallback still works when Python runs with -O, which strips
        # assert statements.
        try:
            is_integral = int(value) == value
        except (ValueError, TypeError):
            is_integral = False
        if not is_integral:
            return self.to_cardinal_float(value)

        out = ""
        if value < 0:
            value = abs(value)
            out = self.negword

        if value >= self.MAXVAL:
            raise OverflowError(self.errmsg_toobig % (value, self.MAXVAL))

        val = self.splitnum(value)
        words, num = self.clean(val)
        return self.title(out + words)

    def float2tuple(self, value):
        """Split ``value`` into ``(integer part, fractional digits as int)``.

        Side effect: ``self.precision`` is set to the number of decimal
        places in ``str(value)``.
        """
        pre = int(value)

        # Simple way of finding decimal places to update the precision
        self.precision = abs(Decimal(str(value)).as_tuple().exponent)

        post = abs(value - pre) * 10**self.precision
        if abs(round(post) - post) < 0.01:
            # We generally floor all values beyond our precision (rather than
            # rounding), but in cases where we have something like 1.239999999,
            # which is probably due to python's handling of floats, we actually
            # want to consider it as 1.24 instead of 1.23
            post = int(round(post))
        else:
            post = int(math.floor(post))

        return pre, post

    def to_cardinal_float(self, value):
        """Spell out a non-integral number digit by digit after the point.

        Raises:
            TypeError: If ``value`` cannot be converted to ``float``.
        """
        try:
            # Evaluated only for the exceptions it may raise; the comparison
            # result itself is discarded.
            float(value) == value
        except (ValueError, TypeError, AssertionError, AttributeError):
            raise TypeError(self.errmsg_nonnum % value)

        pre, post = self.float2tuple(float(value))

        # Left-pad with zeros so every decimal place is spoken.
        post = str(post)
        post = '0' * (self.precision - len(post)) + post

        out = [self.to_cardinal(pre)]
        if self.precision:
            out.append(self.title(self.pointword))

        for i in range(self.precision):
            curr = int(post[i])
            out.append(to_s(self.to_cardinal(curr)))

        return " ".join(out)

    def merge(self, curr, next):
        """Combine two ``(word, number)`` tuples; must be provided by subclasses."""
        raise NotImplementedError

    def clean(self, val):
        """Collapse the nested structure from :meth:`splitnum` into one tuple.

        Adjacent tuples at the front are combined with :meth:`merge`; nested
        lists are reduced recursively until a single element remains.
        """
        out = val
        while len(val) != 1:
            out = []
            left, right = val[:2]
            if isinstance(left, tuple) and isinstance(right, tuple):
                out.append(self.merge(left, right))
                if val[2:]:
                    out.append(val[2:])
            else:
                for elem in val:
                    if isinstance(elem, list):
                        if len(elem) == 1:
                            out.append(elem[0])
                        else:
                            out.append(self.clean(elem))
                    else:
                        out.append(elem)
            val = out
        return out[0]

    def title(self, value):
        """Capitalize each word of ``value`` when ``self.is_title`` is set.

        Words listed in ``self.exclude_title`` are left untouched.
        """
        if self.is_title:
            out = []
            value = value.split()
            for word in value:
                if word in self.exclude_title:
                    out.append(word)
                else:
                    out.append(word[0].upper() + word[1:])
            value = " ".join(out)
        return value

    def verify_ordinal(self, value):
        """Raise ``TypeError`` unless ``value`` is a non-negative whole number."""
        if not value == int(value):
            raise TypeError(self.errmsg_floatord % value)
        if not abs(value) == value:
            raise TypeError(self.errmsg_negord % value)

    def to_ordinal(self, value):
        """Default ordinal form: identical to the cardinal form."""
        return self.to_cardinal(value)

    def to_ordinal_num(self, value):
        """Default numeric ordinal form: ``value`` returned unchanged."""
        return value

    # Trivial version
    def inflect(self, value, text):
        """Pick the singular or plural from a ``"stem/suffix"`` pattern.

        Returns the part before the first ``/`` when ``value == 1``,
        otherwise all parts concatenated.
        """
        text = text.split("/")
        if value == 1:
            return text[0]
        return "".join(text)

    # //CHECK: generalise? Any others like pounds/shillings/pence?
    def to_splitnum(self, val, hightxt="", lowtxt="", jointxt="",
                    divisor=100, longval=True, cents=True):
        """Spell out a two-part amount such as units and subunits.

        Args:
            val: A float, a ``(high, low)`` pair, or a number that is split
                with ``divmod(val, divisor)``.
            hightxt: ``inflect`` pattern naming the high part.
            lowtxt: ``inflect`` pattern naming the low part.
            jointxt: Word placed between the two parts.
            divisor: Divisor used when ``val`` is a plain number.
            longval: When false, the part names and the joining word are
                omitted whenever a low part is present.
            cents: Spell out the low part when true, else format it as
                two digits.

        Returns:
            str: The pieces joined with single spaces.
        """
        out = []

        if isinstance(val, float):
            high, low = self.float2tuple(val)
        else:
            try:
                high, low = val
            except TypeError:
                high, low = divmod(val, divisor)

        if high:
            hightxt = self.title(self.inflect(high, hightxt))
            out.append(self.to_cardinal(high))
            if low:
                if longval:
                    if hightxt:
                        out.append(hightxt)
                    if jointxt:
                        out.append(self.title(jointxt))
            elif hightxt:
                out.append(hightxt)

        if low:
            if cents:
                out.append(self.to_cardinal(low))
            else:
                out.append("%02d" % low)
            if lowtxt and longval:
                out.append(self.title(self.inflect(low, lowtxt)))

        return " ".join(out)

    def to_year(self, value, **kwargs):
        """Default year form: identical to the cardinal form."""
        return self.to_cardinal(value)

    def pluralize(self, n, forms):
        """
        Should resolve gettext form:
        http://docs.translatehouse.org/projects/localization-guide/en/latest/l10n/pluralforms.html
        """
        raise NotImplementedError

    def _cents_verbose(self, number, currency):
        """Return the subunit amount spelled out in words."""
        return self.to_cardinal(number)

    def _cents_terse(self, number, currency):
        """Return the subunit amount as a zero-padded two-digit string."""
        return "%02d" % number

    def to_currency(self, val, currency='EUR', cents=True, separator=',',
                    adjective=False):
        """
        Args:
            val: Numeric value
            currency (str): Currency code
            cents (bool): Verbose cents
            separator (str): Cent separator
            adjective (bool): Prefix currency name with adjective
        Returns:
            str: Formatted string

        Raises:
            NotImplementedError: If ``currency`` has no entry in
                ``CURRENCY_FORMS``.
        """
        left, right, is_negative = parse_currency_parts(val)

        try:
            cr1, cr2 = self.CURRENCY_FORMS[currency]

        except KeyError:
            raise NotImplementedError(
                'Currency code "%s" not implemented for "%s"' %
                (currency, self.__class__.__name__))

        if adjective and currency in self.CURRENCY_ADJECTIVES:
            cr1 = prefix_currency(self.CURRENCY_ADJECTIVES[currency], cr1)

        minus_str = "%s " % self.negword if is_negative else ""
        cents_str = self._cents_verbose(right, currency) \
            if cents else self._cents_terse(right, currency)

        return u'%s%s %s%s %s %s' % (
            minus_str,
            self.to_cardinal(left),
            self.pluralize(left, cr1),
            separator,
            cents_str,
            self.pluralize(right, cr2)
        )

    def setup(self):
        """Hook for subclasses to define vocabulary and override defaults."""
        pass
|
num2words/compat.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
3 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
4 |
+
|
5 |
+
# This library is free software; you can redistribute it and/or
|
6 |
+
# modify it under the terms of the GNU Lesser General Public
|
7 |
+
# License as published by the Free Software Foundation; either
|
8 |
+
# version 2.1 of the License, or (at your option) any later version.
|
9 |
+
# This library is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12 |
+
# Lesser General Public License for more details.
|
13 |
+
# You should have received a copy of the GNU Lesser General Public
|
14 |
+
# License along with this library; if not, write to the Free Software
|
15 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
16 |
+
# MA 02110-1301 USA
|
17 |
+
|
18 |
+
|
19 |
+
# ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
try:
    strtype = basestring
except NameError:
    strtype = str


def to_s(val):
    """Return ``val`` converted to text.

    Uses ``unicode`` when the interpreter defines it and ``str`` otherwise.
    """
    try:
        text_type = unicode
    except NameError:
        text_type = str
    return text_type(val)
|
num2words/currency.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
3 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
4 |
+
|
5 |
+
# This library is free software; you can redistribute it and/or
|
6 |
+
# modify it under the terms of the GNU Lesser General Public
|
7 |
+
# License as published by the Free Software Foundation; either
|
8 |
+
# version 2.1 of the License, or (at your option) any later version.
|
9 |
+
# This library is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12 |
+
# Lesser General Public License for more details.
|
13 |
+
# You should have received a copy of the GNU Lesser General Public
|
14 |
+
# License along with this library; if not, write to the Free Software
|
15 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
16 |
+
# MA 02110-1301 USA
|
17 |
+
|
18 |
+
from __future__ import division
|
19 |
+
|
20 |
+
from decimal import ROUND_HALF_UP, Decimal
|
21 |
+
|
22 |
+
|
23 |
+
def parse_currency_parts(value, is_int_with_cents=True):
    """Split a monetary amount into whole units, cents and a sign flag.

    Args:
        value: The amount. An ``int`` is handled according to
            ``is_int_with_cents``; any other value is converted with
            ``Decimal`` and rounded half-up to two decimal places.
        is_int_with_cents: When true, an ``int`` value is taken to be a
            number of cents; when false it is a number of whole units.

    Returns:
        A tuple ``(integer, cents, negative)`` where ``integer`` and
        ``cents`` are non-negative ints and ``negative`` tells whether the
        (rounded) amount was below zero.
    """
    if isinstance(value, int):
        negative = value < 0
        value = abs(value)
        if is_int_with_cents:
            # assume cents if value is integer
            integer, cents = divmod(value, 100)
        else:
            integer, cents = value, 0
    else:
        if isinstance(value, float):
            # Decimal(float) keeps the exact binary expansion (2.675 becomes
            # 2.67499999...), which defeats ROUND_HALF_UP. Go through the
            # float's shortest decimal text so ties round as written.
            value = str(value)
        value = Decimal(value).quantize(
            Decimal('.01'),
            rounding=ROUND_HALF_UP
        )
        negative = value < 0
        value = abs(value)
        integer, fraction = divmod(value, 1)
        integer = int(integer)
        cents = int(fraction * 100)

    return integer, cents, negative
|
47 |
+
|
48 |
+
|
49 |
+
def prefix_currency(prefix, base):
    """Return a tuple with *prefix* and a space put before each item of *base*."""
    prefixed = []
    for form in base:
        prefixed.append("%s %s" % (prefix, form))
    return tuple(prefixed)
|
num2words/lang_EU.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
3 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
4 |
+
|
5 |
+
# This library is free software; you can redistribute it and/or
|
6 |
+
# modify it under the terms of the GNU Lesser General Public
|
7 |
+
# License as published by the Free Software Foundation; either
|
8 |
+
# version 2.1 of the License, or (at your option) any later version.
|
9 |
+
# This library is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12 |
+
# Lesser General Public License for more details.
|
13 |
+
# You should have received a copy of the GNU Lesser General Public
|
14 |
+
# License along with this library; if not, write to the Free Software
|
15 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
16 |
+
# MA 02110-1301 USA
|
17 |
+
|
18 |
+
from __future__ import unicode_literals
|
19 |
+
|
20 |
+
from .base import Num2Word_Base
|
21 |
+
|
22 |
+
# (singular, plural) unit names reused by the dollar-based currencies below.
GENERIC_DOLLARS = ('dollar', 'dollars')
# (singular, plural) subunit names reused by the cent-based currencies below.
GENERIC_CENTS = ('cent', 'cents')
|
24 |
+
|
25 |
+
|
26 |
+
class Num2Word_EU(Num2Word_Base):
    """Converter base with currency tables and "-illion"/"-illiard" naming.

    Subclasses may override ``GIGA_SUFFIX`` / ``MEGA_SUFFIX`` to localize
    the names registered for large powers of ten.
    """

    # Currency code -> (unit forms, subunit forms).
    CURRENCY_FORMS = {
        'AUD': (GENERIC_DOLLARS, GENERIC_CENTS),
        'CAD': (GENERIC_DOLLARS, GENERIC_CENTS),
        # replaced by EUR
        'EEK': (('kroon', 'kroons'), ('sent', 'senti')),
        'EUR': (('euro', 'euro'), GENERIC_CENTS),
        'GBP': (('pound sterling', 'pounds sterling'), ('penny', 'pence')),
        # replaced by EUR
        'LTL': (('litas', 'litas'), GENERIC_CENTS),
        # replaced by EUR
        'LVL': (('lat', 'lats'), ('santim', 'santims')),
        'USD': (GENERIC_DOLLARS, GENERIC_CENTS),
        'RUB': (('rouble', 'roubles'), ('kopek', 'kopeks')),
        'SEK': (('krona', 'kronor'), ('öre', 'öre')),
        'NOK': (('krone', 'kroner'), ('øre', 'øre')),
        'PLN': (('zloty', 'zlotys', 'zlotu'), ('grosz', 'groszy')),
        'MXN': (('peso', 'pesos'), GENERIC_CENTS),
        'RON': (('leu', 'lei', 'de lei'), ('ban', 'bani', 'de bani')),
        'INR': (('rupee', 'rupees'), ('paisa', 'paise')),
        'HUF': (('forint', 'forint'), ('fillér', 'fillér')),
        'ISK': (('króna', 'krónur'), ('aur', 'aurar')),
    }

    # Currency code -> adjective naming the currency's origin.
    CURRENCY_ADJECTIVES = {
        'AUD': 'Australian',
        'CAD': 'Canadian',
        'EEK': 'Estonian',
        'USD': 'US',
        'RUB': 'Russian',
        'NOK': 'Norwegian',
        'MXN': 'Mexican',
        'RON': 'Romanian',
        'INR': 'Indian',
        'HUF': 'Hungarian',
        'ISK': 'íslenskar',
    }

    GIGA_SUFFIX = "illiard"
    MEGA_SUFFIX = "illion"

    def set_high_numwords(self, high):
        """Register names for large powers of ten in ``self.cards``.

        ``high`` lists name stems from the largest magnitude downwards.
        Each stem gets a GIGA_SUFFIX entry at exponent ``n`` and a
        MEGA_SUFFIX entry at ``n - 3``; ``n`` starts at
        ``3 + 6 * len(high)`` and drops by 6 per stem. An empty suffix
        disables the corresponding entry.
        """
        top_exponent = 3 + 6 * len(high)

        for position, stem in enumerate(high):
            exponent = top_exponent - 6 * position

            if self.GIGA_SUFFIX:
                self.cards[10 ** exponent] = stem + self.GIGA_SUFFIX

            if self.MEGA_SUFFIX:
                self.cards[10 ** (exponent - 3)] = stem + self.MEGA_SUFFIX

    def gen_high_numwords(self, units, tens, lows):
        """Build the stem list: every unit+ten combination, reversed,
        followed by ``lows``."""
        combined = []
        for ten in tens:
            for unit in units:
                combined.append(unit + ten)
        return combined[::-1] + lows

    def pluralize(self, n, forms):
        """Return ``forms[0]`` when ``n`` equals 1, otherwise ``forms[1]``."""
        if n == 1:
            return forms[0]
        return forms[1]

    def setup(self):
        """Populate ``self.high_numwords`` with the Latin-derived stems."""
        lows = ["non", "oct", "sept", "sext", "quint", "quadr", "tr", "b", "m"]
        units = ["", "un", "duo", "tre", "quattuor", "quin", "sex", "sept",
                 "octo", "novem"]
        tens = ["dec", "vigint", "trigint", "quadragint", "quinquagint",
                "sexagint", "septuagint", "octogint", "nonagint"]
        generated = self.gen_high_numwords(units, tens, lows)
        self.high_numwords = ["cent"] + generated
|
num2words/lang_IS.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
4 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
5 |
+
|
6 |
+
# This library is free software; you can redistribute it and/or
|
7 |
+
# modify it under the terms of the GNU Lesser General Public
|
8 |
+
# License as published by the Free Software Foundation; either
|
9 |
+
# version 2.1 of the License, or (at your option) any later version.
|
10 |
+
# This library is distributed in the hope that it will be useful,
|
11 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13 |
+
# Lesser General Public License for more details.
|
14 |
+
# You should have received a copy of the GNU Lesser General Public
|
15 |
+
# License along with this library; if not, write to the Free Software
|
16 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
17 |
+
# MA 02110-1301 USA
|
18 |
+
|
19 |
+
from __future__ import division, print_function, unicode_literals
|
20 |
+
|
21 |
+
from . import lang_EU
|
22 |
+
|
23 |
+
# Genders
# These values are used as indices into the tuples stored in GENDERS.
KK = 0  # Karlkyn (male)
KVK = 1  # Kvenkyn (female)
HK = 2  # Hvorugkyn (neuter)

# Inflected forms of the numerals one to four, keyed by the masculine form
# and ordered (KK, KVK, HK).
GENDERS = {
    "einn": ("einn", "ein", "eitt"),
    "tveir": ("tveir", "tvær", "tvö"),
    "þrír": ("þrír", "þrjár", "þrjú"),
    "fjórir": ("fjórir", "fjórar", "fjögur"),
}

# Number nouns with an irregular plural, stored as (singular, plural).
PLURALS = {
    "hundrað": ("hundrað", "hundruð"),
}
|
38 |
+
|
39 |
+
|
40 |
+
class Num2Word_IS(lang_EU.Num2Word_EU):
    """Icelandic cardinal number converter.

    Only cardinal conversion is provided; the ordinal, year and currency
    entry points raise ``NotImplementedError``.
    """

    GIGA_SUFFIX = "illjarður"
    MEGA_SUFFIX = "illjón"

    def setup(self):
        """Install the Icelandic word tables on the instance."""
        lows = ["okt", "sept", "sext", "kvint", "kvaðr", "tr", "b", "m"]
        self.high_numwords = self.gen_high_numwords([], [], lows)

        self.negword = "mínus "
        self.pointword = "komma"

        # All words should be excluded, title case is not used in Icelandic
        self.exclude_title = ["og", "komma", "mínus"]

        self.mid_numwords = [(1000, "þúsund"), (100, "hundrað"),
                             (90, "níutíu"), (80, "áttatíu"), (70, "sjötíu"),
                             (60, "sextíu"), (50, "fimmtíu"), (40, "fjörutíu"),
                             (30, "þrjátíu")]
        self.low_numwords = ["tuttugu", "nítján", "átján", "sautján",
                             "sextán", "fimmtán", "fjórtán", "þrettán",
                             "tólf", "ellefu", "tíu", "níu", "átta",
                             "sjö", "sex", "fimm", "fjórir", "þrír",
                             "tveir", "einn", "núll"]
        self.ords = {"einn": "fyrsti",
                     "tveir": "annar",
                     "þrír": "þriðji",
                     "fjórir": "fjórði",
                     "fimm": "fimmti",
                     "sex": "sjötti",
                     "sjö": "sjöundi",
                     "átta": "áttundi",
                     "níu": "níundi",
                     "tíu": "tíundi",
                     "ellefu": "ellefti",
                     "tólf": "tólfti"}

    def pluralize(self, n, noun):
        """Return *noun* in the number form that goes with the count *n*.

        The singular is kept when ``n`` ends in 1 but not in 11. Otherwise
        the GIGA/MEGA suffix is swapped for its plural, or the plural is
        looked up in ``PLURALS``; nouns matching neither are returned
        unchanged.
        """
        form = 0 if (n % 10 == 1 and n % 100 != 11) else 1
        if form == 0:
            return noun
        elif self.GIGA_SUFFIX in noun:
            return noun.replace(self.GIGA_SUFFIX, "illjarðar")
        elif self.MEGA_SUFFIX in noun:
            return noun.replace(self.MEGA_SUFFIX, "illjónir")
        elif noun not in PLURALS:
            return noun
        return PLURALS[noun][form]

    def genderize(self, adj, noun):
        """Inflect the final numeral of *adj* to agree with *noun*.

        Only the last whitespace-separated word of ``adj`` is considered,
        and only when it is a key of ``GENDERS``; otherwise ``adj`` is
        returned unchanged. The gender is picked from substrings of
        ``noun``: neuter for hundreds/thousands, feminine for "-illjón",
        masculine otherwise.
        """
        last = adj.split()[-1]
        if last not in GENDERS:
            return adj
        gender = KK
        if "hund" in noun or "þús" in noun:
            gender = HK
        elif "illjarð" in noun:
            gender = KK
        elif "illjón" in noun:
            gender = KVK
        # Swap only the trailing word: str.replace would also rewrite any
        # earlier occurrence of the same numeral in the phrase.
        head, _, tail = adj.rpartition(last)
        return head + GENDERS[last][gender] + tail

    def merge(self, lpair, rpair):
        """Combine two ``(text, number)`` pairs into one.

        * left is 1 and right is below 100: the right pair is returned.
        * left smaller than right: multiplicative, e.g. a count followed by
          its (pluralized) magnitude noun, with the count genderized.
        * left larger than right and right is a key of ``self.cards``:
          additive, joined with "og".
        * otherwise: additive, joined with a space.
        """
        ltext, lnum = lpair
        rtext, rnum = rpair

        if lnum == 1 and rnum < 100:
            return (rtext, rnum)
        elif lnum < rnum:
            rtext = self.pluralize(lnum, rtext)
            ltext = self.genderize(ltext, rtext)
            return ("%s %s" % (ltext, rtext), lnum * rnum)
        elif lnum > rnum and rnum in self.cards:
            rtext = self.pluralize(lnum, rtext)
            ltext = self.genderize(ltext, rtext)
            return ("%s og %s" % (ltext, rtext), lnum + rnum)
        return ("%s %s" % (ltext, rtext), lnum + rnum)

    def to_ordinal(self, value):
        """Not supported for Icelandic; always raises NotImplementedError."""
        raise NotImplementedError

    def to_ordinal_num(self, value):
        """Not supported for Icelandic; always raises NotImplementedError."""
        raise NotImplementedError

    def to_year(self, val, suffix=None, longval=True):
        """Not supported for Icelandic; always raises NotImplementedError."""
        raise NotImplementedError

    def to_currency(self, val, longval=True):
        """Not supported for Icelandic; always raises NotImplementedError."""
        raise NotImplementedError
|
num2words/utils.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
|
3 |
+
# Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
|
4 |
+
|
5 |
+
# This library is free software; you can redistribute it and/or
|
6 |
+
# modify it under the terms of the GNU Lesser General Public
|
7 |
+
# License as published by the Free Software Foundation; either
|
8 |
+
# version 2.1 of the License, or (at your option) any later version.
|
9 |
+
# This library is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12 |
+
# Lesser General Public License for more details.
|
13 |
+
# You should have received a copy of the GNU Lesser General Public
|
14 |
+
# License along with this library; if not, write to the Free Software
|
15 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
16 |
+
# MA 02110-1301 USA
|
17 |
+
|
18 |
+
|
19 |
+
def splitbyx(n, x, format_int=True):
    """Yield *n* in groups of *x* characters, aligned to the right end.

    When ``len(n)`` is not a multiple of ``x`` the first group is the
    shorter leading remainder. A string no longer than ``x`` is yielded
    whole. Each group is converted with ``int`` unless ``format_int`` is
    false, in which case the raw slice is yielded.
    """
    def emit(chunk):
        return int(chunk) if format_int else chunk

    total = len(n)
    if total <= x:
        yield emit(n)
        return

    head = total % x
    if head > 0:
        yield emit(n[:head])
    for offset in range(head, total, x):
        yield emit(n[offset:offset + x])
|
31 |
+
|
32 |
+
|
33 |
+
def get_digits(n):
    """Return the last three digits of *n* as ints, least significant first.

    ``n`` is formatted with ``'%03d'`` so shorter numbers are zero-padded.
    """
    padded = '%03d' % n
    last_three = padded[-3:]
    return [int(ch) for ch in last_three[::-1]]
|