add CV 11
Browse files- README.template.md +2 -2
- generate_datasets.py +6 -0
- languages.ftl +8 -0
- test.py +1 -1
README.template.md
CHANGED
@@ -131,7 +131,7 @@ Additional fields include `accent`, `age`, `client_id`, `up_votes`, `down_votes`
|
|
131 |
|
132 |
The speech material has been subdivided into portions for dev, train, test, validated, invalidated, reported and other.
|
133 |
|
134 |
-
The validated data is data that has been validated with reviewers and
|
135 |
|
136 |
The invalidated data is data that has been invalidated by reviewers
|
137 |
and received downvotes indicating that the data is of low quality.
|
@@ -153,7 +153,7 @@ In addition, the majority of training sentences end in punctuation ( . or ? or !
|
|
153 |
```python
|
154 |
from datasets import load_dataset
|
155 |
|
156 |
-
ds = load_dataset("mozilla-foundation/{{NAME}}", "en", use_auth_token=True)
|
157 |
|
158 |
def prepare_dataset(batch):
|
159 |
"""Function to preprocess the dataset with the .map method"""
|
|
|
131 |
|
132 |
The speech material has been subdivided into portions for dev, train, test, validated, invalidated, reported and other.
|
133 |
|
134 |
+
The validated data is data that has been validated with reviewers and received upvotes that the data is of high quality.
|
135 |
|
136 |
The invalidated data is data that has been invalidated by reviewers
|
137 |
and received downvotes indicating that the data is of low quality.
|
|
|
153 |
```python
|
154 |
from datasets import load_dataset
|
155 |
|
156 |
+
ds = load_dataset("mozilla-foundation/{{DATASET_PATH}}", "en", use_auth_token=True)
|
157 |
|
158 |
def prepare_dataset(batch):
|
159 |
"""Function to preprocess the dataset with the .map method"""
|
generate_datasets.py
CHANGED
@@ -54,6 +54,11 @@ VERSIONS = [
|
|
54 |
"name": "common_voice_10_0",
|
55 |
"release": "cv-corpus-10.0-2022-07-04",
|
56 |
},
|
|
|
|
|
|
|
|
|
|
|
57 |
]
|
58 |
|
59 |
|
@@ -102,6 +107,7 @@ def main():
|
|
102 |
with open(f"README.template.md", "r") as fin:
|
103 |
readme = fin.read()
|
104 |
readme = readme.replace("{{NAME}}", release_stats["name"])
|
|
|
105 |
|
106 |
locales = sorted(release_stats["locales"].keys())
|
107 |
languages = [f"- {loc}" for loc in locales]
|
|
|
54 |
"name": "common_voice_10_0",
|
55 |
"release": "cv-corpus-10.0-2022-07-04",
|
56 |
},
|
57 |
+
{
|
58 |
+
"semver": "11.0.0",
|
59 |
+
"name": "common_voice_11_0",
|
60 |
+
"release": "cv-corpus-11.0-2022-09-21",
|
61 |
+
},
|
62 |
]
|
63 |
|
64 |
|
|
|
107 |
with open(f"README.template.md", "r") as fin:
|
108 |
readme = fin.read()
|
109 |
readme = readme.replace("{{NAME}}", release_stats["name"])
|
110 |
+
readme = readme.replace("{{DATASET_PATH}}", version["name"])
|
111 |
|
112 |
locales = sorted(release_stats["locales"].keys())
|
113 |
languages = [f"- {loc}" for loc in locales]
|
languages.ftl
CHANGED
@@ -49,6 +49,7 @@ gom = Goan Konkani
|
|
49 |
ha = Hausa
|
50 |
he = Hebrew
|
51 |
hi = Hindi
|
|
|
52 |
hr = Croatian
|
53 |
hsb = Sorbian, Upper
|
54 |
ht = Haitian
|
@@ -63,6 +64,7 @@ is = Icelandic
|
|
63 |
it = Italian
|
64 |
izh = Izhorian
|
65 |
ja = Japanese
|
|
|
66 |
ka = Georgian
|
67 |
kaa = Karakalpak
|
68 |
kab = Kabyle
|
@@ -71,6 +73,7 @@ ki = Kikuyu
|
|
71 |
kk = Kazakh
|
72 |
km = Khmer
|
73 |
kmr = Kurmanji Kurdish
|
|
|
74 |
knn = Konkani (Devanagari)
|
75 |
ko = Korean
|
76 |
kpv = Komi-Zyrian
|
@@ -79,6 +82,8 @@ ky = Kyrgyz
|
|
79 |
lb = Luxembourgish
|
80 |
lg = Luganda
|
81 |
lij = Ligurian
|
|
|
|
|
82 |
lt = Lithuanian
|
83 |
lv = Latvian
|
84 |
mai = Maithili
|
@@ -125,11 +130,13 @@ sah = Sakha
|
|
125 |
sat = Santali (Ol Chiki)
|
126 |
sc = Sardinian
|
127 |
scn = Sicilian
|
|
|
128 |
shi = Shilha
|
129 |
si = Sinhala
|
130 |
sk = Slovak
|
131 |
skr = Saraiki
|
132 |
sl = Slovenian
|
|
|
133 |
so = Somali
|
134 |
sq = Albanian
|
135 |
sr = Serbian
|
@@ -167,6 +174,7 @@ xh = Xhosa
|
|
167 |
yi = Yiddish
|
168 |
yo = Yoruba
|
169 |
yue = Cantonese
|
|
|
170 |
zh-CN = Chinese (China)
|
171 |
zh-HK = Chinese (Hong Kong)
|
172 |
zh-TW = Chinese (Taiwan)
|
|
|
49 |
ha = Hausa
|
50 |
he = Hebrew
|
51 |
hi = Hindi
|
52 |
+
hil = Hiligaynon
|
53 |
hr = Croatian
|
54 |
hsb = Sorbian, Upper
|
55 |
ht = Haitian
|
|
|
64 |
it = Italian
|
65 |
izh = Izhorian
|
66 |
ja = Japanese
|
67 |
+
jbo = Lojban
|
68 |
ka = Georgian
|
69 |
kaa = Karakalpak
|
70 |
kab = Kabyle
|
|
|
73 |
kk = Kazakh
|
74 |
km = Khmer
|
75 |
kmr = Kurmanji Kurdish
|
76 |
+
kn = Kannada
|
77 |
knn = Konkani (Devanagari)
|
78 |
ko = Korean
|
79 |
kpv = Komi-Zyrian
|
|
|
82 |
lb = Luxembourgish
|
83 |
lg = Luganda
|
84 |
lij = Ligurian
|
85 |
+
ln = Lingala
|
86 |
+
lo = Lao
|
87 |
lt = Lithuanian
|
88 |
lv = Latvian
|
89 |
mai = Maithili
|
|
|
130 |
sat = Santali (Ol Chiki)
|
131 |
sc = Sardinian
|
132 |
scn = Sicilian
|
133 |
+
sdh = Southern Kurdish
|
134 |
shi = Shilha
|
135 |
si = Sinhala
|
136 |
sk = Slovak
|
137 |
skr = Saraiki
|
138 |
sl = Slovenian
|
139 |
+
snk = Soninke
|
140 |
so = Somali
|
141 |
sq = Albanian
|
142 |
sr = Serbian
|
|
|
174 |
yi = Yiddish
|
175 |
yo = Yoruba
|
176 |
yue = Cantonese
|
177 |
+
zgh = Tamazight
|
178 |
zh-CN = Chinese (China)
|
179 |
zh-HK = Chinese (Hong Kong)
|
180 |
zh-TW = Chinese (Taiwan)
|
test.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from datasets import load_dataset
|
2 |
|
3 |
-
dataset = load_dataset("./common_voice_10_0", "et", split="test", use_auth_token=True)
|
4 |
print(dataset)
|
5 |
print(dataset[100])
|
|
|
1 |
from datasets import load_dataset
|
2 |
|
3 |
+
dataset = load_dataset("./common_voice_11_0", "et", split="test", use_auth_token=True)
|
4 |
print(dataset)
|
5 |
print(dataset[100])
|