infinitejoy
committed on
Commit
•
6916278
1
Parent(s):
58cfa8a
Upload bulgarian_training_script.ipynb
Browse files- bulgarian_training_script.ipynb +97 -15
bulgarian_training_script.ipynb
CHANGED
@@ -212,7 +212,8 @@
|
|
212 |
"base_uri": "https://localhost:8080/"
|
213 |
},
|
214 |
"id": "Mz4bubhxxsad",
|
215 |
-
"outputId": "23398525-cc19-43c2-9fec-497e06214f29"
|
|
|
216 |
},
|
217 |
"outputs": [
|
218 |
{
|
@@ -1898,58 +1899,139 @@
|
|
1898 |
},
|
1899 |
{
|
1900 |
"cell_type": "code",
|
1901 |
-
"execution_count":
|
1902 |
"metadata": {},
|
1903 |
"outputs": [
|
1904 |
{
|
1905 |
"name": "stderr",
|
1906 |
"output_type": "stream",
|
1907 |
"text": [
|
1908 |
-
"Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/
|
1909 |
]
|
1910 |
},
|
1911 |
{
|
1912 |
"data": {
|
1913 |
"application/vnd.jupyter.widget-view+json": {
|
1914 |
-
"model_id": "
|
1915 |
"version_major": 2,
|
1916 |
"version_minor": 0
|
1917 |
},
|
1918 |
"text/plain": [
|
1919 |
-
"
|
1920 |
]
|
1921 |
},
|
1922 |
"metadata": {},
|
1923 |
"output_type": "display_data"
|
1924 |
},
|
1925 |
{
|
1926 |
-
"
|
1927 |
-
|
1928 |
-
|
1929 |
-
|
1930 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1931 |
},
|
1932 |
{
|
1933 |
"data": {
|
|
|
|
|
|
|
|
|
|
|
1934 |
"text/plain": [
|
1935 |
-
"
|
1936 |
]
|
1937 |
},
|
1938 |
-
"execution_count": 25,
|
1939 |
"metadata": {},
|
1940 |
-
"output_type": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1941 |
}
|
1942 |
],
|
1943 |
"source": [
|
1944 |
"from datasets import Audio, Dataset, load_dataset, load_metric\n",
|
1945 |
"from transformers import AutoFeatureExtractor, pipeline\n",
|
1946 |
"\n",
|
1947 |
-
"dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"
|
1948 |
"\n",
|
1949 |
"# for testing: only process the first ten examples as a test\n",
|
1950 |
"dataset = dataset.select(range(10))\n",
|
1951 |
"\n",
|
1952 |
-
"repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-
|
1953 |
"\n",
|
1954 |
"# load processor\n",
|
1955 |
"feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n",
|
|
|
212 |
"base_uri": "https://localhost:8080/"
|
213 |
},
|
214 |
"id": "Mz4bubhxxsad",
|
215 |
+
"outputId": "23398525-cc19-43c2-9fec-497e06214f29",
|
216 |
+
"scrolled": true
|
217 |
},
|
218 |
"outputs": [
|
219 |
{
|
|
|
1899 |
},
|
1900 |
{
|
1901 |
"cell_type": "code",
|
1902 |
+
"execution_count": null,
|
1903 |
"metadata": {},
|
1904 |
"outputs": [
|
1905 |
{
|
1906 |
"name": "stderr",
|
1907 |
"output_type": "stream",
|
1908 |
"text": [
|
1909 |
+
"Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bg/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
|
1910 |
]
|
1911 |
},
|
1912 |
{
|
1913 |
"data": {
|
1914 |
"application/vnd.jupyter.widget-view+json": {
|
1915 |
+
"model_id": "16eaba3c65494a12921df272388262b6",
|
1916 |
"version_major": 2,
|
1917 |
"version_minor": 0
|
1918 |
},
|
1919 |
"text/plain": [
|
1920 |
+
"Downloading: 0%| | 0.00/1.99k [00:00<?, ?B/s]"
|
1921 |
]
|
1922 |
},
|
1923 |
"metadata": {},
|
1924 |
"output_type": "display_data"
|
1925 |
},
|
1926 |
{
|
1927 |
+
"data": {
|
1928 |
+
"application/vnd.jupyter.widget-view+json": {
|
1929 |
+
"model_id": "a7d77fd025284c198de8148a66ac6e23",
|
1930 |
+
"version_major": 2,
|
1931 |
+
"version_minor": 0
|
1932 |
+
},
|
1933 |
+
"text/plain": [
|
1934 |
+
"Downloading: 0%| | 0.00/212 [00:00<?, ?B/s]"
|
1935 |
+
]
|
1936 |
+
},
|
1937 |
+
"metadata": {},
|
1938 |
+
"output_type": "display_data"
|
1939 |
+
},
|
1940 |
+
{
|
1941 |
+
"data": {
|
1942 |
+
"application/vnd.jupyter.widget-view+json": {
|
1943 |
+
"model_id": "b42cb83436dc43e5aec83d4bc1b20317",
|
1944 |
+
"version_major": 2,
|
1945 |
+
"version_minor": 0
|
1946 |
+
},
|
1947 |
+
"text/plain": [
|
1948 |
+
"Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
|
1949 |
+
]
|
1950 |
+
},
|
1951 |
+
"metadata": {},
|
1952 |
+
"output_type": "display_data"
|
1953 |
},
|
1954 |
{
|
1955 |
"data": {
|
1956 |
+
"application/vnd.jupyter.widget-view+json": {
|
1957 |
+
"model_id": "0dc58dd4cf5542a4b5627d0748dd1574",
|
1958 |
+
"version_major": 2,
|
1959 |
+
"version_minor": 0
|
1960 |
+
},
|
1961 |
"text/plain": [
|
1962 |
+
"Downloading: 0%| | 0.00/260 [00:00<?, ?B/s]"
|
1963 |
]
|
1964 |
},
|
|
|
1965 |
"metadata": {},
|
1966 |
+
"output_type": "display_data"
|
1967 |
+
},
|
1968 |
+
{
|
1969 |
+
"data": {
|
1970 |
+
"application/vnd.jupyter.widget-view+json": {
|
1971 |
+
"model_id": "c39a37f06df548caa547615d9edc8450",
|
1972 |
+
"version_major": 2,
|
1973 |
+
"version_minor": 0
|
1974 |
+
},
|
1975 |
+
"text/plain": [
|
1976 |
+
"Downloading: 0%| | 0.00/384 [00:00<?, ?B/s]"
|
1977 |
+
]
|
1978 |
+
},
|
1979 |
+
"metadata": {},
|
1980 |
+
"output_type": "display_data"
|
1981 |
+
},
|
1982 |
+
{
|
1983 |
+
"data": {
|
1984 |
+
"application/vnd.jupyter.widget-view+json": {
|
1985 |
+
"model_id": "90f133429122486ebac0db594fc133a2",
|
1986 |
+
"version_major": 2,
|
1987 |
+
"version_minor": 0
|
1988 |
+
},
|
1989 |
+
"text/plain": [
|
1990 |
+
"Downloading: 0%| | 0.00/23.0 [00:00<?, ?B/s]"
|
1991 |
+
]
|
1992 |
+
},
|
1993 |
+
"metadata": {},
|
1994 |
+
"output_type": "display_data"
|
1995 |
+
},
|
1996 |
+
{
|
1997 |
+
"data": {
|
1998 |
+
"application/vnd.jupyter.widget-view+json": {
|
1999 |
+
"model_id": "2ac0d8c0256d4470a28d8d7a2c451990",
|
2000 |
+
"version_major": 2,
|
2001 |
+
"version_minor": 0
|
2002 |
+
},
|
2003 |
+
"text/plain": [
|
2004 |
+
"Downloading: 0%| | 0.00/309 [00:00<?, ?B/s]"
|
2005 |
+
]
|
2006 |
+
},
|
2007 |
+
"metadata": {},
|
2008 |
+
"output_type": "display_data"
|
2009 |
+
},
|
2010 |
+
{
|
2011 |
+
"data": {
|
2012 |
+
"application/vnd.jupyter.widget-view+json": {
|
2013 |
+
"model_id": "7a6b55c283d44d1ca6100799737eccbc",
|
2014 |
+
"version_major": 2,
|
2015 |
+
"version_minor": 0
|
2016 |
+
},
|
2017 |
+
"text/plain": [
|
2018 |
+
" 0%| | 0/10 [00:00<?, ?ex/s]"
|
2019 |
+
]
|
2020 |
+
},
|
2021 |
+
"metadata": {},
|
2022 |
+
"output_type": "display_data"
|
2023 |
}
|
2024 |
],
|
2025 |
"source": [
|
2026 |
"from datasets import Audio, Dataset, load_dataset, load_metric\n",
|
2027 |
"from transformers import AutoFeatureExtractor, pipeline\n",
|
2028 |
"\n",
|
2029 |
+
"dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"bg\", use_auth_token=True, split=\"train+validation\")\n",
|
2030 |
"\n",
|
2031 |
"# for testing: only process the first ten examples as a test\n",
|
2032 |
"dataset = dataset.select(range(10))\n",
|
2033 |
"\n",
|
2034 |
+
"repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-bulgarian'\n",
|
2035 |
"\n",
|
2036 |
"# load processor\n",
|
2037 |
"feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n",
|