infinitejoy committed on
Commit
6916278
1 Parent(s): 58cfa8a

Upload bulgarian_training_script.ipynb

Browse files
Files changed (1) hide show
  1. bulgarian_training_script.ipynb +97 -15
bulgarian_training_script.ipynb CHANGED
@@ -212,7 +212,8 @@
212
  "base_uri": "https://localhost:8080/"
213
  },
214
  "id": "Mz4bubhxxsad",
215
- "outputId": "23398525-cc19-43c2-9fec-497e06214f29"
 
216
  },
217
  "outputs": [
218
  {
@@ -1898,58 +1899,139 @@
1898
  },
1899
  {
1900
  "cell_type": "code",
1901
- "execution_count": 25,
1902
  "metadata": {},
1903
  "outputs": [
1904
  {
1905
  "name": "stderr",
1906
  "output_type": "stream",
1907
  "text": [
1908
- "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ba/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
1909
  ]
1910
  },
1911
  {
1912
  "data": {
1913
  "application/vnd.jupyter.widget-view+json": {
1914
- "model_id": "400e81b47cf84505b84756bd0cdb853f",
1915
  "version_major": 2,
1916
  "version_minor": 0
1917
  },
1918
  "text/plain": [
1919
- " 0%| | 0/10 [00:00<?, ?ex/s]"
1920
  ]
1921
  },
1922
  "metadata": {},
1923
  "output_type": "display_data"
1924
  },
1925
  {
1926
- "name": "stdout",
1927
- "output_type": "stream",
1928
- "text": [
1929
- "['шағир дуҫымдың һүҙҙәрен ишеткәс ҡыуанып киттем', 'йөҙөҙ һағышлы', 'тик ниңәлер ул һуңғы йылдарҙа йыш ауырый башланы', 'нимә һас йөрөйгегөҙм әллә', 'оятпым хурлыҡ', 'ҡараһаң атай әсәйҙең ҙур фортаһын ҡулына алған да текәлеп тиҙ бултыра', 'һин дә хан балаһы түгел', 'юҡ миңә маддоннаның уң яҡҡабырғаһында бәүелеп', 'ә бына иляста кәше итеп ҡуйы бик шәп иткәнһең', 'беҙ быларҙы кемгә генә маҡтанып күрһәтмәнек']\n"
1930
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1931
  },
1932
  {
1933
  "data": {
 
 
 
 
 
1934
  "text/plain": [
1935
- "'Шағир дуҫымдың һүҙҙәрен ишеткәс, ҡыуанып киттем.'"
1936
  ]
1937
  },
1938
- "execution_count": 25,
1939
  "metadata": {},
1940
- "output_type": "execute_result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1941
  }
1942
  ],
1943
  "source": [
1944
  "from datasets import Audio, Dataset, load_dataset, load_metric\n",
1945
  "from transformers import AutoFeatureExtractor, pipeline\n",
1946
  "\n",
1947
- "dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"ba\", use_auth_token=True, split=\"train+validation\")\n",
1948
  "\n",
1949
  "# for testing: only process the first ten examples as a test\n",
1950
  "dataset = dataset.select(range(10))\n",
1951
  "\n",
1952
- "repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-bashkir'\n",
1953
  "\n",
1954
  "# load processor\n",
1955
  "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n",
 
212
  "base_uri": "https://localhost:8080/"
213
  },
214
  "id": "Mz4bubhxxsad",
215
+ "outputId": "23398525-cc19-43c2-9fec-497e06214f29",
216
+ "scrolled": true
217
  },
218
  "outputs": [
219
  {
 
1899
  },
1900
  {
1901
  "cell_type": "code",
1902
+ "execution_count": null,
1903
  "metadata": {},
1904
  "outputs": [
1905
  {
1906
  "name": "stderr",
1907
  "output_type": "stream",
1908
  "text": [
1909
+ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bg/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
1910
  ]
1911
  },
1912
  {
1913
  "data": {
1914
  "application/vnd.jupyter.widget-view+json": {
1915
+ "model_id": "16eaba3c65494a12921df272388262b6",
1916
  "version_major": 2,
1917
  "version_minor": 0
1918
  },
1919
  "text/plain": [
1920
+ "Downloading: 0%| | 0.00/1.99k [00:00<?, ?B/s]"
1921
  ]
1922
  },
1923
  "metadata": {},
1924
  "output_type": "display_data"
1925
  },
1926
  {
1927
+ "data": {
1928
+ "application/vnd.jupyter.widget-view+json": {
1929
+ "model_id": "a7d77fd025284c198de8148a66ac6e23",
1930
+ "version_major": 2,
1931
+ "version_minor": 0
1932
+ },
1933
+ "text/plain": [
1934
+ "Downloading: 0%| | 0.00/212 [00:00<?, ?B/s]"
1935
+ ]
1936
+ },
1937
+ "metadata": {},
1938
+ "output_type": "display_data"
1939
+ },
1940
+ {
1941
+ "data": {
1942
+ "application/vnd.jupyter.widget-view+json": {
1943
+ "model_id": "b42cb83436dc43e5aec83d4bc1b20317",
1944
+ "version_major": 2,
1945
+ "version_minor": 0
1946
+ },
1947
+ "text/plain": [
1948
+ "Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
1949
+ ]
1950
+ },
1951
+ "metadata": {},
1952
+ "output_type": "display_data"
1953
  },
1954
  {
1955
  "data": {
1956
+ "application/vnd.jupyter.widget-view+json": {
1957
+ "model_id": "0dc58dd4cf5542a4b5627d0748dd1574",
1958
+ "version_major": 2,
1959
+ "version_minor": 0
1960
+ },
1961
  "text/plain": [
1962
+ "Downloading: 0%| | 0.00/260 [00:00<?, ?B/s]"
1963
  ]
1964
  },
 
1965
  "metadata": {},
1966
+ "output_type": "display_data"
1967
+ },
1968
+ {
1969
+ "data": {
1970
+ "application/vnd.jupyter.widget-view+json": {
1971
+ "model_id": "c39a37f06df548caa547615d9edc8450",
1972
+ "version_major": 2,
1973
+ "version_minor": 0
1974
+ },
1975
+ "text/plain": [
1976
+ "Downloading: 0%| | 0.00/384 [00:00<?, ?B/s]"
1977
+ ]
1978
+ },
1979
+ "metadata": {},
1980
+ "output_type": "display_data"
1981
+ },
1982
+ {
1983
+ "data": {
1984
+ "application/vnd.jupyter.widget-view+json": {
1985
+ "model_id": "90f133429122486ebac0db594fc133a2",
1986
+ "version_major": 2,
1987
+ "version_minor": 0
1988
+ },
1989
+ "text/plain": [
1990
+ "Downloading: 0%| | 0.00/23.0 [00:00<?, ?B/s]"
1991
+ ]
1992
+ },
1993
+ "metadata": {},
1994
+ "output_type": "display_data"
1995
+ },
1996
+ {
1997
+ "data": {
1998
+ "application/vnd.jupyter.widget-view+json": {
1999
+ "model_id": "2ac0d8c0256d4470a28d8d7a2c451990",
2000
+ "version_major": 2,
2001
+ "version_minor": 0
2002
+ },
2003
+ "text/plain": [
2004
+ "Downloading: 0%| | 0.00/309 [00:00<?, ?B/s]"
2005
+ ]
2006
+ },
2007
+ "metadata": {},
2008
+ "output_type": "display_data"
2009
+ },
2010
+ {
2011
+ "data": {
2012
+ "application/vnd.jupyter.widget-view+json": {
2013
+ "model_id": "7a6b55c283d44d1ca6100799737eccbc",
2014
+ "version_major": 2,
2015
+ "version_minor": 0
2016
+ },
2017
+ "text/plain": [
2018
+ " 0%| | 0/10 [00:00<?, ?ex/s]"
2019
+ ]
2020
+ },
2021
+ "metadata": {},
2022
+ "output_type": "display_data"
2023
  }
2024
  ],
2025
  "source": [
2026
  "from datasets import Audio, Dataset, load_dataset, load_metric\n",
2027
  "from transformers import AutoFeatureExtractor, pipeline\n",
2028
  "\n",
2029
+ "dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"bg\", use_auth_token=True, split=\"train+validation\")\n",
2030
  "\n",
2031
  "# for testing: only process the first ten examples as a test\n",
2032
  "dataset = dataset.select(range(10))\n",
2033
  "\n",
2034
+ "repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-bulgarian'\n",
2035
  "\n",
2036
  "# load processor\n",
2037
  "feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)\n",