patrickvonplaten committed on
Commit
3141b41
1 Parent(s): c102c06

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +4 -5
README.md CHANGED
@@ -48,7 +48,7 @@ model_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
48
  device = "cuda"
49
  processor_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
50
 
51
- chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]"
52
 
53
  model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
54
  processor = Wav2Vec2Processor.from_pretrained(processor_name)
@@ -97,7 +97,6 @@ CER calculation refer to https://huggingface.co/ctl/wav2vec2-large-xlsr-cantones
97
 
98
  ```python
99
  !mkdir cer
100
- !wget -O cer/cer.py https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/raw/main/cer.py
101
  !pip install jiwer
102
 
103
  import torchaudio
@@ -114,12 +113,12 @@ model_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
114
  device = "cuda"
115
  processor_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
116
 
117
- chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]"
118
 
119
  model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
120
  processor = Wav2Vec2Processor.from_pretrained(processor_name)
121
 
122
- ds = load_dataset("common_voice", 'zh-TW', data_dir="./cv-corpus-6.1-2020-12-11", split="test")
123
 
124
  resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
125
 
@@ -171,7 +170,7 @@ from transformers import AutoTokenizer, AutoModelWithLMHead
171
  model_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
172
  device = "cuda"
173
  processor_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
174
- chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]"
175
 
176
  tokenizer = AutoTokenizer.from_pretrained("ckiplab/gpt2-base-chinese")
177
  gpt_model = AutoModelWithLMHead.from_pretrained("ckiplab/gpt2-base-chinese").to(device)
 
48
  device = "cuda"
49
  processor_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
50
 
51
+ chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\\"#$%&()*+,\\-.\\:;<=>?@\\[\\]\\\\\\/^_`{|}~]"
52
 
53
  model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
54
  processor = Wav2Vec2Processor.from_pretrained(processor_name)
 
97
 
98
  ```python
99
  !mkdir cer
 
100
  !pip install jiwer
101
 
102
  import torchaudio
 
113
  device = "cuda"
114
  processor_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
115
 
116
+ chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\\"#$%&()*+,\\-.\\:;<=>?@\\[\\]\\\\\\/^_`{|}~]"
117
 
118
  model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
119
  processor = Wav2Vec2Processor.from_pretrained(processor_name)
120
 
121
+ ds = load_dataset("common_voice", 'zh-TW', split="test")
122
 
123
  resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
124
 
 
170
  model_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
171
  device = "cuda"
172
  processor_name = "voidful/wav2vec2-large-xlsr-53-tw-gpt"
173
+ chars_to_ignore_regex = r"[¥•"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'℃°•·.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\\"#$%&()*+,\\-.\\:;<=>?@\\[\\]\\\\\\/^_`{|}~]"
174
 
175
  tokenizer = AutoTokenizer.from_pretrained("ckiplab/gpt2-base-chinese")
176
  gpt_model = AutoModelWithLMHead.from_pretrained("ckiplab/gpt2-base-chinese").to(device)