wasertech DrishtiSharma committed on
Commit
11fc755
0 Parent(s):

Duplicate from DrishtiSharma/ASR_using_Wav2Vec2

Browse files

Co-authored-by: Drishti Sharma <DrishtiSharma@users.noreply.huggingface.co>

Files changed (8) hide show
  1. .gitattributes +27 -0
  2. README.md +38 -0
  3. Test_File.wav +0 -0
  4. Test_File1.wav +0 -0
  5. Test_File2.wav +0 -0
  6. Test_File3.wav +0 -0
  7. app.py +67 -0
  8. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ASR using Wav2Vec 2.0
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ duplicated_from: DrishtiSharma/ASR_using_Wav2Vec2
10
+ ---
11
+
12
+ # Configuration
13
+
14
+ `title`: _string_
15
+ Display title for the Space
16
+
17
+ `emoji`: _string_
18
+ Space emoji (only emoji characters are allowed)
19
+
20
+ `colorFrom`: _string_
21
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
22
+
23
+ `colorTo`: _string_
24
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
25
+
26
+ `sdk`: _string_
27
+ Can be either `gradio` or `streamlit`
28
+
29
+ `sdk_version` : _string_
30
+ Only applicable for `streamlit` SDK.
31
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
32
+
33
+ `app_file`: _string_
34
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
35
+ Path is relative to the root of the repository.
36
+
37
+ `pinned`: _boolean_
38
+ Whether the Space stays on top of your list.
Test_File.wav ADDED
Binary file (457 kB). View file
 
Test_File1.wav ADDED
Binary file (142 kB). View file
 
Test_File2.wav ADDED
Binary file (360 kB). View file
 
Test_File3.wav ADDED
Binary file (311 kB). View file
 
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #References: 1. https://www.kdnuggets.com/2021/03/speech-text-wav2vec.html
2
+ #2. https://www.youtube.com/watch?v=4CoVcsxZphE
3
+ #3. https://www.analyticsvidhya.com/blog/2021/02/hugging-face-introduces-the-first-automatic-speech-recognition-model-wav2vec2/
4
+
5
+ #Importing all the necessary packages
6
+ import nltk
7
+ import librosa
8
+ import torch
9
+ import gradio as gr
10
+ from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
11
+ nltk.download("punkt")
12
+
13
# Load the pretrained checkpoint once at import time so every request
# reuses the same tokenizer and model instances.
model_name = "facebook/wav2vec2-base-960h"
# Tokenizer: turns raw waveforms into model inputs and decodes predicted ids.
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
# Acoustic model with a CTC head that produces per-frame token logits.
model = Wav2Vec2ForCTC.from_pretrained(model_name)
17
+
18
+
19
def load_data(input_file, target_sr=16000):
    """Load an audio file as a 1-D (mono) waveform at ``target_sr`` Hz.

    wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled
    at 16 kHz, so the default target rate is 16000.

    Args:
        input_file: Path to the audio file to read.
        target_sr: Sampling rate (in Hz) of the returned waveform.

    Returns:
        The mono waveform resampled to ``target_sr``.
    """
    # Let librosa down-mix and resample in a single step. Without `sr=` it
    # first resamples to its own default rate, forcing a second lossy
    # resample afterwards; passing the rate by keyword also avoids the
    # positional `librosa.resample(y, orig, target)` call that newer librosa
    # releases reject.
    speech, _ = librosa.load(input_file, sr=target_sr, mono=True)
    return speech
32
+
33
+
34
+
35
def correct_casing(input_sentence):
    """Capitalize the first character of each sentence in the transcript.

    The text is split into sentences with NLTK's sentence tokenizer, the
    leading character of every sentence is capitalized, and the sentences
    are joined back together with single spaces.
    """
    capitalized = []
    for sentence in nltk.sent_tokenize(input_sentence):
        # Only the leading character changes; the rest is kept verbatim.
        capitalized.append(sentence[0].capitalize() + sentence[1:])
    return ' '.join(capitalized)
40
+
41
+
42
+
43
def asr_transcript(input_file):
    """Generate a transcript for the provided audio input.

    Args:
        input_file: Path to the audio file to transcribe. May be ``None``
            (or empty) when the request is submitted without any audio.

    Returns:
        The transcribed text with the first letter of each sentence
        capitalized, or an empty string when no audio was supplied.
    """
    # Nothing was recorded/uploaded: return an empty transcript instead of
    # failing while trying to read a non-existent file.
    if not input_file:
        return ""

    speech = load_data(input_file)

    # Tokenize the raw waveform into model inputs.
    input_values = tokenizer(speech, return_tensors="pt").input_values
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: most likely token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    # Map the predicted ids back to text.
    transcription = tokenizer.decode(predicted_ids[0])
    # The decoded output is all upper case; lower it, then restore
    # sentence-initial capitals.
    return correct_casing(transcription.lower())
59
+
60
+
61
# Input component: microphone recording handed to the callback as a file path.
audio_input = gr.inputs.Audio(
    source="microphone",
    type="filepath",
    optional=True,
    label="Please record your voice",
)
# Output component: plain text box showing the transcript.
text_output = gr.outputs.Textbox(label="Output Text")

# Assemble the demo around asr_transcript and start serving it.
demo = gr.Interface(
    asr_transcript,
    inputs=audio_input,
    outputs=text_output,
    title="ASR using Wav2Vec 2.0",
    description="This application displays transcribed text for given audio input",
    examples=[["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]],
    theme="grass",
)
demo.launch()
67
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ nltk
2
+ transformers
3
+ torch
4
+ librosa