Siddhant committed
Commit f7f39bd
1 Parent(s): 5e6b5bb

Update app.py

Files changed (1)
  1. app.py +108 -83
app.py CHANGED
@@ -1,105 +1,130 @@
+import gradio as gr
 from transformers import pipeline
-import torch
-
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-classifier = pipeline(
-    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
-)
-
-from transformers.pipelines.audio_utils import ffmpeg_microphone_live
-
-
-def launch_fn(
-    wake_word="marvin",
-    prob_threshold=0.5,
-    chunk_length_s=2.0,
-    stream_chunk_s=0.25,
-    debug=False,
-):
-    if wake_word not in classifier.model.config.label2id.keys():
-        raise ValueError(
-            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
-        )
-
-    sampling_rate = classifier.feature_extractor.sampling_rate
-
-    mic = ffmpeg_microphone_live(
-        sampling_rate=sampling_rate,
-        chunk_length_s=chunk_length_s,
-        stream_chunk_s=stream_chunk_s,
-    )
-
-    print("Listening for wake word...")
-    for prediction in classifier(mic):
-        prediction = prediction[0]
-        if debug:
-            print(prediction)
-        if prediction["label"] == wake_word:
-            if prediction["score"] > prob_threshold:
-                return True
-
-transcriber = pipeline(
-    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
-)
-import sys
-
-
-def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
-    sampling_rate = transcriber.feature_extractor.sampling_rate
-
-    mic = ffmpeg_microphone_live(
-        sampling_rate=sampling_rate,
-        chunk_length_s=chunk_length_s,
-        stream_chunk_s=stream_chunk_s,
-    )
-
-    print("Start speaking...")
-    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
-        sys.stdout.write("\033[K")
-        print(item["text"], end="\r")
-        if not item["partial"][0]:
-            break
-
-    return item["text"]
-
-from huggingface_hub import HfFolder
-import requests
-
-
-def query(text, model_id="tiiuae/falcon-7b-instruct"):
-    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
-    headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
-    payload = {"inputs": text}
-
-    print(f"Querying...: {text}")
-    response = requests.post(api_url, headers=headers, json=payload)
-    return response.json()[0]["generated_text"][len(text) + 1 :]
-
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-
-from datasets import load_dataset
-
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(
-        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
-    )
-    return speech.cpu()
-
-
-if __name__ == "__main__":
-    launch_fn(debug=True)
-    # transcription = transcribe()
-    # response = query(transcription)
-    # audio = synthesise(response)
-
-    # Audio(audio, rate=16000, autoplay=True)
+import numpy as np
+
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
+
+def transcribe(stream, new_chunk):
+    sr, y = new_chunk
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
+
+demo = gr.Interface(
+    transcribe,
+    ["state", gr.Audio(sources=["microphone"], streaming=True)],
+    ["state", "text"],
+    live=True,
+)
+
+demo.launch()
+# from transformers import pipeline
+# import torch
+
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# classifier = pipeline(
+#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
+# )
+
+# from transformers.pipelines.audio_utils import ffmpeg_microphone_live
+
+
+# def launch_fn(
+#     wake_word="marvin",
+#     prob_threshold=0.5,
+#     chunk_length_s=2.0,
+#     stream_chunk_s=0.25,
+#     debug=False,
+# ):
+#     if wake_word not in classifier.model.config.label2id.keys():
+#         raise ValueError(
+#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
+#         )
+
+#     sampling_rate = classifier.feature_extractor.sampling_rate
+
+#     mic = ffmpeg_microphone_live(
+#         sampling_rate=sampling_rate,
+#         chunk_length_s=chunk_length_s,
+#         stream_chunk_s=stream_chunk_s,
+#     )
+
+#     print("Listening for wake word...")
+#     for prediction in classifier(mic):
+#         prediction = prediction[0]
+#         if debug:
+#             print(prediction)
+#         if prediction["label"] == wake_word:
+#             if prediction["score"] > prob_threshold:
+#                 return True
+
+# transcriber = pipeline(
+#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
+# )
+# import sys
+
+
+# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
+#     sampling_rate = transcriber.feature_extractor.sampling_rate
+
+#     mic = ffmpeg_microphone_live(
+#         sampling_rate=sampling_rate,
+#         chunk_length_s=chunk_length_s,
+#         stream_chunk_s=stream_chunk_s,
+#     )
+
+#     print("Start speaking...")
+#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
+#         sys.stdout.write("\033[K")
+#         print(item["text"], end="\r")
+#         if not item["partial"][0]:
+#             break
+
+#     return item["text"]
+
+# from huggingface_hub import HfFolder
+# import requests
+
+
+# def query(text, model_id="tiiuae/falcon-7b-instruct"):
+#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
+#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
+#     payload = {"inputs": text}
+
+#     print(f"Querying...: {text}")
+#     response = requests.post(api_url, headers=headers, json=payload)
+#     return response.json()[0]["generated_text"][len(text) + 1 :]
+
+# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+
+# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+# from datasets import load_dataset
+
+# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+# def synthesise(text):
+#     inputs = processor(text=text, return_tensors="pt")
+#     speech = model.generate_speech(
+#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+#     )
+#     return speech.cpu()
+
+
+# if __name__ == "__main__":
+#     launch_fn(debug=True)
+#     # transcription = transcribe()
+#     # response = query(transcription)
+#     # audio = synthesise(response)
+
+#     # Audio(audio, rate=16000, autoplay=True)
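
Note on the new streaming callback: y /= np.max(np.abs(y)) divides by zero whenever a chunk is pure silence, and the commit drops the GPU device selection that the old script had. Below is a minimal sketch of a hardened version of the same callback, not part of this commit; the peak guard and the device argument are assumptions about how one might tighten it.

import numpy as np
import torch
from transformers import pipeline

# Assumption: same Whisper checkpoint as the commit, with the old GPU selection restored
# (device=0 selects the first GPU, -1 falls back to CPU).
device = 0 if torch.cuda.is_available() else -1
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
)

def transcribe(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # guard against all-zero (silent) chunks before normalizing
        y /= peak
    stream = y if stream is None else np.concatenate([stream, y])
    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]

The gr.Interface wiring from the commit is unchanged by this sketch; note that the callback re-transcribes the whole accumulated stream on every chunk, so latency grows with recording length.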