import gradio as gr
from transformers import pipeline
import numpy as np

# Load a small English-only Whisper checkpoint for streaming transcription.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

def transcribe(stream, new_chunk):
    # Each chunk arrives as a (sample_rate, samples) tuple from gr.Audio(streaming=True).
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Normalise to [-1, 1]; guard against all-zero (silent) chunks to avoid division by zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Accumulate audio across chunks so the full utterance is re-transcribed each time.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"], new_chunk

demo = gr.Interface(
    transcribe,
    # Inputs: accumulated audio state + streaming microphone chunks.
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    # Outputs: updated state, running transcription, and the latest chunk echoed back as audio.
    ["state", "text", gr.Audio(label="Output", streaming=True, autoplay=True)],
    live=True,
)

demo.launch()
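# Hedged sketch (not part of the original demo): the accumulated stream above grows
# without bound, so re-transcribing it gets slower over time. One assumed workaround
# is to keep only the most recent N seconds before calling the pipeline, e.g.:
#
# MAX_SECONDS = 30  # hypothetical window length
#
# def transcribe_windowed(stream, new_chunk):
#     sr, y = new_chunk
#     y = y.astype(np.float32)
#     peak = np.max(np.abs(y))
#     if peak > 0:
#         y /= peak
#     stream = y if stream is None else np.concatenate([stream, y])
#     stream = stream[-MAX_SECONDS * sr:]  # drop audio older than the window
#     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"], new_chunk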
# from transformers import pipeline
# import torch

# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# classifier = pipeline(
#     "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
# )

# from transformers.pipelines.audio_utils import ffmpeg_microphone_live


# def launch_fn(
#     wake_word="marvin",
#     prob_threshold=0.5,
#     chunk_length_s=2.0,
#     stream_chunk_s=0.25,
#     debug=False,
# ):
#     if wake_word not in classifier.model.config.label2id.keys():
#         raise ValueError(
#             f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
#         )

#     sampling_rate = classifier.feature_extractor.sampling_rate

#     mic = ffmpeg_microphone_live(
#         sampling_rate=sampling_rate,
#         chunk_length_s=chunk_length_s,
#         stream_chunk_s=stream_chunk_s,
#     )

#     print("Listening for wake word...")
#     for prediction in classifier(mic):
#         prediction = prediction[0]
#         if debug:
#             print(prediction)
#         if prediction["label"] == wake_word:
#             if prediction["score"] > prob_threshold:
#                 return True

# transcriber = pipeline(
#     "automatic-speech-recognition", model="openai/whisper-base.en", device=device
# )
# import sys


# def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
#     sampling_rate = transcriber.feature_extractor.sampling_rate

#     mic = ffmpeg_microphone_live(
#         sampling_rate=sampling_rate,
#         chunk_length_s=chunk_length_s,
#         stream_chunk_s=stream_chunk_s,
#     )

#     print("Start speaking...")
#     for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
#         sys.stdout.write("\033[K")
#         print(item["text"], end="\r")
#         if not item["partial"][0]:
#             break

#     return item["text"]

# from huggingface_hub import HfFolder
# import requests


# def query(text, model_id="tiiuae/falcon-7b-instruct"):
#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
#     headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
#     payload = {"inputs": text}

#     print(f"Querying...: {text}")
#     response = requests.post(api_url, headers=headers, json=payload)
#     return response.json()[0]["generated_text"][len(text) + 1 :]

# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# from datasets import load_dataset

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# def synthesise(text):
#     inputs = processor(text=text, return_tensors="pt")
#     speech = model.generate_speech(
#         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
#     )
#     return speech.cpu()


# if __name__ == "__main__":
#     launch_fn(debug=True)
#     # transcription = transcribe()
#     # response = query(transcription)
#     # audio = synthesise(response)
    
#     # Audio(audio, rate=16000, autoplay=True)
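#     # "Audio" above is presumably IPython.display.Audio (notebook-only playback).
#     # A hedged, script-friendly alternative would be to write the waveform to disk:
#     # import soundfile as sf
#     # sf.write("response.wav", audio.numpy(), samplerate=16000)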