KingNish committed on
Commit
59ab711
·
verified ·
1 Parent(s): 795e26d

Update voice_chat.py

Browse files
Files changed (1) hide show
  1. voice_chat.py +49 -51
voice_chat.py CHANGED
@@ -1,31 +1,25 @@
1
- import os
 
2
  import asyncio
3
  import tempfile
4
- import random
5
-
6
- import edge_tts
7
- from streaming_stt_nemo import Model as nemo
8
- import gradio as gr
9
- from transformers import pipeline
10
- from transformers import AutoTokenizer, AutoModelForCausalLM
11
- from transformers import AutoModel
12
- from huggingface_hub import InferenceClient
13
  import torch
 
 
 
14
 
15
# Language code used for speech recognition.
default_lang = "en"

# One speech-to-text engine per language code; only the default language is
# created here.
engines = {}
engines[default_lang] = nemo(default_lang)

# Load the "unum-cloud/uform-gen2-dpo" checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm this repository is trusted.
model3 = AutoModel.from_pretrained(
    "unum-cloud/uform-gen2-dpo",
    trust_remote_code=True,
)
 
 
22
 
23
- # Define a function for speech-to-text transcription
24
def transcribe(audio):
    """Return the first transcription result for `audio`.

    Args:
        audio: Value handed straight to the engine's `stt_file` method.

    Returns:
        The first element of what `stt_file` returns for the "en" engine.
    """
    english_engine = engines["en"]
    results = english_engine.stt_file(audio)
    return results[0]
29
 
30
  # Get Hugging Face API token
31
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -46,41 +40,45 @@ def client_fn(model):
46
  else:
47
  return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
48
 
 
49
 
50
- # Define a function to generate a random seed
51
def randomize_seed_fn(seed: int) -> int:
    """Return a fresh random seed between 0 and 999999 inclusive.

    Args:
        seed: Accepted for signature compatibility; its value is not used.

    Returns:
        A random integer drawn with `random.randint(0, 999999)`.
    """
    return random.randint(0, 999999)
54
 
55
- # System instructions for the language model
56
- system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
57
 
58
- # Define a function for language modeling
59
def models(text, model="Mixtral 8x7B", seed=42):
    """Generate the assistant's reply to `text` with the selected model.

    Args:
        text: The user's message; it is inserted between the system
            instructions and the "[OpenGPT 4o]" marker.
        model: Model choice forwarded to `client_fn` to obtain a client.
        seed: Passed to `randomize_seed_fn`, whose return value replaces it
            before the request is made.

    Returns:
        The streamed token texts joined into one string, with "</s>" tokens
        left out.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)
    formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
    stream = client.text_generation(
        formatted_prompt,
        max_new_tokens=512,
        seed=seed,
        stream=True,
        details=True,
        return_full_text=False,
    )
    # Collect the pieces and join once instead of growing a string in a loop.
    return "".join(
        response.token.text
        for response in stream
        if response.token.text != "</s>"
    )
76
-
77
- # Define an asynchronous function to handle voice input and generate responses
78
async def respond(audio, model, seed):
    """Transcribe `audio`, generate a reply, and yield the spoken reply's path.

    Args:
        audio: Audio input forwarded to `transcribe`.
        model: Model choice forwarded to `models`.
        seed: Seed value forwarded to `models`.

    Yields:
        Path of the temporary file the synthesized speech was saved to. The
        file is created with delete=False, so it is not removed here.
    """
    user = transcribe(audio)
    reply = models(user, model, seed)
    communicate = edge_tts.Communicate(reply)

    # Reserve a path, then close the handle before edge-tts writes to it:
    # a named temporary file that is still open cannot be reopened by name
    # on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    yield tmp_path
 
import asyncio
import os
import tempfile
from itertools import groupby

import edge_tts
import gradio as gr
import numpy as np
import onnxruntime as ort
import sentencepiece as spm
import soxr
import torch
from huggingface_hub import hf_hub_download, InferenceClient
from pydub import AudioSegment
12
 
13
# Speech recognition model configuration: the Hub repository to pull from and
# the sampling rate (Hz) that `resample` converts audio to.
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
sample_rate = 16000


def _onnx_asset(filename):
    """Download `filename` from the repository's "onnx" subfolder; return its local path."""
    return hf_hub_download(model_name, filename, subfolder="onnx")


# Preprocessor (TorchScript), encoder (ONNX Runtime session) and tokenizer
# (SentencePiece), each loaded from its downloaded file.
preprocessor = torch.jit.load(_onnx_asset("preprocessor.ts"))
encoder = ort.InferenceSession(_onnx_asset("model.onnx"))
tokenizer = spm.SentencePieceProcessor(_onnx_asset("tokenizer.spm"))
21
 
22
+ # Model Configuration
 
 
 
 
 
23
 
24
  # Get Hugging Face API token
25
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
40
  else:
41
  return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
42
 
43
+ system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
44
 
45
def resample(audio_fp32, sr):
    """Resample `audio_fp32` from `sr` Hz to the module-level `sample_rate`.

    Args:
        audio_fp32: Samples passed to `soxr.resample` as the input signal.
        sr: Sampling rate of `audio_fp32`.

    Returns:
        Whatever `soxr.resample` returns for the converted signal.
    """
    target_rate = sample_rate
    return soxr.resample(audio_fp32, sr, target_rate)
 
 
47
 
48
def to_float32(audio_buffer):
    """Convert an array of audio samples to float32.

    Integer samples are divided by the largest value their dtype can hold,
    so a full-scale positive sample maps to 1.0. Buffers that already hold
    floating-point samples are only cast to float32; `np.iinfo` would raise
    ValueError for them.

    Args:
        audio_buffer: NumPy array of samples with an integer or floating
            dtype.

    Returns:
        A float32 NumPy array with the same shape as `audio_buffer`.
    """
    if np.issubdtype(audio_buffer.dtype, np.floating):
        return audio_buffer.astype(np.float32)

    full_scale = np.iinfo(audio_buffer.dtype).max
    return np.divide(audio_buffer, full_scale, dtype=np.float32)
50
 
51
def transcribe(audio_path):
    """Transcribe the audio file at `audio_path` to text.

    The file is decoded with pydub, downmixed to one channel, scaled to
    float32, resampled to `sample_rate`, run through the module-level
    `preprocessor` and `encoder`, and greedily decoded with `tokenizer`.

    Args:
        audio_path: Path of an audio file readable by
            `AudioSegment.from_file`.

    Returns:
        The decoded text, or an empty string when the file has no samples.
    """
    # get_array_of_samples() interleaves the channels of multi-channel audio,
    # so downmix to mono before extracting the samples.
    audio_file = AudioSegment.from_file(audio_path).set_channels(1)
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())
    if audio_buffer.size == 0:
        # Nothing to recognise; skip the model entirely.
        return ""

    audio_fp32 = to_float32(audio_buffer)
    audio_16k = resample(audio_fp32, sr)

    # Add a leading batch dimension of size one for the preprocessor.
    input_signal = torch.tensor(audio_16k).unsqueeze(0)
    length = torch.tensor(len(audio_16k)).unsqueeze(0)
    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)

    # NOTE(review): 'length' is the raw sample count, while the preprocessor's
    # own length output is discarded — confirm which one the encoder expects.
    encoder_inputs = {
        'audio_signal': processed_signal.numpy(),
        'length': length.numpy(),
    }
    logits = encoder.run(None, encoder_inputs)[0][0]

    # Greedy decoding: take the best id per frame, collapse runs of the same
    # id into one, then drop the blank id (one past the tokenizer vocabulary).
    blank_id = tokenizer.vocab_size()
    frame_ids = logits.argmax(axis=1).tolist()
    decoded_prediction = [
        token_id for token_id, _ in groupby(frame_ids) if token_id != blank_id
    ]
    return tokenizer.decode_ids(decoded_prediction)
70
+
71
def model(text, model="Mixtral 8x7B"):
    """Generate the assistant's reply to `text` with the selected model.

    Args:
        text: The user's message; it is inserted between the system
            instructions and the "[OpenGPT 4o]" marker.
        model: Model choice forwarded to `client_fn` to obtain a client.

    Returns:
        The streamed token texts joined into one string, with "</s>" tokens
        left out.
    """
    client1 = client_fn(model)
    formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
    stream = client1.text_generation(
        formatted_prompt,
        max_new_tokens=512,
        stream=True,
        details=True,
        return_full_text=False,
    )

    pieces = []
    for response in stream:
        if response.token.text == "</s>":
            continue
        pieces.append(response.token.text)
    return "".join(pieces)
76
+
77
async def respond(audio, model):
    """Transcribe `audio`, generate a reply, and return the spoken reply's path.

    Args:
        audio: Audio input forwarded to `transcribe`.
        model: Model choice forwarded to the module-level `model()` function.

    Returns:
        Path of the temporary file the synthesized speech was saved to. The
        file is created with delete=False, so it is not removed here.
    """
    # The `model` parameter hides the module-level `model()` function inside
    # this body, so `model(user, model)` would try to call the parameter's
    # value. Fetch the function from the module namespace explicitly.
    generate_reply = globals()["model"]

    user = transcribe(audio)
    reply = generate_reply(user, model)
    communicate = edge_tts.Communicate(reply)

    # Reserve a path, then close the handle before edge-tts writes to it:
    # a named temporary file that is still open cannot be reopened by name
    # on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path