File size: 4,508 Bytes
4024bc6
 
 
 
85e8a86
 
 
4024bc6
 
 
 
 
 
 
 
4058233
4024bc6
4058233
 
 
 
 
 
 
 
 
 
4024bc6
 
 
 
 
4058233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4024bc6
 
4058233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4024bc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85e8a86
 
 
4024bc6
 
 
 
 
 
 
d99ff1c
 
4024bc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4058233
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
import torch
import numpy as np
import pyaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from gtts import gTTS
import os

class VoiceAssistant:
    """Spanish voice assistant driven from a Streamlit page.

    Captures mono float32 audio from the first usable input device, watches
    a rolling wav2vec2 transcript for an activation / deactivation keyword,
    and answers with a gTTS-generated MP3.
    """

    # Seconds of newly captured audio between two keyword checks.
    KEYWORD_CHECK_INTERVAL_S = 1.0
    # Seconds of trailing audio transcribed at each keyword check.
    KEYWORD_WINDOW_S = 3.0

    def __init__(self):
        """Load the speech model and open the microphone stream.

        Raises:
            RuntimeError: if no input device can be opened.
        """
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")

        self.sample_rate = 16000
        self.chunk_size = 480

        self.keyword_activation = "jarvis"
        self.keyword_deactivation = "detente"

        self.listening = False

        self.p = pyaudio.PyAudio()
        self.stream = None
        try:
            self.input_device_index = self.select_input_device()
            self.stream = self.p.open(
                format=pyaudio.paFloat32,
                channels=1,
                rate=self.sample_rate,
                input=True,
                input_device_index=self.input_device_index,
                frames_per_buffer=self.chunk_size,
            )
        except Exception:
            # Do not leak the PortAudio handle when no device/stream is usable.
            self.p.terminate()
            raise

    def close(self):
        """Stop listening and release the audio stream and PortAudio handle.

        Safe to call more than once.
        """
        self.listening = False
        stream = getattr(self, "stream", None)
        if stream is not None:
            stream.stop_stream()
            stream.close()
            self.stream = None
        audio = getattr(self, "p", None)
        if audio is not None:
            audio.terminate()
            self.p = None

    def select_input_device(self):
        """Return the index of the first input device that can be opened.

        Every device reporting at least one input channel is printed, then
        each one is probed in index order with the capture settings used by
        the assistant.

        Raises:
            RuntimeError: if no device accepts those settings.
        """
        candidates = []
        for index in range(self.p.get_device_count()):
            info = self.p.get_device_info_by_index(index)
            if info['maxInputChannels'] > 0:
                print(f"Dispositivo {index}: {info['name']}")
                candidates.append(index)

        for index in candidates:
            try:
                probe = self.p.open(
                    format=pyaudio.paFloat32,
                    channels=1,
                    rate=self.sample_rate,
                    input=True,
                    input_device_index=index,
                    frames_per_buffer=self.chunk_size,
                )
                probe.close()
            except Exception:
                # Probing is best-effort: any failure just means "try the next one".
                continue
            return index

        raise RuntimeError("No input device found")

    def vad_collector(self):
        """Capture audio until a keyword is heard or listening stops.

        The keywords are searched in a transcript of the most recent
        ``KEYWORD_WINDOW_S`` seconds of audio, refreshed every
        ``KEYWORD_CHECK_INTERVAL_S`` seconds. (Raw sample buffers carry no
        text, so they cannot be searched for a word directly.)

        Hearing the deactivation keyword sets ``self.listening`` to False.

        Returns:
            A ``(audio_chunks, keyword_detected)`` tuple: every float32 chunk
            captured during the call (including the audio in which a keyword
            was heard) and whether the activation keyword was found.
        """
        audio_chunks, keyword_detected = [], False
        activation = self.keyword_activation.lower()
        deactivation = self.keyword_deactivation.lower()

        chunks_per_second = self.sample_rate / self.chunk_size
        chunks_per_check = max(1, round(self.KEYWORD_CHECK_INTERVAL_S * chunks_per_second))
        window_chunks = max(1, round(self.KEYWORD_WINDOW_S * chunks_per_second))

        while self.listening:
            try:
                # Transcription below can take longer than one buffer period;
                # tolerate dropped frames instead of raising on overflow.
                data = self.stream.read(self.chunk_size, exception_on_overflow=False)
            except Exception as e:
                st.error(f"Audio capture error: {e}")
                break

            audio_chunks.append(np.frombuffer(data, dtype=np.float32))
            if len(audio_chunks) % chunks_per_check:
                continue

            # NOTE(review): the Spanish model may spell the activation word
            # differently from the literal keyword -- confirm with real audio.
            heard = self.transcribe_audio(audio_chunks[-window_chunks:]).lower()

            if activation in heard:
                keyword_detected = True
                break

            if deactivation in heard:
                self.listening = False
                break

        return audio_chunks, keyword_detected

    def transcribe_audio(self, audio_chunks):
        """Transcribe a sequence of float32 audio chunks to text.

        Args:
            audio_chunks: sequence of 1-D sample arrays recorded at
                ``self.sample_rate``.

        Returns:
            The decoded transcription, or ``""`` when there is no audio.
        """
        if len(audio_chunks) == 0:
            return ""
        audio_data = np.concatenate(audio_chunks)
        if audio_data.size == 0:
            return ""

        input_values = self.processor(
            audio_data, return_tensors="pt", sampling_rate=self.sample_rate
        ).input_values
        with torch.no_grad():
            logits = self.model(input_values).logits

        # Greedy CTC decoding: most likely token per frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        return self.processor.decode(predicted_ids[0])

    def generate_response(self, text):
        """Return the (placeholder) reply for the transcribed *text*."""
        return "Respuesta generada para: " + text

    def text_to_speech(self, text):
        """Synthesize *text* as Spanish speech and return the MP3 path.

        The file is always written to ``response.mp3`` in the working
        directory, overwriting any previous reply.
        """
        tts = gTTS(text=text, lang='es')
        output_path = "response.mp3"
        tts.save(output_path)
        return output_path

    def run(self):
        """Render the page and, while listening, handle one capture cycle."""
        st.title("Asistente de Voz JARVIS")

        if st.button("Iniciar/Detener Escucha"):
            self.listening = not self.listening
            st.write("Escucha activada." if self.listening else "Escucha desactivada.")

        if not self.listening:
            return

        audio_chunks, keyword_detected = self.vad_collector()
        if not keyword_detected:
            return

        st.success("Palabra clave 'JARVIS' detectada. Procesando...")
        transcribed_text = self.transcribe_audio(audio_chunks)
        st.write(f"Texto transcrito: {transcribed_text}")

        response = self.generate_response(transcribed_text)
        st.write(f"Respuesta: {response}")

        audio_path = self.text_to_speech(response)
        st.audio(audio_path)

def main():
    """Run the assistant page, reusing one assistant per Streamlit session.

    Streamlit re-executes the script on every interaction. Building a new
    ``VoiceAssistant`` each time would reload the speech model, open another
    microphone stream, and reset ``listening`` to False, so the toggle
    button could never switch listening off. Keeping the instance in
    ``st.session_state`` preserves it across reruns.
    """
    if "assistant" not in st.session_state:
        st.session_state["assistant"] = VoiceAssistant()
    st.session_state["assistant"].run()

# Entry point when the file is executed as a script: runs main().
if __name__ == "__main__":
    main()