File size: 3,182 Bytes
06dd398
 
 
 
 
49e8fad
6dc7b04
516b487
6c8d2ca
 
 
10ee224
6c8d2ca
06dd398
6c8d2ca
6dc7b04
 
6c8d2ca
6dc7b04
 
6c8d2ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60199c
6c8d2ca
 
251af67
6c8d2ca
 
 
23e22b5
6c8d2ca
 
 
 
 
d60199c
6c8d2ca
 
 
461e4d3
6c8d2ca
 
 
 
 
 
d60199c
6c8d2ca
 
 
23e22b5
06dd398
6c8d2ca
516b487
6c8d2ca
 
06dd398
6c8d2ca
06dd398
6dc7b04
b39fefc
06dd398
 
 
 
1b123c7
06dd398
516b487
06dd398
 
49e8fad
6c8d2ca
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
import base64
import numpy as np
from scipy.io import wavfile
from voice_processing import parallel_tts, get_model_names
import os
import logging

# Set up logging: DEBUG level on the root logger, plus a module-level logger
# used by everything below.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def _save_audio_output(audio_output, tgt_sr, output_dir="outputs"):
    """Write *audio_output* to a randomly named ``.wav`` file and return its path.

    Args:
        audio_output: Either a ``numpy.ndarray`` (written with
            ``scipy.io.wavfile.write`` at ``tgt_sr``) or any other object
            accepted by a binary file ``write`` (dumped verbatim).
        tgt_sr: Sample rate handed to ``wavfile.write``; unused for the raw
            branch.
        output_dir: Directory that receives the file; created if missing.

    Returns:
        The path of the file that was written.

    Raises:
        Exception: Whatever the filesystem or ``wavfile.write`` raises; the
            caller is expected to handle it.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"output_{os.urandom(4).hex()}.wav")

    if isinstance(audio_output, np.ndarray):
        logger.debug("Saving numpy array with shape %s", audio_output.shape)
        wavfile.write(output_path, tgt_sr, audio_output)
    else:
        logger.debug("Saving raw audio data of type %s", type(audio_output))
        with open(output_path, "wb") as f:
            f.write(audio_output)

    return output_path


async def convert_tts(model_name, audio_file, slang_rate):
    """Run one voice-conversion request and save the resulting audio.

    Builds a single task tuple, hands it to ``parallel_tts`` and writes the
    returned audio under ``outputs/``. Errors are never raised to the caller;
    they are logged and reported through the first element of the return
    value.

    Args:
        model_name: Model identifier forwarded as the first task field.
        audio_file: Uploaded audio (the interface in this file passes a file
            path). ``None`` is rejected with an error result.
        slang_rate: Value forwarded as the fourth task field.

    Returns:
        A ``(status, path)`` tuple. On success ``status`` is
        ``{"info": <info from parallel_tts>}`` and ``path`` is the saved
        file. On any failure ``status`` is ``{"error": <message>}`` and
        ``path`` is ``None``.
    """
    # Local import keeps this block self-contained; only needed for the
    # executor hand-off below.
    import asyncio

    try:
        logger.debug(
            "Received request - model: %s, audio: %s, slang: %s",
            model_name,
            type(audio_file),
            slang_rate,
        )

        if audio_file is None:
            logger.error("No audio file provided")
            return {"error": "No audio file uploaded."}, None

        # Log the audio file details
        if hasattr(audio_file, "name"):
            logger.debug("Audio file name: %s", audio_file.name)
        logger.debug("Audio file type: %s", type(audio_file))

        # Task layout is dictated by parallel_tts (defined in
        # voice_processing, not visible here); the None/None/True fields are
        # passed through exactly as before.
        task = (model_name, None, None, slang_rate, True, audio_file)

        # parallel_tts is a plain synchronous call. Running it directly inside
        # this coroutine would block the event loop for the whole conversion,
        # so it is pushed onto the default thread-pool executor instead.
        logger.debug("Starting audio processing")
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, parallel_tts, [task])
        logger.debug("Processing result: %s", type(result))

        if not result or result[0] is None:
            logger.error("Processing failed - no result")
            return {"error": "Processing failed"}, None

        info, _, audio_payload = result[0]
        logger.debug("Processing complete - info: %s", info)

        # Guard before unpacking: a missing payload would otherwise surface
        # as an opaque "cannot unpack NoneType" error.
        if audio_payload is None:
            logger.error("No audio output generated")
            return {"error": "No audio output generated"}, None

        tgt_sr, audio_output = audio_payload
        if audio_output is None:
            logger.error("No audio output generated")
            return {"error": "No audio output generated"}, None

        # Save the output
        try:
            output_path = _save_audio_output(audio_output, tgt_sr)
        except Exception as save_error:
            logger.exception("Error saving output: %s", save_error)
            return {"error": f"Error saving output: {str(save_error)}"}, None

        logger.debug("Successfully saved to %s", output_path)
        return {"info": info}, output_path

    except Exception as e:
        # Top-level boundary for the UI handler: report instead of raising.
        logger.exception("Error in convert_tts: %s", e)
        return {"error": str(e)}, None

# Interface definition
# Inputs, in the positional order convert_tts expects them:
# model name, uploaded audio (delivered as a file path), slang rate.
_tts_inputs = [
    gr.Dropdown(choices=get_model_names(), label="Model", interactive=True),
    gr.Audio(label="Upload Audio", type="filepath"),
    gr.Slider(minimum=0, maximum=1, step=0.01, label="Slang Rate"),
]

# Outputs mirror convert_tts's (status dict, audio path) return pair.
_tts_outputs = [
    gr.JSON(label="Info"),
    gr.Audio(label="Converted Audio"),
]

iface = gr.Interface(
    fn=convert_tts,
    inputs=_tts_inputs,
    outputs=_tts_outputs,
    title="Voice Conversion",
)
# Turn on request queuing; rebind to whatever queue() returns, exactly as the
# chained call did.
iface = iface.queue()

if __name__ == "__main__":
    # Only start the server when executed as a script, not on import.
    launch_options = {
        "debug": True,
        "show_error": True,
        "max_threads": 10,
    }
    iface.launch(**launch_options)