Spaces:

ygauravyy
/

nanee-convo

Sleeping

App Files Files Community

ygauravyy committed 23 days ago

Commit

fff6648

•

1 Parent(s): 27386e5

Update app.py

Browse files

Files changed (1) hide show

app.py +224 -5

app.py CHANGED Viewed

@@ -1,7 +1,226 @@
-from fastapi import FastAPI
-app = FastAPI()
-@app.get("/")
-def greet_json():
-    return {"Hello": "World!"}

+import os
+import torch
+import argparse
+import gradio as gr
+import openai
+from zipfile import ZipFile
+import requests
+import se_extractor
+from api import BaseSpeakerTTS, ToneColorConverter
+import langid
+import traceback
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
+# Function to download and extract checkpoints
+def download_and_extract_checkpoints():
+    """Download and unpack the OpenVoice checkpoints unless already present.
+
+    Does nothing when a ``checkpoints`` path already exists in the working
+    directory. Otherwise streams the archive to ``checkpoints.zip``, extracts
+    it into the current directory, and deletes the archive.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status, so an
+            error page is never saved and unpacked as if it were the archive.
+        requests.RequestException: On connection failures or timeouts.
+        zipfile.BadZipFile: If the downloaded archive is not a valid zip.
+    """
+    zip_url = "https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip"
+    zip_path = "checkpoints.zip"
+    if os.path.exists("checkpoints"):
+        return
+    try:
+        print("Downloading checkpoints...")
+        # (connect, read) timeout: the read limit bounds each wait for data,
+        # not the whole download, so a large archive is still fine.
+        with requests.get(zip_url, stream=True, timeout=(10, 60)) as response:
+            response.raise_for_status()
+            with open(zip_path, "wb") as zip_file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        zip_file.write(chunk)
+        print("Extracting checkpoints...")
+        with ZipFile(zip_path, "r") as zip_ref:
+            zip_ref.extractall(".")
+    finally:
+        # Remove the archive on failure too, so no partial download is left behind.
+        if os.path.exists(zip_path):
+            os.remove(zip_path)
+    print("Checkpoints are ready.")
+# Call the function to ensure checkpoints are available
+# (runs at import time, before any model below is loaded)
+download_and_extract_checkpoints()
+# Initialize OpenAI API key
+openai.api_key = os.getenv("OPENAI_API_KEY")
+# Fail at startup when the key is missing or empty instead of on the first request
+if not openai.api_key:
+    raise ValueError("Please set the OPENAI_API_KEY environment variable.")
+# Command-line options: --share is a boolean flag that defaults to False
+parser = argparse.ArgumentParser()
+parser.add_argument("--share", action='store_true', default=False, help="make link public")
+args = parser.parse_args()
+# Define paths to checkpoints
+en_ckpt_base = 'checkpoints/base_speakers/EN'
+zh_ckpt_base = 'checkpoints/base_speakers/ZH'
+ckpt_converter = 'checkpoints/converter'
+# Use the GPU when one is available, otherwise fall back to the CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+# Load TTS models (one base speaker per language, plus the tone color converter)
+en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
+en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
+zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
+zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+# Load speaker embeddings
+# map_location places the tensors on `device` while loading, so embeddings
+# saved from a CUDA device can still be loaded on a CPU-only host.
+en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth', map_location=device)
+en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth', map_location=device)
+zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth', map_location=device)
+# Extract speaker embedding from the default Mickey Mouse audio
+default_speaker_audio = "resources/output.wav"
+try:
+    # The second value returned by get_se is not needed here
+    target_se, _ = se_extractor.get_se(
+        default_speaker_audio,
+        tone_color_converter,
+        target_dir='processed',
+        vad=True
+    )
+    print("Speaker embedding extracted successfully.")
+except Exception as e:
+    # Chain explicitly so the original failure is reported as the direct cause
+    raise RuntimeError(f"Failed to extract speaker embedding from {default_speaker_audio}: {str(e)}") from e
+# Supported languages (langid codes that have a base speaker model above)
+supported_languages = ['zh', 'en']
+def predict(audio_file_pth, agree):
+    """Answer a recorded question with a spoken reply in the target voice.
+
+    Steps: transcribe the recording with Whisper, detect its language, ask
+    the chat model for a reply, synthesize that reply with the matching base
+    speaker, then convert its tone color to the target speaker embedding.
+
+    Args:
+        audio_file_pth: Path of the recorded audio file, or None when nothing
+            was recorded.
+        agree: Whether the Terms & Conditions checkbox is ticked.
+
+    Returns:
+        A ``(message, audio_path)`` tuple. ``audio_path`` is the converted
+        reply on success and None whenever ``message`` reports an error.
+    """
+    # Guard clauses: terms must be accepted and a recording must exist
+    if not agree:
+        return ('[ERROR] Please accept the Terms & Conditions!\n', None)
+    if audio_file_pth is None:
+        return ("[ERROR] Please record your voice using the Microphone.\n", None)
+    # Speech-to-text through the OpenAI Whisper endpoint
+    try:
+        with open(audio_file_pth, 'rb') as recording:
+            transcript = openai.Audio.transcribe(
+                model="whisper-1",
+                file=recording,
+                response_format='text'
+            )
+        input_text = transcript.strip()
+        print(f"Transcribed Text: {input_text}")
+    except Exception as e:
+        return (f"[ERROR] Transcription failed: {str(e)}\n", None)
+    if not input_text:
+        return ("[ERROR] No speech detected in the audio.\n", None)
+    # Language detection; only languages listed in supported_languages pass
+    language_predicted = langid.classify(input_text)[0].strip()
+    print(f"Detected language: {language_predicted}")
+    if language_predicted not in supported_languages:
+        return (
+            f"[ERROR] The detected language '{language_predicted}' is not supported. Supported languages are: {supported_languages}\n",
+            None,
+        )
+    # Pick the base speaker model together with its matching source embedding
+    if language_predicted == "zh":
+        tts_model, language, source_se = zh_base_speaker_tts, 'Chinese', zh_source_se
+    else:
+        tts_model, language, source_se = en_base_speaker_tts, 'English', en_source_default_se
+    speaker_style = 'default'
+    # Ask the chat model for the character's reply
+    chat_messages = [
+        {"role": "system", "content": "You are Mickey Mouse, a friendly and cheerful character who responds to children's queries in a simple and engaging manner. Please keep your response up to 200 characters."},
+        {"role": "user", "content": input_text},
+    ]
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
+            messages=chat_messages,
+            max_tokens=200,
+            temperature=0.7,
+        )
+        reply_text = response['choices'][0]['message']['content'].strip()
+        print(f"GPT-4 Reply: {reply_text}")
+    except Exception as e:
+        return (f"[ERROR] Failed to get response from OpenAI GPT-4: {str(e)}\n", None)
+    # Synthesize the reply, then convert its tone color to the target speaker
+    src_path = os.path.join(output_dir, 'tmp_reply.wav')
+    save_path = os.path.join(output_dir, 'output_reply.wav')
+    try:
+        tts_model.tts(reply_text, src_path, speaker=speaker_style, language=language)
+        print(f"Audio synthesized and saved to {src_path}")
+        tone_color_converter.convert(
+            audio_src_path=src_path,
+            src_se=source_se,
+            tgt_se=target_se,
+            output_path=save_path,
+            message="@MickeyMouse"
+        )
+        print(f"Tone color conversion completed and saved to {save_path}")
+    except Exception as e:
+        failure = f"[ERROR] Failed to synthesize audio: {str(e)}\n"
+        # Log the full stack trace for this stage before reporting the error
+        traceback.print_exc()
+        return (failure, None)
+    return ("Response generated successfully.\n", save_path)
+# Build the UI: one row with an input column followed by an output column
+with gr.Blocks(analytics_enabled=False) as demo:
+    gr.Markdown("# Mickey Mouse Voice Assistant")
+    with gr.Row():
+        # First column: microphone recording, terms checkbox, and send button
+        with gr.Column():
+            # type="filepath": predict opens the recording from a path on disk
+            audio_input = gr.Audio(
+                source="microphone",
+                type="filepath",
+                label="Record Your Voice",
+                info="Click the microphone button to record your voice."
+            )
+            # Unchecked by default; predict returns an error until it is ticked
+            tos_checkbox = gr.Checkbox(
+                label="Agree to Terms & Conditions",
+                value=False,
+                info="I agree to the terms of service."
+            )
+            submit_button = gr.Button("Send")
+        # Second column: status text and the generated reply audio
+        with gr.Column():
+            info_output = gr.Textbox(
+                label="Info",
+                interactive=False,
+                lines=4,
+            )
+            audio_output = gr.Audio(
+                label="Mickey's Response",
+                interactive=False,
+                autoplay=True,
+            )
+    # predict returns (message, audio path), matching the order of `outputs`
+    submit_button.click(
+        predict,
+        inputs=[audio_input, tos_checkbox],
+        outputs=[info_output, audio_output]
+    )
+# Launch the Gradio app
+demo.queue()
+demo.launch(
+    # Listen on all interfaces; the port comes from PORT, defaulting to 7860
+    server_name="0.0.0.0",
+    server_port=int(os.environ.get("PORT", 7860)),
+    debug=True,
+    show_api=True,
+    # Honor the --share command-line flag (False unless passed) rather than
+    # hard-coding the value, which made the flag have no effect.
+    share=args.share
+)