import gradio as gr
import openai
import requests
import json
import os

openai.api_key = os.environ.get('OPENAI_API_KEY')
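# Besides OPENAI_API_KEY, the app reads two more environment variables below:
# `voice_id` (the ElevenLabs voice to use) and `elevenlabs_api_key`. Set all three before launching.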

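# `messages` holds the running conversation (system prompt plus every user/assistant turn),
# so each new request to the chat API carries the full context of earlier questions.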
messages = [{"role": "system", "content": 'You are Steve Jobs. Respond to all input in 25 words or less.'}]

# Set up the API endpoint URL and headers
url = f"https://api.elevenlabs.io/v1/text-to-speech/{os.environ.get('voice_id')}/stream"
headers = {
    "accept": "*/*",
    "xi-api-key": os.environ.get('elevenlabs_api_key'),
    "Content-Type": "application/json",
}
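# The /stream variant of the text-to-speech endpoint returns raw audio chunks rather than a
# JSON body, which is why the request below is made with stream=True and written chunk by chunk.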

# Define a function to handle the Gradio input and generate the response
def transcribe(audio):
    global messages

    # Use OpenAI to transcribe the user's audio input
    # API call 1
    with open(audio, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)
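    # With the legacy openai-python 0.x client used here, `transcript` behaves like a dict
    # whose "text" field holds the recognized speech.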

    # Append the user's message to the message history
    messages.append({"role": "user", "content": transcript["text"]})

    # Generate a response using OpenAI's chat API
    # API call 2
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)

    # Extract the assistant's reply from the API response and append it to the message history
    assistant_message = response["choices"][0]["message"]
    messages.append(assistant_message)

    # API call 3
    # Use the ElevenLabs voice synthesis API to generate an audio response from the assistant's reply
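    # stability and similarity_boost are floats in [0, 1]; leaving both at 0 keeps the
    # delivery at ElevenLabs' most variable setting (adjust to taste).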
    data = {
        "text": assistant_message["content"],
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0
        }
    }
    response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)

    # Save the audio response to a file
    if response.ok:
        with open("output.wav", "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
    else:
        print(f"Error: {response.status_code} - {response.reason}")
        
    # IPython.display.display(IPython.display.Audio('output.wav'))

    # Generate a chat transcript for display in the Gradio UI
    chat_transcript = ""
    for message in messages:
        if message['role'] != 'system':
            chat_transcript += message['role'] + ": " + message['content'] + "\n\n"

    return chat_transcript, 'output.wav'

# css = """
#       #col-container {max-width: 80%; margin-left: auto; margin-right: auto;}
#       #header {text-align: center;}
#       """

# with gr.Blocks(css=css) as ui:
    
    
#     with gr.Column(elem_id="col-container"):
#         gr.Markdown("""## Talk to AI Steve Jobs: Audio-to-Text+Audio generation
#                     Powered by ChatGPT + Whisper + ElevenLabs + HuggingFace <br>
#                     <br>
#                     """,
#                     elem_id="header")

# Define the Gradio UI interface
# ui = gr.Interface(fn=transcribe, inputs=gr.Audio(source="microphone", type="filepath"), outputs="text")
ui = gr.Interface(fn=transcribe,
                  inputs=gr.Audio(source="microphone", type="filepath"),
                  outputs=['text', 'audio'],
                  title='Talk to AI Steve Jobs',
                  description="""Click on Record from microphone and start speaking. When you're done, click on Stop Recording, then on Submit; AI Steve will answer your question.
                  To ask a follow-up question, click on Clear, then repeat Record from microphone -> Stop Recording -> Submit. AI Steve Jobs also remembers the previous questions and answers.""")
ui.launch(debug=True)
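
# Running this file directly (e.g. `python app.py`) starts a local Gradio server;
# debug=True keeps the process in the foreground and prints tracebacks to the console.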