# Audio Sentiment Analysis

import gradio as gr
import whisper
from transformers import pipeline


# Whisper checkpoint used by `inference` for language detection and decoding.
model = whisper.load_model("base")

# Text classifier (GoEmotions label set) executed with the PyTorch backend.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="SamLowe/roberta-base-go_emotions",
    framework="pt",
)

def analyze_sentiment(text):
    """Classify *text* and return the predictions as a ``{label: score}`` dict.

    The module-level ``sentiment_analysis`` pipeline yields a sequence of
    ``{'label': ..., 'score': ...}`` records; they are folded into one mapping
    (a later record with the same label overwrites an earlier one).
    """
    scores_by_label = {}
    for prediction in sentiment_analysis(text):
        scores_by_label[prediction['label']] = prediction['score']
    return scores_by_label

def get_sentiment_emoji(sentiment):
    """Return the emoji associated with an emotion label.

    Args:
        sentiment: Emotion label (lower-case, e.g. ``"joy"``).

    Returns:
        The matching emoji, or an empty string when the label has no entry.
    """
    # One entry per emotion label the app knows how to decorate.
    emoji_by_label = dict(
        disappointment="😞",
        sadness="😢",
        annoyance="😠",
        neutral="😐",
        disapproval="👎",
        realization="😮",
        nervousness="😬",
        approval="👍",
        joy="😄",
        anger="😡",
        embarrassment="😳",
        caring="🤗",
        remorse="😔",
        disgust="🤢",
        grief="😥",
        confusion="😕",
        relief="😌",
        desire="😍",
        admiration="😌",
        optimism="😊",
        fear="😨",
        love="❤️",
        excitement="🎉",
        curiosity="🤔",
        amusement="😄",
        surprise="😲",
        gratitude="🙏",
        pride="🦁",
    )
    if sentiment in emoji_by_label:
        return emoji_by_label[sentiment]
    # Unknown labels are rendered without any emoji.
    return ""

def display_sentiment_results(sentiment_results, option):
    """Render a ``{label: score}`` mapping as one text line per label.

    Args:
        sentiment_results: Mapping of emotion label to score.
        option: ``"Sentiment Only"`` prints ``label emoji``;
            ``"Sentiment + Score"`` prints ``label emoji: score``.
            Any other value produces no lines at all.

    Returns:
        The newline-terminated lines concatenated into a single string.
    """
    lines = []
    for label, confidence in sentiment_results.items():
        icon = get_sentiment_emoji(label)
        if option == "Sentiment Only":
            lines.append(f"{label} {icon}\n")
        elif option == "Sentiment + Score":
            lines.append(f"{label} {icon}: {confidence}\n")
    return "".join(lines)

def inference(audio, sentiment_option):
    """Transcribe a recording and report its language, text and emotions.

    Args:
        audio: Path of the recorded audio file, as delivered by the
            ``gr.Audio(type="filepath")`` input. Empty/None when the user
            triggered the action without recording anything.
        sentiment_option: Display mode forwarded to
            ``display_sentiment_results``.

    Returns:
        A 3-tuple ``(language, transcription, sentiment_text)`` where
        ``language`` is the upper-cased code of the most probable language.

    Raises:
        gr.Error: If no audio was supplied, so the UI shows a readable message
            instead of an opaque failure from the audio loader.
    """
    if not audio:
        raise gr.Error("No audio received. Please record a clip and try again.")

    # Load the waveform and pad/trim it to the fixed window Whisper expects.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # The spectrogram must live on the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Keep the language with the highest detection probability.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # fp16=False forces full-precision decoding.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    sentiment_results = analyze_sentiment(result.text)
    sentiment_output = display_sentiment_results(sentiment_results, sentiment_option)

    return lang.upper(), result.text, sentiment_output

# Centered HTML heading and sub-heading rendered at the top of the page.
title = '<h1 align="center">Audio Sentiment Analysis</h1>'
subtitle = '<h6 align="center">Automatic Speech Recognition</h6>'
# Banner picture shown next to the description. The spelling is part of the
# file name (presumably the asset shipped with the app) - rename the file too
# if this is ever changed.
image_path = 'Arquitecture_W.jpg'
# HTML blurb rendered beside the banner image.
# The "Model size" line must stay in sync with the checkpoint passed to
# whisper.load_model (currently "base", listed at 74M parameters).
description = """
<p align="justify">With cross-modal interaction and AI (tools and pre-trained models in NLP), we can analyze large audio data
in real-time, such as recorded conversations, customer service calls, or voice recordings, in order to identify and categorize
emotions (from positive and neutral to sad and angry).</p><br>

Components of the tool:<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Input: Real-time multilingual<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Video Call speech recognition<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Pre-trained model: Whisper<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Model size: Base with 74M Parameters<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Encoder/Decoder Architecture <br>
&nbsp;&nbsp;&nbsp;&nbsp; - Transcribe, Translate, and Identify Audio<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Output: Sentiment analysis<br>
<br>
"""

# Extra CSS handed to gr.Blocks.
# Components are addressed through their `elem_id`, which is an HTML id, so
# the selectors need a leading '#'; without it they are parsed as element
# names and never match anything.
custom_css = """
#banner-image {
    margin-left: auto;
    margin-right: auto;
}
#chat-message {
    font-size: 300px;
    min-height: 600px;
}

img {
  border-radius: 8px;
  max-width: 100%;
  height: auto;
}

"""


# Root container of the app; `title` is the browser page title.
block = gr.Blocks(
    title="Analytics Projects by Ray Espinoza",
    theme='gradio/default',
    css=custom_css,
)
# Alternative stylings kept for reference:
#block = gr.Blocks(css=custom_css, title="Analytics Projects by Ray Espinoza")
#block = gr.Blocks(css=".gradio-container {background-color: black}", title="Analytics Projects by Ray Espinoza")
#block = gr.Blocks(css=".gradio-container {background: url('file=pic4.jpg')}", title="Analytics Projects by Ray Espinoza")

# Page layout: heading, banner + description row, then the recording controls
# and the three result fields that `inference` fills in.
with block:
    gr.HTML(title)
    gr.HTML(subtitle)

    with gr.Row():
        with gr.Column(scale=2):
            # elem_id is the hook the banner rule in custom_css is meant to target.
            gr.Image(image_path, elem_id="banner-image", show_label=False, show_download_button=False)
        with gr.Column():
            gr.HTML(description)

    with gr.Group():
        with gr.Box():
            # type="filepath" hands `inference` a path on disk.
            audio = gr.Audio(
                label="Input Audio",
                show_label=False,
                source="microphone",
                type="filepath"
            )

            sentiment_option = gr.Radio(
                choices=["Sentiment Only", "Sentiment + Score"],
                label="Select an option",
                # `value` sets the initial selection. The former `default=`
                # keyword is not a Radio parameter, which left nothing selected
                # and made the sentiment box come back empty.
                value="Sentiment Only"
            )

            btn = gr.Button("Execute: Transcribe", variant="primary")

        lang_str = gr.Textbox(label="Language:")

        text = gr.Textbox(label="Transcription:")

        # A Textbox becomes an output by being listed in `outputs=` below; no
        # extra flag is needed on the component itself.
        sentiment_output = gr.Textbox(label="Sentiment Analysis Results:")

        # Output order must match the tuple returned by `inference`.
        btn.click(inference, inputs=[audio, sentiment_option], outputs=[lang_str, text, sentiment_output])

        gr.HTML('''
        <div class="footer">
            <p>By <a href="https://github.com/rayespinozah" style="text-decoration: underline;" target="_blank"> Ray Espinoza Github</a>
            </p>
        </div>
        ''')

# share=True additionally requests a public share link for the running app.
block.launch(share=True)