Spaces:

mskov
/

Speech-Trigger-Detection

Runtime error

File size: 4,239 Bytes

babca6f
 
cbe4d4c
 
c8e54ed
1ae8e53
df85058
585a1e8
53eb88c
 
 
 
 
 
 
28ff844
df85058
a94b06f
df85058
 
 
 
 
 
61fa7d4
 
 
34bf2a6
61fa7d4
 
 
 
4b9eea9
df85058
c8e54ed
53eb88c
c8e54ed
bbd3701
2cadcf2
f5e59d1
 
 
9d36990
f5e59d1
f10b2fa
6bfef5d
c8e54ed
73d041b
 
e95ab8a
b65fb2a
1ff03d5
c8e54ed
 
 
1ff03d5
53eb88c
73d041b
53eb88c
 
 
40948af
 
53eb88c
9c9e849
61fa7d4
73d041b
6666837
53eb88c
 
73d041b
 
df85058
f8fa917
df85058
 
2724e1c
c8e54ed
33b1b5b
53eb88c
61fa7d4
33b1b5b
ca7ae8f
 
335e90e
 
33b1b5b
53eb88c
30dbd25
c8e54ed

import os
os.system("pip install git+https://github.com/openai/whisper.git")
import evaluate
from evaluate.utils import launch_gradio_widget
import gradio as gr
import torch
from speechbrain.pretrained.interfaces import foreign_class
from transformers import AutoModelForSequenceClassification, pipeline, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer
# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

# Lookup tables used by the prediction function.

# Maps the short emotion labels produced by the speech emotion classifier to
# the display names returned to the UI.
emo_dict = {
    'sad': 'Sad',
    'hap': 'Happy',
    'ang': 'Anger',
    'neu': 'Neutral',
}

# Candidate labels handed to the zero-shot text classifier, keyed by the
# trigger category the user selects.
# Static classes for now, but it would be best to have the user select from
# multiple categories and to enter their own.
class_options = {
    "racism": [
        "racism",
        "hate speech",
        "bigotry",
        "racially targeted",
        "racially diminutive",
        "racial slur",
        "ethnic slur",
        "ethnic hate",
        "pro-white nationalism",
    ],
    "LGBTQ+ hate": [
        "gay slur",
        "trans slur",
        "homophobic slur",
        "transphobia",
        # Spelling corrected from "anti-LBGTQ+": the label text is what the
        # zero-shot model actually scores against.
        "anti-LGBTQ+",
        "hate speech",
    ],
    "sexually explicit": [
        "sexually explicit",
        "sexually coercive",
        "sexual exploitation",
        "vulgar",
        "raunchy",
        "sexually demeaning",
        "sexual violence",
        "victim blaming",
    ],
    "misophonia": [
        "chewing",
        "breathing",
        "mouthsounds",
        "popping",
        "sneezing",
        "yawning",
        "smacking",
        "sniffling",
        "panting",
    ],
}

# Speech-to-text pipeline (Whisper large), built once when the module loads
# and used by classify_toxicity to transcribe uploaded audio.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large",
)

# Prediction function backing the Gradio interface, plus its model loaders.

# The models are expensive to construct, so each one is built on first use
# and then reused across requests instead of being reloaded on every click.
_MODEL_CACHE = {}


def _cached_model(key, factory):
    """Return the model stored under *key*, building it with *factory* on first use."""
    if key not in _MODEL_CACHE:
        _MODEL_CACHE[key] = factory()
    return _MODEL_CACHE[key]


def _detect_emotion(audio_file):
    """Return the display name of the emotion predicted for *audio_file*."""
    emotion_classifier = _cached_model(
        "emotion",
        lambda: foreign_class(
            source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
        ),
    )
    # classify_file returns (out_prob, score, index, text_lab); only the
    # text label is needed here.
    *_, text_lab = emotion_classifier.classify_file(audio_file)
    label = text_lab[0]
    # Fall back to the raw label if the classifier emits one without a
    # display name.
    return emo_dict.get(label, label)


def classify_toxicity(audio_file, text_input, classify_anxiety):
    """Score a piece of speech or text for toxicity, trigger category and emotion.

    When an audio file is given it is transcribed and the transcript is
    analysed; otherwise ``text_input`` is analysed directly.

    Args:
        audio_file: Path to an audio file, or ``None`` to use ``text_input``.
        text_input: Text to analyse when no audio file is supplied.
        classify_anxiety: Key into ``class_options`` selecting the candidate
            labels for zero-shot classification. An unknown or missing key
            skips that step.

    Returns:
        A 4-tuple ``(toxicity_score, classification_output, emotion,
        transcribed_text)``. ``classification_output`` is the zero-shot
        pipeline result, or a dict of the same shape with empty ``labels`` and
        ``scores`` when no category was selected. ``emotion`` is ``None`` when
        no audio file was supplied, since emotion is detected from audio only.

    Raises:
        ValueError: If there is no non-blank text to classify.
    """
    has_audio = audio_file is not None

    # Transcribe the audio file using Whisper ASR, or fall back to typed text.
    if has_audio:
        transcribed_text = pipe(audio_file)["text"]
    else:
        transcribed_text = text_input

    # Fail early with a clear message instead of deep inside a model call.
    if not transcribed_text or not transcribed_text.strip():
        raise ValueError(
            "No text to classify: upload audio containing speech or enter text."
        )

    #### Toxicity classifier ####
    toxicity_module = _cached_model(
        "toxicity",
        lambda: evaluate.load(
            "toxicity", "facebook/roberta-hate-speech-dynabench-r4-target"
        ),
    )
    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
    toxicity_score = toxicity_results["toxicity"][0]
    print(toxicity_score)

    #### Text classification ####
    print(classify_anxiety, class_options)
    candidate_labels = class_options.get(classify_anxiety, [])
    if candidate_labels:
        text_classifier = _cached_model(
            "zero_shot",
            lambda: pipeline(
                "zero-shot-classification", model="facebook/bart-large-mnli"
            ),
        )
        classification_output = text_classifier(
            transcribed_text, candidate_labels, multi_label=True
        )
    else:
        # The zero-shot pipeline rejects an empty label list, so return an
        # empty result in the same shape when no category was selected.
        classification_output = {
            "sequence": transcribed_text,
            "labels": [],
            "scores": [],
        }
    print(classification_output)

    #### Emotion classification ####
    # Emotion is detected from the audio signal, so it is only available when
    # an audio file was supplied.
    emotion = _detect_emotion(audio_file) if has_audio else None

    return toxicity_score, classification_output, emotion, transcribed_text
 
# Gradio UI: pick a trigger category, supply audio or text, and run the
# classifier. classify_toxicity returns four values, so four output
# components are wired to the click handler, in the same order.
with gr.Blocks() as iface:
    with gr.Column():
        # Choices come from class_options so the radio and the label table
        # cannot drift apart.
        classify = gr.Radio(list(class_options), label="Trigger category")
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        text = gr.Textbox(label="Enter Text", placeholder="Enter text here...")
        # A button's caption is its value.
        submit_btn = gr.Button(value="Run")
    with gr.Column():
        out_toxicity = gr.Textbox(label="Toxicity Score")
        out_classification = gr.JSON(label="Classification")
        out_emotion = gr.Textbox(label="Emotion")
        out_text = gr.Textbox(label="Transcription")
    submit_btn.click(
        fn=classify_toxicity,
        inputs=[aud_input, text, classify],
        outputs=[out_toxicity, out_classification, out_emotion, out_text],
    )

iface.launch()