"""Multimodal Gradio demo.

Three tabs share one ``gr.Blocks`` app:

* speech-to-text with a Spanish wav2vec2 checkpoint,
* sentiment classification of free text,
* image classification with MobileNetV2.
"""

import gradio as gr
import tensorflow as tf
from transformers import pipeline

# NOTE: despite its name this is a MobileNetV2 model, not an Inception
# network. The name is kept so existing references keep working.
inception_net = tf.keras.applications.MobileNetV2()

# How many decoded classes to hand to the label component; the component
# itself decides how many of them it displays.
TOP_PREDICTIONS = 100


def classify_imagen(inp):
    """Classify an image with MobileNetV2.

    Args:
        inp: Image array delivered by the ``gr.Image`` input. It is reshaped
            to ``(-1, 224, 224, 3)``, so it is assumed to hold exactly one
            224x224 RGB image (TODO confirm the component always delivers
            that shape). ``None`` means no image was provided.

    Returns:
        A ``{class_name: score}`` dict with the ``TOP_PREDICTIONS`` decoded
        classes, or ``None`` when no image was provided.
    """
    if inp is None:
        # Button pressed without an image: return an empty result instead
        # of failing on ``None.reshape``.
        return None

    batch = inp.reshape((-1, 224, 224, 3))
    batch = tf.keras.applications.mobilenet_v2.preprocess_input(batch)
    prediction = inception_net.predict(batch).reshape(1, 1000)
    decoded = tf.keras.applications.mobilenet_v2.decode_predictions(
        prediction, top=TOP_PREDICTIONS
    )
    # Each decoded entry is indexed as (_, class_name, score); iterating the
    # list keeps the entry count tied to the single ``top=`` argument.
    return {f"{name}": float(score) for _, name, score in decoded[0]}


def audio2text(audio):
    """Transcribe a recording with the speech-recognition pipeline.

    Args:
        audio: Value delivered by the ``gr.Audio`` input (configured with
            ``type="filepath"``), or ``None`` when nothing was recorded.

    Returns:
        The ``"text"`` field of the pipeline output, or an empty string when
        there is no recording.
    """
    if audio is None:
        # Nothing recorded yet; the pipeline cannot process ``None``.
        return ""
    return trans(audio)["text"]


def text2sentiment(text):
    """Return the ``label`` of the first result the classifier gives for *text*."""
    return classificator(text)[0]["label"]


# The pipelines are created at import time so the checkpoints are loaded once
# and shared by every request.
trans = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-large-xlsr-53-spanish",
)
classificator = pipeline(
    "text-classification",
    model="pysentimiento/robertuito-sentiment-analysis",
)

demo = gr.Blocks()

with demo:
    gr.Markdown("# Multimodal Demo")
    with gr.Tabs():
        with gr.TabItem("Transcribe Audio en español"):
            with gr.Row():
                audio = gr.Audio(source="microphone", type="filepath")
                transcript = gr.Textbox()
            b1 = gr.Button("Transcribe")
        with gr.TabItem("Analisis de sentimiento"):
            with gr.Row():
                texto = gr.Textbox()
                sentiment_label = gr.Label()
            b2 = gr.Button("Sentimiento")
        with gr.TabItem("Clasificador de imagenes"):
            with gr.Row():
                image = gr.Image(shape=(224, 224))
                image_label = gr.Label(num_top_classes=3)
            bimage = gr.Button("Clasifica")

    # Each output component has its own name, so the wiring below does not
    # depend on the order in which the tabs were declared.
    b1.click(audio2text, inputs=audio, outputs=transcript)
    b2.click(text2sentiment, inputs=texto, outputs=sentiment_label)
    bimage.click(classify_imagen, inputs=image, outputs=image_label)

if __name__ == "__main__":
    demo.launch()