# Hugging Face Space: awacke1/CSV2ClassifyVisualization
# Space status when this file was captured: Runtime error
# File size: 4,706 bytes

import gradio as gr
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from transformers import AutoTokenizer,AutoModelForSequenceClassification, Trainer

def linkifyHTML():
  """Return an HTML table whose "url" column contains clickable hyperlinks.

  The table has two columns: "url" (an ``<a target="_blank">`` anchor per
  row) and "label" (the string "question" for every row).

  Returns:
    str: the ``<table>`` markup produced by ``DataFrame.to_html``.
  """
  # (href, visible link text) pairs rendered as one table row each.
  links = [
    (
      "https://stackoverflow.com/questions/71641666/hyperlink-in-streamlit-dataframe",
      "Hyperlink in Streamlit dataframe",
    ),
    (
      "https://stackoverflow.com/questions/71731937/how-to-plot-comparison-in-streamlit-dynamically-with-multiselect",
      "How to plot comparison in Streamlit dynamically with multiselect?",
    ),
  ]
  df = pd.DataFrame(
    {
      "url": [
        f'<a target="_blank" href="{href}">{text}</a>' for href, text in links
      ],
      "label": ["question"] * len(links),
    }
  )
  # escape=False keeps the anchor tags as live markup instead of "&lt;a ..." text.
  # The table markup is returned as-is: displacy.render() only accepts spaCy
  # Doc/Span objects (or their dict form), so it cannot post-process an HTML
  # string. The unused streamlit import was dropped as well.
  return df.to_html(escape=False, index=False)


# summary function - single Gradio interface handler
def bulk_function(filename):
  """Classify the emotion of every text in an uploaded csv/xlsx file.

  The input file is expected to have the IDs in its first column and the
  texts in its second column (a leading "Unnamed: 0" index column is
  dropped first). Each text is scored with the
  "j-hartmann/emotion-english-distilroberta-base" model and the results are
  written next to the input file as "<input stem>_emotion_predictions.csv"
  with the columns: ID, text, max_label, max_score, anger, disgust, fear,
  joy, neutral, sadness, surprise (scores rounded to 3 decimals).

  Args:
    filename: uploaded-file object; only its ``.name`` attribute (the path
      on disk) is read.

  Returns:
    The path of the written csv file, or None when the file extension is
    neither ".csv" nor ".xlsx".

  Raises:
    ValueError: if the file has fewer than two data columns.
  """
  import os  # local import so the block does not depend on file-level imports

  # Create class for data preparation
  class SimpleDataset:
    """Minimal indexable dataset over a tokenizer output, for Trainer.predict."""

    def __init__(self, tokenized_texts):
      self.tokenized_texts = tokenized_texts

    def __len__(self):
      return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
      return {k: v[idx] for k, v in self.tokenized_texts.items()}

  # NOTE: the former `linkify()` / `gradio.HTML(html)` calls were removed.
  # Neither name exists in this module (the function is `linkifyHTML`, the
  # package alias is `gr`), so they raised NameError on every request, and
  # their result was discarded anyway.

  path = filename.name
  # splitext looks only at the final suffix, so dots elsewhere in the path
  # (e.g. "/tmp/tmp.x1/my.data.csv") no longer confuse the type check.
  stem, extension = os.path.splitext(path)
  extension = extension.lower()

  # check type of input file before doing any expensive model loading
  if extension == ".csv":
    df_input = pd.read_csv(path, index_col=False)
  elif extension == ".xlsx":
    df_input = pd.read_excel(path, index_col=False)
  else:
    return None

  # drop a serialized index column if one exists
  if len(df_input.columns) > 0 and df_input.columns[0] == "Unnamed: 0":
    df_input = df_input.drop("Unnamed: 0", axis=1)

  if len(df_input.columns) < 2:
    raise ValueError(
      "expected at least two columns (ID, text), got %d" % len(df_input.columns)
    )

  id_column, text_column = df_input.columns[0], df_input.columns[1]
  # ordered lists, so row i of every result lines up with row i of the input
  ids = df_input[id_column].to_list()
  lines_s = df_input[text_column].to_list()

  # load tokenizer and model, create trainer
  model_name = "j-hartmann/emotion-english-distilroberta-base"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  trainer = Trainer(model=model)

  # Tokenize texts and run predictions over the whole file
  tokenized_texts = tokenizer(lines_s, truncation=True, padding=True)
  predictions = trainer.predict(SimpleDataset(tokenized_texts))
  logits = predictions.predictions

  # Transform predictions to labels
  preds = logits.argmax(-1)
  labels = pd.Series(preds).map(model.config.id2label)

  # Softmax over the class axis, computed once. Subtracting the row maximum
  # leaves the result unchanged mathematically but avoids overflow in exp().
  exp_shifted = np.exp(logits - logits.max(-1, keepdims=True))
  probabilities = np.round(exp_shifted / exp_shifted.sum(-1, keepdims=True), 3)
  scores_rounded = probabilities.max(1)

  # Class indices 0..6 are reported under these column names, in this order.
  emotion_columns = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
  per_emotion_scores = probabilities[:, :len(emotion_columns)].T

  # define df (one row per input text)
  df = pd.DataFrame(
    list(zip(ids, lines_s, labels, scores_rounded, *per_emotion_scores)),
    columns=[id_column, text_column, "max_label", "max_score", *emotion_columns],
  )
  print(df)

  # save results to csv next to the uploaded file
  output_path = stem + "_emotion_predictions" + ".csv"
  df.to_csv(output_path, index=False)

  # return the file path for the space output
  return output_path
  
# Assemble the Gradio app: a single uploaded file goes into bulk_function
# and the path it returns is offered back as a downloadable file.
upload_input = gr.inputs.File(
  file_count="single",
  type="file",
  label="Upload file",
  optional=False,
)
download_output = gr.outputs.File(label="Output file")

demo = gr.Interface(
  bulk_function,
  inputs=[upload_input],
  outputs=[download_output],
  theme="huggingface",
  title="CSV File to Sentence Emotion Classification",
  description="Upload csv file with 2 columns (in order): (a) ID column, (b) text column. Model: https://huggingface.co/j-hartmann/emotion-english-distilroberta-base.",
  allow_flagging=False,
)

# Start the app as soon as the module is executed.
demo.launch(debug=True)