import json

import gradio as gr
import nltk
import torch
from huggingface_hub import hf_hub_download
from nltk.corpus import wordnet
from transformers import AutoConfig, AutoTokenizer

from models import BERTLstmCRF

# Only the WordNet corpus is used below; downloading it directly is much
# faster than shelling out to fetch the entire NLTK data collection.
nltk.download("wordnet")

checkpoint = "gundruke/bert-lstm-crf-absa"
config = AutoConfig.from_pretrained(checkpoint)
id2label = config.id2label

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BERTLstmCRF(config)

# Load the fine-tuned weights from the same Hub repo and keep everything on CPU.
model.load_state_dict(
    torch.load(
        hf_hub_download(repo_id=checkpoint, filename="pytorch_model.bin"),
        map_location=torch.device("cpu"),
    )
)
model.eval()  # inference only; disables dropout

# Word -> aspect-category lookup; read the JSON once at startup instead of
# re-opening the file for every token.
dictionary_file_path = hf_hub_download(repo_id=checkpoint, filename="dictionary.json")
with open(dictionary_file_path, "r") as file:
    category_dictionary = json.load(file)
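
# The dictionary is assumed to map surface words directly to one of the five
# aspect categories, e.g. {"waiter": "service", "pasta": "food"}; these
# entries are illustrative, not taken from the actual file.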

def tokenize_text(text):
    # Return both the raw WordPiece tokens (for display) and the encoded
    # inputs (for the model).
    tokens = tokenizer.tokenize(text)
    tokenized_text = tokenizer(text)

    return tokens, tokenized_text


def convert_to_multilabel(label_list):
    # Collapse the token-level BIO tags into sentence-level sentiment labels.
    multilabel = []
    if "B-POS" in label_list or "I-POS" in label_list:
        multilabel.append("Positive")
    if "B-NEG" in label_list or "I-NEG" in label_list:
        multilabel.append("Negative")
    if "B-NEU" in label_list or "I-NEU" in label_list:
        multilabel.append("Neutral")

    return " and ".join(multilabel)
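
# A quick illustration of the collapsing logic (hypothetical tag list):
#   convert_to_multilabel(["B-POS", "I-POS", "B-NEG", "O"]) -> "Positive and Negative"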


def classify_word(word, dictionary):
    synsets = wordnet.synsets(word)
    if synsets:
        hypernyms = synsets[0].hypernyms()  # Get the hypernym of the first synset
        if hypernyms:
            nltk_result = hypernyms[0].lemmas()[0].name()
        else:
            nltk_result = "Unknown"
    else:
        nltk_result = "Unknown"
    
    if word in dictionary:
        result = dictionary[word]
    elif nltk_result in ['atmosphere', 'drinks', 'food', 'price', 'service']:
        result = nltk_result
    else:
        result = 'other'

    return result, nltk_result
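
# Illustration of the lookup order, assuming "pizza" has no dictionary entry:
# WordNet's first synset for "pizza" has the hypernym "dish", which is not one
# of the five known categories, so classify_word("pizza", {}) would return
# ("other", "dish").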


def get_outputs(tokenized_text):
    input_ids = tokenized_text["input_ids"]
    token_type_ids = tokenized_text["token_type_ids"]
    attention_mask = tokenized_text["attention_mask"]

    # Batch of one: add a leading batch dimension to every field.
    inputs = {
        'input_ids': torch.tensor([input_ids]),
        'token_type_ids': torch.tensor([token_type_ids]),
        'attention_mask': torch.tensor([attention_mask])
    }

    with torch.no_grad():
        outputs = model(**inputs)

    # outputs[1] holds the decoded tag ids; dropping the first and last
    # entries strips the [CLS] and [SEP] positions so the labels line up
    # with the visible tokens.
    labels = [id2label.get(i) for i in torch.flatten(outputs[1]).tolist()][1:-1]

    return labels
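
# The returned list carries one BIO tag per WordPiece, in the same order as
# tokenizer.tokenize(text), e.g. ["O", "B-POS", "I-POS", "O"] for a
# four-piece input (hypothetical tags).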


def join_wordpieces(tokens, labels):
    # Merge WordPiece continuations ("##...") back into whole words; a merged
    # word keeps the label of its last piece, and "O" tags become None so the
    # HighlightedText widget leaves those words unhighlighted.
    joined_tokens = []

    for token, label in zip(tokens, labels):
        if label == "O":
            label = None
        if token.startswith("##"):
            last_token = joined_tokens[-1][0]
            joined_tokens[-1] = (last_token + token[2:], label)
        else:
            joined_tokens.append((token, label))

    return joined_tokens
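
# Example of the merge (hypothetical tags):
#   join_wordpieces(["egg", "##plant", "was", "good"],
#                   ["B-POS", "I-POS", "O", "O"])
#   -> [("eggplant", "I-POS"), ("was", None), ("good", None)]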


def get_category(word):
    # Resolve a labelled word to its aspect category via the preloaded
    # dictionary, with classify_word's WordNet hypernym lookup as fallback.
    result, _ = classify_word(word, category_dictionary)

    return result


def text_analysis(text):
    tokens, tokenized_text = tokenize_text(text)
    labels = get_outputs(tokenized_text)
    multilabel = convert_to_multilabel(labels)

    token_tuple = join_wordpieces(tokens, labels)

    # Attach an aspect category to every labelled token; unlabelled tokens
    # keep None so they stay unhighlighted.
    categories = []
    for tok in token_tuple:
        if tok[1]:
            categories.append((tok[0], get_category(tok[0])))
        else:
            categories.append((tok[0], None))

    return token_tuple, multilabel, categories


theme = gr.themes.Base()
with gr.Blocks(theme=theme) as demo:
    with gr.Column():
        input_textbox = gr.Textbox(placeholder="Enter sentence here...")
        btn = gr.Button("Submit", variant="primary")

        btn.click(fn=text_analysis,
                  inputs=input_textbox,
                  outputs=[gr.HighlightedText(label="Token labels"),
                           gr.Label(label="Multilabel classification"),
                           gr.HighlightedText(label="Category")],
                  queue=False)
    
    with gr.Column():
        examples = [
            ["I've been coming here as a child and always come back for the taste."],
            ["The tea is great and all the sweets are homemade."],
            ["Strong build which really adds to its durability but poor battery life."],
            ["We loved the recommendation for the wine, and I think the eggplant parmigiana appetizer should become an entree."],
            ["chicken pasta was tasty, wine was super nice but waiter was rude."]
        ]
        gr.Examples(examples, input_textbox)

demo.launch(debug=True)