Spaces:

musfiqdehan
/

Bangla-PoS-Taggers

Runtime error

File size: 4,791 Bytes

407b426

"""
This module contains functions to get PoS tags using spaCy, NLTK, Flair or TextBlob and return them as an HTML table
"""

from .alignment_mappers import get_alignment_mapping

from flair.models import SequenceTagger
from flair.data import Sentence

import spacy
from spacy.cli import download
# Runs at import time, i.e. on every import of this module. Presumably this
# must precede `import en_core_web_sm` so the model package is installed
# before it is imported — confirm before reordering these lines.
download("en_core_web_sm")
import en_core_web_sm

import nltk
# Also executed at import time. Presumably these resources back
# `nltk.tokenize.word_tokenize` and `nltk.pos_tag`, which
# `get_nltk_postag_dict` calls — verify against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from textblob import TextBlob


def get_spacy_postag_dict(target=""):
    """
    Build a token-text -> PoS-tag mapping for `target` using spaCy.

    The `en_core_web_sm` pipeline is loaded on each call and run over
    `target`; every resulting token contributes its `text` as key and its
    `tag_` attribute as value. If the same text occurs more than once, the
    tag of the last occurrence is the one kept.
    """
    pipeline = en_core_web_sm.load()
    doc = pipeline(target)
    return {tok.text: tok.tag_ for tok in doc}

def get_nltk_postag_dict(target=""):
    """
    Build a token -> PoS-tag mapping for `target` using NLTK.

    `target` is split with `nltk.tokenize.word_tokenize` and the tokens are
    tagged with `nltk.pos_tag`; the resulting (token, tag) pairs become the
    dictionary entries. For a repeated token the last pair wins.
    """
    words = nltk.tokenize.word_tokenize(target)
    tagged_pairs = nltk.pos_tag(words)
    return {word: tag for word, tag in tagged_pairs}

def get_flair_postag_dict(target=""):
    """
    Build a token-text -> PoS-tag mapping for `target` using Flair.

    Loads the "pos" `SequenceTagger` on each call, wraps `target` in a
    `Sentence`, lets the tagger annotate it in place, and then collects
    each token's `text` and `tag`. For a repeated token text the last
    occurrence wins.
    """
    pos_model = SequenceTagger.load("pos")
    sentence = Sentence(target)
    # `predict` annotates the sentence object in place.
    pos_model.predict(sentence)
    return {tok.text: tok.tag for tok in sentence}

def get_textblob_postag_dict(target=""):
    """
    Build a word -> PoS-tag mapping for `target` using TextBlob.

    The (word, tag) pairs exposed by `TextBlob(target).tags` become the
    dictionary entries. For a repeated word the last pair wins.
    """
    tagged_pairs = TextBlob(target).tags
    return {word: tag for word, tag in tagged_pairs}

def get_postag(
        get_postag_dict,
        source="",
        target="",
        model_path="musfiqdehan/bn-en-word-aligner"):
    """Align source/target words and render them with PoS tags as HTML.

    Args:
        get_postag_dict: Callable invoked as ``get_postag_dict(target=target)``
            that returns a mapping from target-token text to its PoS tag.
        source: Source sentence; its words fill the "Bangla" column.
        target: Target sentence; its words fill the "English" column.
        model_path: Aligner model identifier, forwarded unchanged to
            ``get_alignment_mapping``.

    Returns:
        A ``(html_table, pos_accuracy)`` tuple. ``html_table`` is an HTML
        ``<table>`` string with one row per aligned word pair followed by one
        row per distinct unaligned source word. ``pos_accuracy`` is the
        share of source tokens not reported as unaligned, formatted as a
        percentage with two decimals (``"0.00%"`` when there are no source
        tokens).
    """
    # Local import so this function does not depend on a file-level import.
    from html import escape

    sent_src, sent_tgt, align_words = get_alignment_mapping(
        source=source, target=target, model_path=model_path
    )
    postag_dict = get_postag_dict(target=target)

    # Built once, outside the loop. A set of characters is used instead of
    # substring matching against the raw string, so "" is no longer treated
    # as punctuation while runs such as "..." or "?!" are.
    punc_chars = frozenset(r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~""")

    def is_punctuation(token):
        return bool(token) and all(ch in punc_chars for ch in token)

    def make_row(src_cell, tgt_cell, tag_cell):
        # Cell text originates from user input, so escape it before it is
        # embedded in markup.
        return (
            "<tr>\n"
            f"<td> {escape(str(src_cell))} </td>\n"
            f"<td> {escape(str(tgt_cell))} </td>\n"
            f"<td> {escape(str(tag_cell))} </td>\n"
            "</tr>"
        )

    # <thead> cells are wrapped in a <tr>, and <tbody> is opened exactly once
    # here (previously it was opened per punctuation row, or not at all).
    parts = [
        "<table>",
        "<thead>",
        "<tr>",
        "<th>Bangla</th>",
        "<th>English</th>",
        "<th>PoS Tags</th>",
        "</tr>",
        "</thead>",
        "<tbody>",
    ]

    mapped_src_words = set()
    for i, j in sorted(align_words):
        src_word = sent_src[i]
        tgt_word = sent_tgt[j]
        mapped_src_words.add(src_word)

        if is_punctuation(src_word) or is_punctuation(tgt_word):
            tag = "PUNC"
        else:
            # The tagger may tokenise differently from the aligner, so the
            # aligned word can be missing from the tag mapping; report it as
            # unknown instead of raising KeyError.
            tag = postag_dict.get(tgt_word, "UNK")

        parts.append(make_row(src_word, tgt_word, tag))

    # Distinct source words with no alignment, in first-occurrence order so
    # the output is deterministic.
    unks = [word for word in dict.fromkeys(sent_src)
            if word not in mapped_src_words]
    for word in unks:
        parts.append(make_row(word, "N/A", "UNK"))

    parts.append("</tbody>")
    parts.append("</table>")
    html_table = "\n".join(parts)

    # Guard against an empty source sentence (would divide by zero).
    total = len(sent_src)
    ratio = (total - len(unks)) / total if total else 0.0
    pos_accuracy = f"{ratio:0.2%}"

    return html_table, pos_accuracy


def select_pos_tagger(src, tgt, tagger):
    """
    Run `get_postag` with the tagging backend named by `tagger`.

    `tagger` is compared (case-sensitively) against "spaCy", "NLTK",
    "Flair" and "TextBlob". On a match, the corresponding
    `get_*_postag_dict` function is handed to `get_postag` together with
    `src`, `tgt` and the "musfiqdehan/bn-en-word-aligner" model path, and
    its `(result, pos_accuracy)` pair is returned. Any other value yields
    `(None, None)`.
    """
    if tagger == "spaCy":
        postag_dict_fn = get_spacy_postag_dict
    elif tagger == "NLTK":
        postag_dict_fn = get_nltk_postag_dict
    elif tagger == "Flair":
        postag_dict_fn = get_flair_postag_dict
    elif tagger == "TextBlob":
        postag_dict_fn = get_textblob_postag_dict
    else:
        # Unrecognised tagger name: nothing to compute.
        return None, None

    table, accuracy = get_postag(
        postag_dict_fn,
        source=src,
        target=tgt,
        model_path="musfiqdehan/bn-en-word-aligner",
    )
    return table, accuracy