Spaces:
Running
Running
""" | |
This module contains functions that get PoS tags using spaCy, NLTK, Flair, or TextBlob and return them as an HTML table
""" | |
from .alignment_mappers import get_alignment_mapping, select_model | |
from flair.models import SequenceTagger | |
from flair.data import Sentence | |
import spacy | |
from spacy.cli import download | |
download("en_core_web_sm") | |
import en_core_web_sm | |
import nltk | |
nltk.download('punkt') | |
nltk.download('averaged_perceptron_tagger') | |
from textblob import TextBlob | |
def get_spacy_postag_dict(target=""):
    """Return a ``{token text: tag}`` dict for *target* using spaCy.

    The ``en_core_web_sm`` pipeline is loaded on every call and run over
    *target*; each token's ``text`` is mapped to its ``tag_`` attribute.
    When the same token text occurs more than once, the tag of the last
    occurrence is the one kept.
    """
    pipeline = en_core_web_sm.load()
    doc = pipeline(target)
    return {tok.text: tok.tag_ for tok in doc}
def get_nltk_postag_dict(target=""):
    """Return a ``{token: tag}`` dict for *target* using NLTK.

    *target* is split with ``nltk.tokenize.word_tokenize`` and the tokens
    are tagged with ``nltk.pos_tag``. When a token occurs more than once,
    the tag of the last occurrence is the one kept.
    """
    tokens = nltk.tokenize.word_tokenize(target)
    tagged_pairs = nltk.pos_tag(tokens)
    # pos_tag yields (token, tag) pairs, which dict() consumes directly.
    return dict(tagged_pairs)
def get_flair_postag_dict(target=""):
    """Return a ``{token text: tag}`` dict for *target* using Flair.

    The ``"pos"`` sequence tagger is loaded on every call, *target* is
    wrapped in a ``Sentence`` and tagged in place by ``predict``. Each
    token's ``text`` is then mapped to its ``tag`` attribute; for repeated
    token texts the last occurrence wins.
    """
    pos_model = SequenceTagger.load("pos")
    sentence = Sentence(target)
    # predict() annotates the sentence's tokens in place.
    pos_model.predict(sentence)
    return {tok.text: tok.tag for tok in sentence}
def get_textblob_postag_dict(target=""):
    """Return a ``{word: tag}`` dict for *target* using TextBlob.

    Built from the ``(word, tag)`` pairs exposed by ``TextBlob(target).tags``;
    for repeated words the last pair wins.
    """
    tagged_pairs = TextBlob(target).tags
    return {word: tag for word, tag in tagged_pairs}
def _postag_row(src_word, tgt_word, tag):
    """Render one ``<tr>`` of the PoS table, HTML-escaping every cell."""
    from html import escape

    return f'''
        <tr>
            <td> {escape(str(src_word))} </td>
            <td> {escape(str(tgt_word))} </td>
            <td> {escape(str(tag))} </td>
        </tr>
    '''


def get_postag(
        get_postag_dict,
        source="",
        target="",
        model_name="musfiqdehan/bn-en-word-aligner"):
    """Align source and target words, tag the target, and build an HTML table.

    Args:
        get_postag_dict: Callable invoked as ``get_postag_dict(target=target)``
            that returns a mapping from target token to PoS tag.
        source: Source sentence (shown in the "Bangla" column).
        target: Target sentence (shown in the "English" column).
        model_name: Aligner model name forwarded to ``get_alignment_mapping``.

    Returns:
        A ``(html_table, pos_accuracy)`` tuple. ``html_table`` is an HTML
        ``<table>`` string with one row per aligned word pair followed by
        one row per unaligned source token (``N/A`` / ``UNK``).
        ``pos_accuracy`` is the share of source tokens that were aligned,
        formatted as a percentage with two decimals (``"0.00%"`` when the
        source has no tokens).
    """
    sent_src, sent_tgt, align_words = get_alignment_mapping(
        source=source, target=target, model_name=model_name
    )
    postag_dict = get_postag_dict(target=target)

    # Loop-invariant: tokens found inside this string are labelled PUNC.
    punc = r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~"""

    # Collect fragments and join once instead of repeated string +=.
    # <tbody> is opened exactly once here so it always pairs with the
    # single </tbody> emitted at the end.
    parts = ['''
    <table>
        <thead>
            <tr>
                <th>Bangla</th>
                <th>English</th>
                <th>PoS Tags</th>
            </tr>
        </thead>
        <tbody>
    ''']

    # Track aligned source positions (not word strings) so that repeated
    # words are counted per occurrence.
    mapped_indices = set()
    for i, j in sorted(align_words):
        src_word = sent_src[i]
        tgt_word = sent_tgt[j]
        mapped_indices.add(i)
        if src_word in punc or tgt_word in punc:
            tag = "PUNC"
        else:
            # The tagger may tokenize differently from the aligner, so a
            # target word can be missing from the dict; report UNK rather
            # than raising KeyError.
            tag = postag_dict.get(tgt_word, "UNK")
        parts.append(_postag_row(src_word, tgt_word, tag))

    # Unaligned source tokens, in sentence order (deterministic output).
    unks = [
        word for idx, word in enumerate(sent_src)
        if idx not in mapped_indices
    ]
    parts.extend(_postag_row(word, "N/A", "UNK") for word in unks)

    parts.append('''
        </tbody>
    </table>
    ''')
    html_table = "".join(parts)

    total = len(sent_src)
    # Guard against an empty source sentence (would divide by zero).
    pos_accuracy = (total - len(unks)) / total if total else 0.0
    pos_accuracy = f"{pos_accuracy:0.2%}"

    return html_table, pos_accuracy
def select_pos_tagger(src, tgt, model_name, tagger):
    """Run the PoS pipeline with the tagger named by *tagger*.

    *model_name* is first passed through ``select_model``. *tagger* is then
    compared against the supported labels ``"spaCy"``, ``"NLTK"``,
    ``"Flair"`` and ``"TextBlob"``; on a match the corresponding tag
    function is handed to ``get_postag`` and its
    ``(result, pos_accuracy)`` pair is returned. For any other value the
    function returns ``(None, None)``.
    """
    resolved_model = select_model(model_name)

    # Label -> tag-dict builder, checked in the same order as the labels
    # are listed in the docstring.
    tagger_table = (
        ("spaCy", get_spacy_postag_dict),
        ("NLTK", get_nltk_postag_dict),
        ("Flair", get_flair_postag_dict),
        ("TextBlob", get_textblob_postag_dict),
    )

    for label, postag_fn in tagger_table:
        if tagger == label:
            result, pos_accuracy = get_postag(
                postag_fn,
                source=src,
                target=tgt,
                model_name=resolved_model,
            )
            return result, pos_accuracy

    # Unrecognised tagger name: nothing was computed.
    return None, None