Spaces:
Runtime error
Runtime error
File size: 4,791 Bytes
407b426 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
"""
This module contains functions to get PoS tags using spaCy, NLTK, Flair, or TextBlob and return them as an HTML table
"""
from .alignment_mappers import get_alignment_mapping
from flair.models import SequenceTagger
from flair.data import Sentence
import spacy
from spacy.cli import download
download("en_core_web_sm")
import en_core_web_sm
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
def get_spacy_postag_dict(target=""):
    '''
    Tag ``target`` with the spaCy ``en_core_web_sm`` pipeline.

    Returns a dict mapping each token's text to its ``tag_`` attribute.
    Because the dict is keyed by token text, a repeated token keeps the
    tag of its last occurrence.
    '''
    pipeline = en_core_web_sm.load()
    return {tok.text: tok.tag_ for tok in pipeline(target)}
def get_nltk_postag_dict(target=""):
    '''
    Tag ``target`` with NLTK.

    The text is split with ``nltk.tokenize.word_tokenize`` and the tokens
    are passed to ``nltk.pos_tag``. Returns a dict mapping each token to
    its tag; a repeated token keeps the tag of its last occurrence.
    '''
    words = nltk.tokenize.word_tokenize(target)
    return {word: tag for word, tag in nltk.pos_tag(words)}
def get_flair_postag_dict(target=""):
    '''
    Tag ``target`` with the Flair "pos" sequence tagger.

    Returns a dict mapping each token's text to its ``tag`` attribute as
    set by ``tagger.predict``; a repeated token keeps the tag of its last
    occurrence.
    '''
    sentence = Sentence(target)
    # predict() annotates the sentence in place, so the tokens are read
    # from the same object afterwards.
    SequenceTagger.load("pos").predict(sentence)
    return {tok.text: tok.tag for tok in sentence}
def get_textblob_postag_dict(target=""):
    '''
    Tag ``target`` with TextBlob.

    Returns the ``tags`` pairs of a ``TextBlob`` built from ``target``,
    converted to a dict; a repeated word keeps the tag of its last
    occurrence.
    '''
    return dict(TextBlob(target).tags)
def get_postag(
        get_postag_dict,
        source="",
        target="",
        model_path="musfiqdehan/bn-en-word-aligner"):
    """Align two sentences and render the target-side PoS tags as an HTML table.

    The alignment comes from ``get_alignment_mapping``; the tags come from
    ``get_postag_dict``, which is called as ``get_postag_dict(target=target)``
    and must return a mapping from target token text to its tag.

    Args:
        get_postag_dict: Callable accepting a ``target`` keyword argument and
            returning a dict of token text -> PoS tag.
        source: Source sentence (shown in the "Bangla" column).
        target: Target sentence (shown in the "English" column).
        model_path: Passed through unchanged to ``get_alignment_mapping``.

    Returns:
        A ``(html_table, pos_accuracy)`` tuple. ``html_table`` is the table
        markup as a string. ``pos_accuracy`` is
        ``(len(sent_src) - number of distinct unaligned source words)
        / len(sent_src)`` formatted as a percentage with two decimals, or
        ``"0.00%"`` when the source has no tokens.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from html import escape

    sent_src, sent_tgt, align_words = get_alignment_mapping(
        source=source, target=target, model_path=model_path
    )
    postag_dict = get_postag_dict(target=target)

    # Loop-invariant, so it is defined once rather than on every iteration.
    punc = r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~"""

    def _row(src_cell, tgt_cell, tag_cell):
        # Cells are escaped: tokens such as "<", ">" and "&" (all present in
        # ``punc``) would otherwise be emitted as raw markup.
        return f'''
            <tr>
                <td> {escape(str(src_cell))} </td>
                <td> {escape(str(tgt_cell))} </td>
                <td> {escape(str(tag_cell))} </td>
            </tr>
        '''

    # <tbody> is opened exactly once here so it always pairs with the single
    # closing tag below, whether or not the sentence contains punctuation.
    parts = ['''
    <table>
        <thead>
            <tr>
                <th>Bangla</th>
                <th>English</th>
                <th>PoS Tags</th>
            </tr>
        </thead>
        <tbody>
    ''']

    mapped_src = set()
    for i, j in sorted(align_words):
        src_word, tgt_word = sent_src[i], sent_tgt[j]
        mapped_src.add(src_word)
        if src_word in punc or tgt_word in punc:
            tag = "PUNC"
        else:
            # An aligned target word missing from the tagger's dict (e.g. if
            # the tagger split the sentence differently) is shown as UNK
            # instead of raising KeyError.
            tag = postag_dict.get(tgt_word, "UNK")
        parts.append(_row(src_word, tgt_word, tag))

    # Distinct source words that never appeared in an alignment pair, kept
    # in sentence order so the table is deterministic.
    unks = [word for word in dict.fromkeys(sent_src)
            if word not in mapped_src]
    parts.extend(_row(word, "N/A", "UNK") for word in unks)

    parts.append('''
        </tbody>
    </table>
    ''')
    html_table = "".join(parts)

    total = len(sent_src)
    # Guard against an empty source sentence (division by zero).
    ratio = (total - len(unks)) / total if total else 0.0
    pos_accuracy = f"{ratio:0.2%}"

    return html_table, pos_accuracy
def select_pos_tagger(src, tgt, tagger):
    '''
    Run ``get_postag`` with the tag-dict builder named by ``tagger``.

    Recognised names are "spaCy", "NLTK", "Flair" and "TextBlob". Any other
    value yields ``(None, None)`` without calling a tagger. Otherwise the
    ``(result, pos_accuracy)`` pair from ``get_postag`` is returned.
    '''
    # Pick the builder first; the single get_postag call below is shared.
    if tagger == "spaCy":
        build_postag_dict = get_spacy_postag_dict
    elif tagger == "NLTK":
        build_postag_dict = get_nltk_postag_dict
    elif tagger == "Flair":
        build_postag_dict = get_flair_postag_dict
    elif tagger == "TextBlob":
        build_postag_dict = get_textblob_postag_dict
    else:
        return None, None

    table, accuracy = get_postag(
        build_postag_dict,
        source=src,
        target=tgt,
        model_path="musfiqdehan/bn-en-word-aligner",
    )
    return table, accuracy
|