import nltk
from nltk.tokenize import word_tokenize
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

nltk.download("punkt", quiet = True)  # word_tokenize needs the NLTK 'punkt' data

# use the @st.cache decorator to cache the model: it is large, so we do not
# want to reload it on every rerun of the script.
# allow_output_mutation = True tells Streamlit not to hash the cached objects
# on each access, so they are loaded once and reused as singletons.
@st.cache(allow_output_mutation = True)
def load_model():
    # load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("yeshpanovrustem/xlm-roberta-large-ner-kazakh")
    model = AutoModelForTokenClassification.from_pretrained("yeshpanovrustem/xlm-roberta-large-ner-kazakh")
    return tokenizer, model

tokenizer, model = load_model()
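# note: on newer Streamlit releases st.cache is deprecated; if you upgrade,
# the closest modern equivalent for this pattern would be @st.cache_resource
# (a suggestion, not something this script uses)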

labels_dict = {0: 'O', 
               1: 'B-ADAGE', 
               2: 'I-ADAGE', 
               3: 'B-ART', 
               4: 'I-ART', 
               5: 'B-CARDINAL', 
               6: 'I-CARDINAL', 
               7: 'B-CONTACT', 
               8: 'I-CONTACT', 
               9: 'B-DATE', 
               10: 'I-DATE', 
               11: 'B-DISEASE', 
               12: 'I-DISEASE', 
               13: 'B-EVENT', 
               14: 'I-EVENT', 
               15: 'B-FACILITY', 
               16: 'I-FACILITY', 
               17: 'B-GPE', 
               18: 'I-GPE', 
               19: 'B-LANGUAGE', 
               20: 'I-LANGUAGE', 
               21: 'B-LAW', 
               22: 'I-LAW', 
               23: 'B-LOCATION', 
               24: 'I-LOCATION', 
               25: 'B-MISCELLANEOUS', 
               26: 'I-MISCELLANEOUS', 
               27: 'B-MONEY', 
               28: 'I-MONEY', 
               29: 'B-NON_HUMAN', 
               30: 'I-NON_HUMAN', 
               31: 'B-NORP', 
               32: 'I-NORP', 
               33: 'B-ORDINAL', 
               34: 'I-ORDINAL', 
               35: 'B-ORGANISATION', 
               36: 'I-ORGANISATION', 
               37: 'B-PERSON', 
               38: 'I-PERSON', 
               39: 'B-PERCENTAGE', 
               40: 'I-PERCENTAGE', 
               41: 'B-POSITION', 
               42: 'I-POSITION', 
               43: 'B-PRODUCT', 
               44: 'I-PRODUCT', 
               45: 'B-PROJECT', 
               46: 'I-PROJECT', 
               47: 'B-QUANTITY', 
               48: 'I-QUANTITY', 
               49: 'B-TIME', 
               50: 'I-TIME'}
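
# note: Hugging Face token-classification checkpoints usually expose the same
# mapping as model.config.id2label; the explicit dict above is kept so the app
# does not depend on the checkpoint's config being populated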

# load the NER pipeline (an alternative single-call interface; the code below
# runs the model manually so predictions can be aligned with the nltk tokens)
nlp = pipeline("ner", model = model, tokenizer = tokenizer)
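# for reference, the pipeline can also be called directly, e.g. nlp(example);
# a transformers "ner" pipeline returns a list of dicts with keys such as
# 'entity', 'score', 'index', 'word', 'start' and 'end'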
# "The Republic of Kazakhstan is a state located in Eastern Europe and Central Asia."
example = "Қазақстан Республикасы — Шығыс Еуропа мен Орталық Азияда орналасқан мемлекет."

# tokenize into words with nltk first, so that subword predictions can be
# mapped back onto whole words afterwards
single_sentence_tokens = word_tokenize(example)
tokenized_input = tokenizer(single_sentence_tokens, is_split_into_words = True, return_tensors = "pt")
with torch.no_grad():  # inference only, no gradients needed
    output = model(**tokenized_input).logits
predictions = torch.argmax(output, dim = 2)
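# output has shape [batch_size = 1, sequence_length, num_labels], so the argmax
# over the last dimension picks the most likely label id for each subword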

# convert label IDs to label names, keeping one label per word: special tokens
# ([CLS]/[SEP]) have a word id of None, and subword pieces share the word id of
# their parent word, so only the first piece of each word contributes a label
word_ids = tokenized_input.word_ids(batch_index = 0)
previous_word_id = None
labels = []
for word_id, prediction in zip(word_ids, predictions[0].numpy()):
    if word_id is not None and word_id != previous_word_id:
        labels.append(labels_dict[prediction])
    previous_word_id = word_id

assert len(single_sentence_tokens) == len(labels), "Mismatch between input token and label sizes!"

for token, label in zip(single_sentence_tokens, labels):
    print(token, label)
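
# A minimal sketch (not part of the original script) of how the BIO labels
# could be merged into entity spans for display; merge_bio is a hypothetical
# helper introduced here for illustration only
def merge_bio(tokens, labels):
    spans = []
    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            spans.append([label[2:], [token]])            # start a new entity
        elif label.startswith("I-") and spans and spans[-1][0] == label[2:]:
            spans[-1][1].append(token)                    # extend the current entity
    return [(entity, " ".join(words)) for entity, words in spans]

# e.g. merge_bio(single_sentence_tokens, labels) might yield something like
# [('GPE', 'Қазақстан Республикасы'), ('LOCATION', 'Шығыс Еуропа')]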



# Streamlit UI scaffolding, not yet enabled:
# st.set_page_config(page_title = "Kazakh Named Entity Recognition", page_icon = "🔍")
# st.title("🔍 Kazakh Named Entity Recognition")
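
# A possible wiring of the pieces above into the UI (a sketch, assuming the app
# should label user input; variable names here are illustrative, not original):
#
# st.title("🔍 Kazakh Named Entity Recognition")
# user_text = st.text_area("Enter Kazakh text:", value = example)
# if st.button("Label"):
#     user_tokens = word_tokenize(user_text)
#     encoded = tokenizer(user_tokens, is_split_into_words = True, return_tensors = "pt")
#     with torch.no_grad():
#         logits = model(**encoded).logits
#     ids = torch.argmax(logits, dim = 2)[0].numpy()
#     word_ids = encoded.word_ids(batch_index = 0)
#     previous, word_labels = None, []
#     for word_id, pred in zip(word_ids, ids):
#         if word_id is not None and word_id != previous:
#             word_labels.append(labels_dict[pred])
#         previous = word_id
#     st.table({"token": user_tokens, "label": word_labels})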