IndicNERdupe

Sleeping

App Files Files Community

Shanks0465 committed on Jan 4, 2023

Commit

d9ae7cd

•

1 Parent(s): 33f37c9

Added app.py

Browse files

Files changed (2) hide show

app.py +44 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
+model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")
+def get_ner(sentence):
+    tok_sentence = tokenizer(sentence, return_tensors='pt')
+    with torch.no_grad():
+        logits = model(**tok_sentence).logits.argmax(-1)
+        predicted_tokens_classes = [
+            model.config.id2label[t.item()] for t in logits[0]]
+        predicted_labels = []
+        previous_token_id = 0
+        word_ids = tok_sentence.word_ids()
+        for word_index in range(len(word_ids)):
+            if word_ids[word_index] == None:
+                previous_token_id = word_ids[word_index]
+            elif word_ids[word_index] == previous_token_id:
+                previous_token_id = word_ids[word_index]
+            else:
+                predicted_labels.append(predicted_tokens_classes[word_index])
+                previous_token_id = word_ids[word_index]
+        ner_output = []
+        for index in range(len(sentence.split(' '))):
+            ner_output.append(
+                (sentence.split(' ')[index], predicted_labels[index]))
+        return ner_output
+iface = gr.Interface(get_ner,
+                     gr.Textbox(placeholder="Enter sentence here..."),
+                     ["highlight"], examples=['लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं', 'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು'], title='IndicNER',
+                     article='IndicNER is a model trained to complete the task of identifying named entities from sentences in Indian languages. Our model is specifically fine-tuned to the 11 Indian languages mentioned above over millions of sentences. The model is then benchmarked over a human annotated testset and multiple other publicly available Indian NER datasets. The 11 languages covered by IndicNER are: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Oriya, Punjabi, Tamil, Telugu.'
+                     )
+iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers
+torch
+sentencepiece==0.1.95
+datasets
+seqeval