import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the CyNER 2.0 checkpoint and build a token-classification (NER) pipeline.
path_to_checkpoint = 'PranavaKailash/CyNER-2.0-DeBERTa-v3-base'
tokenizer = AutoTokenizer.from_pretrained(path_to_checkpoint, use_fast=True, max_length=768)
model = AutoModelForTokenClassification.from_pretrained(path_to_checkpoint)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
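
# Note: without an aggregation_strategy, the pipeline emits one prediction per sub-token,
# each a dict with 'entity', 'score', 'index', 'word', 'start' and 'end' keys, e.g.
# (illustrative values only) {'entity': 'B-Malware', 'score': 0.99, 'index': 3,
#  'word': '▁Emotet', 'start': 10, 'end': 16}. perform_ner() below groups these by label
# and tag_sentence() merges adjacent sub-tokens back into readable spans.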


def tag_sentence(sentence, entities_dict):
    """
    Add HTML tags to entities for visualization.
    """
    # Flatten the per-label entity lists into one list sorted by start offset.
    all_entities = sorted(
        [(e['start'], e['end'], e['entity'], e['word']) for ents in entities_dict.values() for e in ents],
        key=lambda x: x[0]
    )

    # Merge consecutive sub-token predictions of the same type into single spans,
    # replacing the SentencePiece word-boundary marker ('▁') with a space.
    merged_entities = []
    current_entity = None

    for start, end, entity_type, word in all_entities:
        if current_entity is None:
            current_entity = [start, end, entity_type, word]
        else:
            if start == current_entity[1] and entity_type == current_entity[2] and entity_type.startswith('I-'):
                current_entity[1] = end
                current_entity[3] += word.replace('▁', ' ')
            else:
                merged_entities.append(tuple(current_entity))
                current_entity = [start, end, entity_type, word]

    if current_entity:
        merged_entities.append(tuple(current_entity))

    # Rebuild the sentence, wrapping each merged span in coloured entity tags.
    tagged_sentence = ""
    last_idx = 0
    for start, end, entity_type, _ in merged_entities:
        tagged_sentence += sentence[last_idx:start]
        entity_tag = entity_type.replace('I-', 'B-')
        # Escape the angle brackets so the label is rendered as visible text
        # instead of being parsed as an (unknown) HTML element.
        tagged_sentence += (
            f"<span style='color:blue'>&lt;{entity_tag}&gt;</span>"
            f"{sentence[start:end]}"
            f"<span style='color:blue'>&lt;/{entity_tag}&gt;</span>"
        )
        last_idx = end

    tagged_sentence += sentence[last_idx:]
    return tagged_sentence
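
# Illustrative result (entity labels assumed, not taken from the model card): for input
# text containing "Emotet" predicted as B-Malware, the rendered sentence shows
# "<B-Malware>Emotet</B-Malware>" with the bracketed labels coloured blue.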


def perform_ner(text):
    """
    Run NER pipeline and prepare results for display.
    """
    entities = ner_pipeline(text)

    # Group the raw token-level predictions by their entity label.
    entities_dict = {}
    for entity in entities:
        entity_type = entity['entity']
        if entity_type not in entities_dict:
            entities_dict[entity_type] = []
        entities_dict[entity_type].append({
            "entity": entity['entity'],
            "score": float(entity['score']),  # cast numpy float so it serialises cleanly for st.json
            "index": entity['index'],
            "word": entity['word'],
            "start": entity['start'],
            "end": entity['end']
        })

    tagged_sentence = tag_sentence(text, entities_dict)
    return entities_dict, tagged_sentence
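
# Illustrative shape of the first return value (values assumed, not real model output):
#   {'B-Malware': [{'entity': 'B-Malware', 'score': 0.99, 'index': 3, 'word': '▁Emotet',
#                   'start': 10, 'end': 16}], ...}
# It is displayed verbatim via st.json below, while tagged_sentence is rendered with
# st.markdown(..., unsafe_allow_html=True).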


# Streamlit UI
st.title("CyNER 2.0 - Named Entity Recognition")
st.write("Enter text to get named entity recognition results.")

input_text = st.text_area("Input Text", placeholder="Type your text here...")

if st.button("Analyze"):
    if input_text.strip():
        entities_dict, tagged_sentence = perform_ner(input_text)

        st.subheader("Tagged Entities")
        st.markdown(tagged_sentence, unsafe_allow_html=True)

        st.subheader("Entities and Details")
        st.json(entities_dict)
    else:
        st.warning("Please enter some text for analysis.")