|
import torch |
|
import orjson |
|
from transformers import pipeline |
|
from transformers import BertTokenizerFast, AutoTokenizer |
|
|
|
# Tokenizer for the Turkish cased BERT checkpoint; handed to the pipeline below.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

# labels.json holds a JSON object mapping label ids to label names. JSON object
# keys are always strings, so they are converted to int for lookup by id.
# The file is read as bytes: orjson parses UTF-8 bytes directly, which avoids
# decoding the file with a platform-dependent default text encoding.
with open("labels.json", "rb") as f:
    id_to_label = {int(k): v for k, v in orjson.loads(f.read()).items()}

# Token-classification ("ner") pipeline over the model stored in ./model.
# Uses the first CUDA device when one is available, otherwise device -1.
nlp = pipeline(
    "ner",
    model="./model",
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
|
|
|
def _finish_entity(entity, entities):
    """Convert the entity's summed score into a mean and append it to *entities*."""
    entity["score"] /= entity["token_count"]
    entities.append(entity)


def get_entities(tokens, label_map=None):
    """Merge token-level predictions into entity spans using B-/I- tags.

    Args:
        tokens: Iterable of dicts with the keys ``"entity"``, ``"start"``,
            ``"end"`` and ``"score"``. The numeric label id is read from
            ``token["entity"]`` after dropping its first six characters.
        label_map: Optional mapping from int label id to label name. When
            omitted, the module-level ``id_to_label`` mapping is used.

    Returns:
        A list of dicts, one per entity, with the keys ``"label"`` (label name
        without its ``B-``/``I-`` prefix), ``"ranges"`` (``[start, end]``),
        ``"score"`` (mean of the member token scores) and ``"token_count"``.

    Raises:
        KeyError: If a label id is missing from the mapping.
        ValueError: If the text after the six-character prefix is not an int.
    """
    if label_map is None:
        label_map = id_to_label

    entities = []
    entity = None  # Entity currently being assembled, or None.

    for token in tokens:
        # Drop the six-character prefix (presumably "LABEL_") to get the id.
        label = label_map[int(token["entity"][6:])]

        if label.startswith("B-"):
            # A begin tag closes any open entity and starts a new one.
            if entity is not None:
                _finish_entity(entity, entities)
            entity = {
                "label": label[2:],
                "ranges": [token["start"], token["end"]],
                "score": token["score"],
                "token_count": 1,
            }
        elif label.startswith("I-"):
            # An inside tag with no open entity is dropped.
            if entity is None:
                continue
            entity["ranges"][1] = token["end"]
            entity["token_count"] += 1
            entity["score"] += token["score"]
            # NOTE(review): an I- tag of a different type is still absorbed
            # into the open entity (above) before the entity is closed here.
            # Confirm this is the intended handling of mismatched tags.
            if entity["label"] != label[2:]:
                _finish_entity(entity, entities)
                entity = None
        elif entity is not None:
            # Any other label ends the open entity.
            _finish_entity(entity, entities)
            entity = None

    # Flush an entity that is still open at the end of the input.
    if entity is not None:
        _finish_entity(entity, entities)
    return entities
|
|
|
def process(text):
    """Run the NER pipeline on *text* and print a summary of the result.

    Prints one line per entity (matched substring, label, mean score as a
    percentage), then the average score over all returned tokens, the number
    of entities found, and a separator line.

    Args:
        text: The string to analyse.

    Returns:
        None. All output goes to stdout.
    """
    nlp_output = nlp(text)
    entities = get_entities(nlp_output)

    for entity in entities:
        start, end = entity["ranges"]
        print(f"{text[start:end]:<35} {entity['label']:>15} {entity['score'] * 100:.2f}%")

    # The pipeline can return no tokens (e.g. whitespace-only input); avoid
    # dividing by zero when there is nothing to average.
    if nlp_output:
        average_score = sum(token["score"] for token in nlp_output) / len(nlp_output)
    else:
        average_score = "n/a"
    print("Average Score: ", average_score)
    print("Labels Found: ", len(entities))
    print("-" * 70)
|
|
|
if __name__ == "__main__":
    # Sample Turkish postal addresses used as a quick demonstration run.
    examples = [
        "Osmangazi Mahallesi, Hoca Ahmet Yesevi Cd. No:34, 16050 Osmangazi/Bursa",
        "Karşıyaka Mahallesi, Mavişehir Caddesi No: 91, Daire 4, 35540 Karşıyaka/İzmir",
        "Selçuklu Mahallesi, Atatürk Bulvarı No: 55, 42050 Selçuklu/Konya",
        "Alsancak Mahallesi, 1475. Sk. No:3, 35220 Konak/İzmir",
        "Kocatepe Mahallesi, Yaşam Caddesi 3. Sokak No:4, 06420 Bayrampaşa/İstanbul",
    ]
    for example in examples:
        print(example)
        process(example)

    # Interactive loop: an empty line ends the session.
    while True:
        try:
            text = input("Enter text: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin (Ctrl-D / piped input) or Ctrl-C at the prompt:
            # leave the loop instead of dying with a traceback.
            print()
            break
        if not text:
            break
        process(text)