File size: 2,859 Bytes

0255e9b

import torch
import orjson
from transformers import pipeline
from transformers import BertTokenizerFast, AutoTokenizer

# Tokenizer is loaded by name, separately from the local model directory.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

# labels.json maps stringified integer ids to label names; keys are converted
# to int for lookup by numeric id.  The file is read as bytes so decoding does
# not depend on the platform's default text encoding (orjson accepts bytes).
with open("labels.json", "rb") as f:
    id_to_label = {int(k): v for k, v in orjson.loads(f.read()).items()}

# Token-classification pipeline over the model stored in ./model; device 0 is
# requested when CUDA is available, otherwise -1.
nlp = pipeline(
    "ner",
    model="./model",
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

def _close_entity(entity, entities):
    """Convert the summed score of *entity* to a mean and append it to *entities*."""
    entity["score"] /= entity["token_count"]
    entities.append(entity)


def _extend_entity(entity, token):
    """Stretch *entity* to the end of *token* and add the token's score to its sum."""
    entity["ranges"][1] = token["end"]
    entity["token_count"] += 1
    entity["score"] += token["score"]


def get_entities(tokens, labels=None):
    """Merge per-token predictions into entity spans using their B-/I- labels.

    Args:
        tokens: Iterable of dicts with the keys ``"entity"``, ``"start"``,
            ``"end"`` and ``"score"``.  The numeric label id is parsed from
            ``token["entity"]`` after dropping its first six characters
            (presumably a ``"LABEL_"`` prefix -- confirm against the model).
        labels: Optional mapping from integer label id to label name.  When
            omitted, the module-level ``id_to_label`` mapping is used.

    Returns:
        A list of dicts, one per entity, with the keys ``"label"`` (label name
        without its ``B-``/``I-`` prefix), ``"ranges"`` (``[start, end]``),
        ``"score"`` (mean of the member tokens' scores) and ``"token_count"``.

    Raises:
        KeyError: If a parsed label id is missing from the label mapping.
        ValueError: If the text after the six-character prefix is not an integer.
    """
    if labels is None:
        labels = id_to_label  # resolved at call time from module scope

    entities = []
    entity = None  # the entity currently being built, if any
    for token in tokens:
        label = labels[int(token["entity"][6:])]
        if label.startswith("B-"):
            # A B- tag always starts a new entity, closing any open one first.
            if entity is not None:
                _close_entity(entity, entities)
            entity = {
                "label": label[2:],
                "ranges": [token["start"], token["end"]],
                "score": token["score"],
                "token_count": 1,
            }
        elif label.startswith("I-"):
            # An I- tag with no open entity is dropped.
            if entity is None:
                continue
            _extend_entity(entity, token)
            # An I- tag of a different type is still absorbed into the open
            # entity, but that entity is closed immediately afterwards.
            if entity["label"] != label[2:]:
                _close_entity(entity, entities)
                entity = None
        else:
            # Any other label ends the open entity.
            if entity is not None:
                _close_entity(entity, entities)
            entity = None
    if entity is not None:
        _close_entity(entity, entities)
    return entities

def process(text):
    """Run the NER pipeline on *text* and print the entities and summary stats.

    For each entity returned by ``get_entities`` one line is printed with the
    matched substring of *text*, the label and the mean score as a percentage.
    Afterwards the mean score over all tokens, the number of entities and a
    separator line are printed.

    Args:
        text: The string passed to the ``nlp`` pipeline.
    """
    nlp_output = nlp(text)
    entities = get_entities(nlp_output)
    for entity in entities:
        start, end = entity["ranges"]
        print(f"{text[start:end]:<35} {entity['label']:>15} {entity['score'] * 100:.2f}%")
    # Guard against an empty pipeline result, which would otherwise raise
    # ZeroDivisionError when computing the mean.
    if nlp_output:
        average_score = sum(token["score"] for token in nlp_output) / len(nlp_output)
    else:
        average_score = "n/a"
    print("Average Score: ", average_score)
    print("Labels Found: ", len(entities))
    print("-" * 70)

if __name__ == "__main__":
    # Run the built-in sample addresses first.
    examples = [
        "Osmangazi Mahallesi, Hoca Ahmet Yesevi Cd. No:34, 16050 Osmangazi/Bursa",
        "Karşıyaka Mahallesi, Mavişehir Caddesi No: 91, Daire 4, 35540 Karşıyaka/İzmir",
        "Selçuklu Mahallesi, Atatürk Bulvarı No: 55, 42050 Selçuklu/Konya",
        "Alsancak Mahallesi, 1475. Sk. No:3, 35220 Konak/İzmir",
        "Kocatepe Mahallesi, Yaşam Caddesi 3. Sokak No:4, 06420 Bayrampaşa/İstanbul",
    ]
    for example in examples:
        print(example)
        process(example)
    # Interactive loop: an empty line, end of input or Ctrl-C at the prompt
    # ends the session without a traceback.
    while True:
        try:
            text = input("Enter text: ")
        except (EOFError, KeyboardInterrupt):
            print()  # finish the prompt line before exiting
            break
        if not text:
            break
        process(text)