"""Demo script: run a token-classification ("ner") pipeline over address strings.

The pipeline is built from the model stored in ``./model`` together with the
``dbmdz/bert-base-turkish-cased`` tokenizer. Token-level predictions are merged
into entity spans and printed, first for a few built-in examples and then for
text typed at an interactive prompt.
"""
import torch
import orjson
from transformers import pipeline
from transformers import BertTokenizerFast, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
nlp = pipeline(
    "ner",
    model="./model",
    tokenizer=tokenizer,
    # First CUDA device when one is available, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
)


def _absorb_token(entity, token):
    """Extend *entity* so that it also covers *token* and accumulate its score."""
    entity["ranges"][1] = token["end"]
    entity["token_count"] += 1
    entity["score"] += token["score"]


def _close_entity(entity, entities):
    """Convert the summed score of *entity* to a mean and append it to *entities*."""
    entity["score"] /= entity["token_count"]
    entities.append(entity)


def get_entities(tokens):
    """Merge B-/I- tagged token predictions into entity spans.

    Args:
        tokens: Iterable of dicts, each providing ``"entity"`` (a tag such as
            ``"B-XXX"`` / ``"I-XXX"``), ``"start"``, ``"end"`` and ``"score"``.

    Returns:
        A list of dicts with the keys ``"label"`` (tag without its two-character
        prefix), ``"ranges"`` (``[start, end]`` taken from the first and last
        merged token), ``"score"`` (mean of the merged token scores) and
        ``"token_count"``.
    """
    entities = []
    entity = None
    for token in tokens:
        tag = token["entity"]
        if tag.startswith("B-"):
            # A new entity begins; flush whichever one was still open.
            if entity:
                _close_entity(entity, entities)
            entity = {
                "label": tag[2:],
                "ranges": [token["start"], token["end"]],
                "score": token["score"],
                "token_count": 1,
            }
        elif tag.startswith("I-"):
            if not entity:
                # Continuation tag without an open entity: the token is dropped.
                continue
            same_label = entity["label"] == tag[2:]
            _absorb_token(entity, token)
            if not same_label:
                # NOTE(review): a continuation token whose label differs is still
                # folded into the open entity, which is then closed. This keeps
                # the existing behaviour -- confirm it is intended.
                _close_entity(entity, entities)
                entity = None
        else:
            # Any other tag ends the open entity, if there is one.
            if entity:
                _close_entity(entity, entities)
                entity = None
    if entity:
        _close_entity(entity, entities)
    return entities


def process(text):
    """Run the pipeline on *text* and print the entities with summary figures.

    Prints one line per merged entity (covered substring, label, score as a
    percentage), then the mean token score, the number of entities and a
    separator line.

    Args:
        text: The string to analyse.
    """
    nlp_output = nlp(text)
    entities = get_entities(nlp_output)
    for entity in entities:
        start, end = entity["ranges"]
        print(f"{text[start:end]:<35} {entity['label']:>15} {entity['score'] * 100:.2f}%")
    # The pipeline may return no tokens at all (e.g. for whitespace-only text);
    # report 0.0 instead of dividing by zero.
    if nlp_output:
        average_score = sum(token["score"] for token in nlp_output) / len(nlp_output)
    else:
        average_score = 0.0
    print("Average Score: ", average_score)
    print("Labels Found: ", len(entities))
    print("-" * 70)


if __name__ == "__main__":
    examples = [
        "Osmangazi Mahallesi, Hoca Ahmet Yesevi Cd. No:34, 16050 Osmangazi/Bursa",
        "Karşıyaka Mahallesi, Mavişehir Caddesi No: 91, Daire 4, 35540 Karşıyaka/İzmir",
        "Selçuklu Mahallesi, Atatürk Bulvarı No: 55, 42050 Selçuklu/Konya",
        "Alsancak Mahallesi, 1475. Sk. No:3, 35220 Konak/İzmir",
        "Kocatepe Mahallesi, Yaşam Caddesi 3. Sokak No:4, 06420 Bayrampaşa/İstanbul",
    ]
    for example in examples:
        print(example)
        process(example)

    # Interactive loop: an empty line (or end of input) stops the script.
    while True:
        try:
            text = input("Enter text: ")
        except EOFError:
            break
        if not text:
            break
        process(text)