address-extraction / predict.py
duoquote
Add address extraction functionality using BERT model
0255e9b
raw
history blame
2.86 kB
import torch
import orjson
from transformers import pipeline
from transformers import BertTokenizerFast, AutoTokenizer
# Tokenizer of the Turkish cased BERT checkpoint; handed to the pipeline below.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

# labels.json maps stringified class ids to tag strings; the keys are turned
# into ints so a numeric class id can index the table directly.
# The file is read as bytes: orjson parses UTF-8 bytes natively, so the result
# no longer depends on the platform's default text encoding.
with open("labels.json", "rb") as f:
    id_to_label = {int(k): v for k, v in orjson.loads(f.read()).items()}

# "ner" pipeline over the weights in ./model. device=0 selects the first CUDA
# device when one is available; -1 keeps inference on the CPU.
nlp = pipeline(
    "ner",
    model="./model",
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
def get_entities(tokens, labels=None):
    """Merge per-token predictions into labelled entity spans.

    Args:
        tokens: Iterable of dicts with the keys ``"entity"``, ``"start"``,
            ``"end"`` and ``"score"``. ``"entity"`` must be a string whose
            characters after the first six parse as an integer class id
            (presumably ``"LABEL_<id>"`` -- confirm against the model config).
        labels: Optional mapping from integer class id to a tag string.
            Tags starting with ``"B-"`` open an entity, tags starting with
            ``"I-"`` continue one, anything else is treated as "outside".
            Defaults to the module-level ``id_to_label`` table.

    Returns:
        A list of dicts, in order of appearance, each with ``"label"`` (tag
        without its two-character prefix), ``"ranges"`` (``[start, end]``
        offsets), ``"score"`` (mean of the merged token scores) and
        ``"token_count"`` (number of merged tokens).

    Raises:
        KeyError: If a token lacks a required key or its class id is not
            present in ``labels``.
        ValueError: If the class id part of ``"entity"`` is not an integer.
    """
    if labels is None:
        # Resolved at call time so the module-level table remains the default.
        labels = id_to_label

    entities = []
    entity = None

    for token in tokens:
        # Skip the six-character prefix and look the numeric id up.
        label = labels[int(token["entity"][6:])]

        if label.startswith("B-"):
            # A begin tag always terminates whatever was open before it.
            _close_entity(entity, entities)
            entity = {
                "label": label[2:],
                "ranges": [token["start"], token["end"]],
                "score": token["score"],
                "token_count": 1,
            }
        elif label.startswith("I-"):
            if entity is None:
                # Continuation tag with nothing open: the token is dropped.
                continue
            _extend_entity(entity, token)
            if entity["label"] != label[2:]:
                # NOTE(review): a continuation tag of a *different* type is
                # still absorbed into the open entity, which is then closed.
                # Kept as-is because it looks deliberate; confirm intent.
                _close_entity(entity, entities)
                entity = None
        else:
            # Any other tag ends the open entity, if there is one.
            _close_entity(entity, entities)
            entity = None

    # Flush an entity that runs up to the last token.
    _close_entity(entity, entities)
    return entities


def _close_entity(entity, entities):
    """Turn the accumulated score into a mean and append the entity, if any."""
    if entity is not None:
        entity["score"] /= entity["token_count"]
        entities.append(entity)


def _extend_entity(entity, token):
    """Grow the open entity so it also covers ``token``."""
    entity["ranges"][1] = token["end"]
    entity["token_count"] += 1
    entity["score"] += token["score"]
def process(text):
    """Run the NER pipeline on ``text`` and print a report to stdout.

    The report lists every merged entity (its text slice, label and mean
    score as a percentage), then the mean score over all raw tokens, the
    number of entities found and a separator line.

    Args:
        text: The string to analyse. Entity offsets are used to slice it.
    """
    nlp_output = nlp(text)
    entities = get_entities(nlp_output)

    for entity in entities:
        start, end = entity["ranges"]
        print(f"{text[start:end]:<35} {entity['label']:>15} {entity['score'] * 100:.2f}%")

    if nlp_output:
        average = sum(token["score"] for token in nlp_output) / len(nlp_output)
        print("Average Score: ", average)
    else:
        # The pipeline produced no tokens; there is nothing to average, and
        # dividing by len() would raise ZeroDivisionError.
        print("Average Score: ", "n/a")

    print("Labels Found: ", len(entities))
    print("-" * 70)
if __name__ == "__main__":
    # Built-in sample addresses, each echoed and then analysed.
    examples = [
        "Osmangazi Mahallesi, Hoca Ahmet Yesevi Cd. No:34, 16050 Osmangazi/Bursa",
        "Karşıyaka Mahallesi, Mavişehir Caddesi No: 91, Daire 4, 35540 Karşıyaka/İzmir",
        "Selçuklu Mahallesi, Atatürk Bulvarı No: 55, 42050 Selçuklu/Konya",
        "Alsancak Mahallesi, 1475. Sk. No:3, 35220 Konak/İzmir",
        "Kocatepe Mahallesi, Yaşam Caddesi 3. Sokak No:4, 06420 Bayrampaşa/İstanbul",
    ]
    for example in examples:
        print(example)
        process(example)

    # Interactive loop: analyse each entered line until a blank one arrives.
    while True:
        try:
            text = input("Enter text: ")
        except EOFError:
            # End of stdin (Ctrl-D or exhausted pipe): finish the prompt line
            # and leave quietly instead of dying with a traceback.
            print()
            break
        if not text.strip():
            # Empty or whitespace-only input is the signal to stop.
            break
        process(text)