# Spaces: Running  (page-status text captured along with the file; not program code)
import argparse | |
import csv | |
import json | |
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification | |
# Checkpoint identifier handed to both ``from_pretrained`` calls below.
MODEL = "d4data/biomedical-ner-all"

# Tokenizer and token-classification model are loaded once, at import time.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)

# Shared NER pipeline used by process(). process() reads "entity_group",
# "word", "score", "start" and "end" from each result, so the aggregated
# output format is what the rest of the file relies on.
pipe = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
def process(*args):
    """Run the NER pipeline over a CSV of notes and write the results as JSON.

    Command-line options (both required):

    * ``--notes``: path of the input CSV; must end in ``.csv``. Each row is
      read through its ``text``, ``score``, ``student_id`` and ``case``
      columns.
    * ``--out``: path of the output file; must end in ``.json``.

    The output is a JSON list with one object per CSV row, holding the row's
    ``score``, ``student_id`` and ``case`` plus an ``entities`` list built
    from the pipeline results for that row's ``text``.

    Args:
        *args: Command-line tokens to parse, e.g.
            ``process("--notes", "in.csv", "--out", "out.json")``. When no
            tokens are passed, ``sys.argv[1:]`` is parsed instead.

    Raises:
        ValueError: If ``--notes`` does not end in ``.csv`` or ``--out`` does
            not end in ``.json``.
        KeyError: If the CSV header lacks one of the columns listed above.
        SystemExit: Raised by ``argparse`` when a required option is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--notes", help="Notes CSV", required=True)
    parser.add_argument("--out", help="Output", required=True)
    # Parse the tokens this function was actually given. An empty tuple is
    # turned into None so argparse falls back to sys.argv[1:], which keeps
    # the script entry point and bare ``process()`` calls working as before.
    options = parser.parse_args(args or None)

    notes_path = options.notes
    out_path = options.out

    # Check both paths before any inference work is done.
    if not notes_path.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")
    if not out_path.endswith(".json"):
        raise ValueError("Output path must be a .json file.")

    processed = []
    # newline="" leaves line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed as a single value.
    with open(notes_path, "r", newline="") as f:
        for row in csv.DictReader(f):
            # `pipe` is the module-level NER pipeline.
            raw = pipe(row["text"])
            # The note text itself is not copied into the output record.
            processed.append(
                {
                    "score": row["score"],
                    "student_id": row["student_id"],
                    "case": row["case"],
                    "entities": [
                        {
                            "entity": x["entity_group"],
                            "word": x["word"],
                            # Plain float rounded to two decimals.
                            "score": round(float(x["score"]), 2),
                            "start": x["start"],
                            "end": x["end"],
                        }
                        for x in raw
                    ],
                }
            )

    # Write all records as a single JSON array.
    with open(out_path, "w") as f:
        json.dump(processed, f)
if __name__ == "__main__":
    # Script entry point: hand the command-line tokens (program name
    # excluded) to process() as positional arguments.
    import sys

    cli_tokens = sys.argv[1:]
    process(*cli_tokens)