Spaces:

andrewgleave
/

note-ner-demo

Running

note-ner-demo / process.py

Add case to output

cfb3ccc over 1 year ago

1.87 kB

	import argparse
	import csv
	import json

	from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

	# Identifier handed to both `from_pretrained` calls below.
	MODEL = "d4data/biomedical-ner-all"

	# NOTE: these run at import time, so importing this module loads the
	# tokenizer and the token-classification model before anything else happens.
	tokenizer = AutoTokenizer.from_pretrained(MODEL)
	model = AutoModelForTokenClassification.from_pretrained(MODEL)

	# Shared NER pipeline called once per CSV row by `process`.
	# NOTE(review): aggregation_strategy="simple" is presumably what makes each
	# result carry the `entity_group` key that `process` reads — confirm against
	# the transformers token-classification pipeline documentation.
	pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


	def process(*args):
	    """Run NER over every note in a CSV file and write the results as JSON.

	    Command-line style arguments are parsed from ``args``. When no
	    arguments are passed, argparse falls back to ``sys.argv[1:]``, so a
	    bare ``process()`` call behaves as it always did.

	    Recognised options:
	        --notes  Path to the input CSV; must end in ``.csv``. Each row is
	                 read for its ``text``, ``score``, ``student_id`` and
	                 ``case`` columns.
	        --out    Path to the output file; must end in ``.json``.

	    The output file holds a JSON list with one object per CSV row: the
	    row's ``score``, ``student_id`` and ``case`` values plus an
	    ``entities`` list. Each entity has ``entity`` (the pipeline's
	    ``entity_group``), ``word``, ``score`` (rounded to two decimals),
	    ``start`` and ``end``.

	    Raises:
	        ValueError: if either path has the wrong file extension.
	        KeyError: if a CSV row lacks one of the expected columns.
	        SystemExit: raised by argparse when a required option is missing.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--notes", help="Notes CSV", required=True)
	    parser.add_argument("--out", help="Output", required=True)
	    # Honour the arguments the caller actually passed; previously they were
	    # ignored and sys.argv was always parsed. Passing None keeps the
	    # sys.argv fallback for callers that pass nothing.
	    options = parser.parse_args(list(args) if args else None)

	    notes_path = options.notes
	    out_path = options.out

	    # Check both paths before reading anything so a bad output path fails
	    # immediately rather than after every note has been run through the model.
	    if not notes_path.endswith(".csv"):
	        raise ValueError("Filepath must be a .csv file.")
	    if not out_path.endswith(".json"):
	        raise ValueError("Output path must be a .json file.")

	    processed = []
	    # newline="" is required by the csv module so that line breaks inside
	    # quoted fields are parsed correctly; the explicit encoding avoids
	    # depending on the platform's locale default.
	    with open(notes_path, "r", newline="", encoding="utf-8") as notes_file:
	        for row in csv.DictReader(notes_file):
	            entities = [
	                {
	                    "entity": found["entity_group"],
	                    "word": found["word"],
	                    # float() yields a plain Python float before rounding;
	                    # presumably the pipeline returns a NumPy scalar that
	                    # json cannot serialise — TODO confirm.
	                    "score": round(float(found["score"]), 2),
	                    "start": found["start"],
	                    "end": found["end"],
	                }
	                for found in pipe(row["text"])
	            ]
	            # NOTE: the note text itself is not copied into the output record.
	            processed.append(
	                {
	                    "score": row["score"],
	                    "student_id": row["student_id"],
	                    "case": row["case"],
	                    "entities": entities,
	                }
	            )

	    # Write all records in one go once every row has been processed.
	    with open(out_path, "w", encoding="utf-8") as out_file:
	        json.dump(processed, out_file)


	if __name__ == "__main__":
	    # Script entry point: forward everything after the program name to
	    # process() as individual positional arguments.
	    import sys

	    cli_arguments = sys.argv[1:]
	    process(*cli_arguments)