andrewgleave committed on
Commit
38788ba
1 Parent(s): 557942e

Add simple process script

Browse files
Files changed (1) hide show
  1. process.py +61 -0
process.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import csv
import json

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Hugging Face model ID for a biomedical named-entity-recognition model.
MODEL = "d4data/biomedical-ner-all"

# Loaded at import time, so importing this module triggers the (possibly
# network-backed) model download on first run.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)

# "simple" aggregation merges word-piece tokens into whole-entity spans;
# the rest of this file reads entity_group/word/score/start/end from each
# returned dict.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
13
+
14
+
15
def process(*args):
    """Run biomedical NER over a CSV of notes and write the results to JSON.

    Args:
        *args: Optional command-line arguments (e.g. ``sys.argv[1:]``).
            When empty, argparse falls back to parsing ``sys.argv`` itself,
            preserving the original calling behavior.

    Raises:
        ValueError: If ``--notes`` is not a ``.csv`` path or ``--out`` is
            not a ``.json`` path.
        SystemExit: From argparse when required arguments are missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--notes', help='Notes CSV', required=True)
    parser.add_argument('--out', help='Output', required=True)
    # Bug fix: the original called parse_args() with no argument, so the
    # values forwarded via *args from the __main__ guard were silently
    # discarded and sys.argv was always re-parsed. Passing `list(args) or
    # None` honors explicit arguments while keeping the no-arg fallback.
    parsed = parser.parse_args(list(args) or None)

    filepath = parsed.notes
    outpath = parsed.out

    if not filepath.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")

    if not outpath.endswith(".json"):
        raise ValueError("Output path must be a .json file.")

    processed = []
    # newline="" is the documented way to open files handed to the csv
    # module (avoids mangled embedded newlines inside quoted fields).
    with open(filepath, "r", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # NOTE(review): assumes the CSV has "text" and "score" columns —
            # confirm against the producer of the notes file.
            text = row["text"]
            raw = pipe(text)
            ner_content = {
                "text": text,
                "score": row["score"],
                "entities": [
                    {
                        "entity": x["entity_group"],
                        "word": x["word"],
                        # Pipeline scores are numpy floats, which json.dump
                        # cannot serialize; cast to a plain float.
                        "score": float(x["score"]),
                        "start": x["start"],
                        "end": x["end"],
                    }
                    for x in raw
                ],
            }
            processed.append(ner_content)

    # Write the accumulated results as a single JSON array.
    with open(outpath, "w") as f:
        json.dump(processed, f)
56
+
57
+
58
if __name__ == "__main__":
    import sys

    # Script entry point: forward CLI arguments to process().
    # NOTE(review): process() calls parser.parse_args() with no argument,
    # which re-reads sys.argv itself — the forwarded values are currently
    # ignored; verify this is intentional.
    process(*sys.argv[1:])