wing-nus kirinzhu committed on
Commit
6b1a980
1 Parent(s): 827202a

Upload dataset_extraction.py (#11)

Browse files

- Upload dataset_extraction.py (3037a7c96c513edd7a68b8e32bdd57bdad5a64d4)


Co-authored-by: Linxiao Zhu <kirinzhu@users.noreply.huggingface.co>

Files changed (1) hide show
  1. dataset_extraction.py +44 -0
dataset_extraction.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ import torch
3
+ import nltk
4
+ from SciAssist import DatasetExtraction
5
+
6
# Device label chosen from CUDA availability.
# NOTE(review): `device` is not passed to DatasetExtraction below in the
# visible code -- confirm whether the pipeline is meant to receive it.
device = "gpu" if torch.cuda.is_available() else "cpu"
# Single pipeline instance built at import time and shared by the handlers
# in this module.
# NOTE(review): os_name="nt" presumably selects Windows-style handling inside
# SciAssist -- verify against the SciAssist documentation and the deployment OS.
de_pipeline = DatasetExtraction(os_name="nt")
8
+
9
+
10
def de_for_str(input):
    """Run dataset-mention extraction on a raw text string.

    The text is split into sentences with ``nltk.sent_tokenize`` and the
    sentence list is passed to the shared ``de_pipeline`` with ``type="str"``
    and ``save_results=False``.

    Args:
        input: The text to analyse.

    Returns:
        A flat list of 2-tuples. For every entry of
        ``results["dataset_mentions"]`` the list holds
        ``(entry[0], entry[1])`` followed by a separator tuple whose first
        item is two newline characters and whose second item is ``None``.
    """
    sentences = nltk.sent_tokenize(input)
    results = de_pipeline.extract(sentences, type="str", save_results=False)

    # Tuples are immutable, so one separator object can be reused per entry.
    separator = ("\n\n", None)
    highlighted = []
    for pair in results["dataset_mentions"]:
        highlighted.extend(((pair[0], pair[1]), separator))
    return highlighted
24
+
25
def de_for_file(input):
    """Run dataset-mention extraction on an uploaded ``.txt`` or ``.pdf`` file.

    Args:
        input: An object exposing the file path as ``input.name``, or ``None``
            when no file was supplied.

    Returns:
        ``None`` when ``input`` is ``None``.
        ``[("File Format Error !", None)]`` when the file extension is neither
        ``.txt`` nor ``.pdf`` (compared case-insensitively).
        Otherwise a flat list of 2-tuples: for every entry of
        ``results["dataset_mentions"]`` it holds ``(entry[0], entry[1])``
        followed by a separator tuple whose first item is two newline
        characters and whose second item is ``None``.
    """
    # Identity check: `== None` would defer to a custom __eq__ on the object.
    if input is None:
        return None
    filename = input.name

    # Choose the parser from the extension. Lower-casing first means names
    # such as "PAPER.PDF" are accepted instead of reported as a format error.
    lowered = filename.lower()
    if lowered.endswith(".txt"):
        file_type = "txt"
    elif lowered.endswith(".pdf"):
        file_type = "pdf"
    else:
        return [("File Format Error !", None)]

    # The original path (with its original casing) is what the pipeline opens.
    results = de_pipeline.extract(filename, type=file_type, save_results=False)

    separator = ("\n\n", None)
    output = []
    for mention_pair in results["dataset_mentions"]:
        output.append((mention_pair[0], mention_pair[1]))
        output.append(separator)
    return output
42
+
43
+
44
# Sample sentence; presumably offered as example input for de_for_str -- confirm
# against the code that consumes it.
de_str_example = "BAKIS incorporates information derived from the bank balance sheets and supervisory reports of all German banks ."