spaces init
Files changed:
- LICENSE +674 -0
- README.md +267 -13
- app.py +58 -0
- arxiv_public_data/__init__.py +0 -0
- arxiv_public_data/__pycache__/__init__.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/config.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/fulltext.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc +0 -0
- arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc +0 -0
- arxiv_public_data/authors.py +469 -0
- arxiv_public_data/config.py +55 -0
- arxiv_public_data/embeddings/__init__.py +0 -0
- arxiv_public_data/embeddings/tf_hub.py +185 -0
- arxiv_public_data/embeddings/util.py +151 -0
- arxiv_public_data/fixunicode.py +108 -0
- arxiv_public_data/fulltext.py +349 -0
- arxiv_public_data/internal_citations.py +128 -0
- arxiv_public_data/oai_metadata.py +282 -0
- arxiv_public_data/pdfstamp.py +83 -0
- arxiv_public_data/regex_arxiv.py +195 -0
- arxiv_public_data/s3_bulk_download.py +397 -0
- arxiv_public_data/slice_pdfs.py +93 -0
- arxiv_public_data/tex2utf.py +206 -0
- logo.png +0 -0
- requirements.txt +22 -0
- setup.py +89 -0
- src/Auto_Research.egg-info/PKG-INFO +313 -0
- src/Auto_Research.egg-info/SOURCES.txt +10 -0
- src/Auto_Research.egg-info/dependency_links.txt +2 -0
- src/Auto_Research.egg-info/entry_points.txt +2 -0
- src/Auto_Research.egg-info/requires.txt +24 -0
- src/Auto_Research.egg-info/top_level.txt +1 -0
- src/Surveyor.py +1518 -0
- src/__pycache__/Surveyor.cpython-310.pyc +0 -0
- src/__pycache__/defaults.cpython-310.pyc +0 -0
- src/defaults.py +20 -0
- src/packages.txt +0 -0
- survey.py +72 -0
- tests/__init__.py +0 -0
- tests/__pycache__/__init__.cpython-310.pyc +0 -0
- tests/__pycache__/test_survey_files.cpython-310-pytest-7.1.2.pyc +0 -0
- tests/test_survey_files.py +10 -0
LICENSE
ADDED
@@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007

Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.

[Verbatim text of the GNU GPL v3 follows in the added file: the Preamble, the Terms and Conditions (sections 0-17), and "How to Apply These Terms to Your New Programs" -- 674 lines in total, as published at <https://www.gnu.org/licenses/gpl-3.0.txt>.]
README.md
CHANGED
@@ -1,13 +1,267 @@
# Auto-Research
![Auto-Research][logo]

[logo]: https://github.com/sidphbot/Auto-Research/blob/main/logo.png
A no-code utility to generate a detailed, well-cited survey with topic-clustered sections (draft paper format) and other interesting artifacts from a single research query.

Data Provider: [arXiv](https://arxiv.org/) Open Archive Initiative (OAI)

Requirements:
- Python 3.7 or above
- poppler-utils - `sudo apt-get install build-essential libpoppler-cpp-dev pkg-config python-dev`
- the packages listed in requirements.txt - `cat requirements.txt | xargs pip install`
- 8 GB disk space
- 13 GB CUDA (GPU) memory - for a survey of 100 searched papers (`max_search`) and 25 selected papers (`num_papers`)

#### Demo:

Video Demo: https://drive.google.com/file/d/1-77J2L10lsW-bFDOGdTaPzSr_utY743g/view?usp=sharing

Kaggle re-usable Demo: https://www.kaggle.com/sidharthpal/auto-research-generate-survey-from-query

(`[TIP]` click 'edit and run' to run the demo for your custom queries on a free GPU)


#### Steps to run (pip coming soon):
```
apt install -y poppler-utils libpoppler-cpp-dev
git clone https://github.com/sidphbot/Auto-Research.git

cd Auto-Research/
pip install -r requirements.txt
python survey.py [options] <your_research_query>
```

#### Artifacts generated (zipped):
- Detailed survey draft paper as a txt file
- A curated list of the top 25+ papers as PDFs and txts
- Images extracted from the above papers as jpegs, bmps, etc.
- Heading/section-wise highlights extracted from the above papers as a re-usable pure-python joblib dump
- Tables extracted from the papers (optional)
- Corpus of metadata highlights/text of the top 100 papers as a re-usable pure-python joblib dump (a loading sketch follows this list)

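The two joblib dumps above are plain Python objects and can be reloaded in any later session. A minimal sketch, assuming you have unzipped the output; the dump file names used here are illustrative placeholders, not fixed names:

```python
import joblib

# Assumed paths inside the unzipped output directory -- the actual dump file
# names produced by a run may differ; point these at the dump files you find there.
highlights = joblib.load("arxiv_dumps/section_highlights.dump")
corpus = joblib.load("arxiv_dumps/metadata_corpus.dump")

# Both are ordinary Python containers, so they can be inspected directly.
print(type(highlights), type(corpus))
```
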
## Example run #1 - python utility

```
python survey.py 'multi-task representation learning'
```

## Example run #2 - python class

```
from survey import Surveyor
mysurveyor = Surveyor()
mysurveyor.survey('quantum entanglement')
```

### Research tools:

These are independent tools for your research or document text handling needs (a short usage sketch follows the list below).

*[Tip]*: models can be changed in the defaults or passed in during init along with `refresh_models=True`.

- `abstractive_summary` - takes a long text document (`string`) and returns a 1-paragraph abstract or "abstractive" summary (`string`)

    Input: `longtext` : string

    Returns: `summary` : string

- `extractive_summary` - takes a long text document (`string`) and returns a 1-paragraph "extractive" summary of extracted highlights (`string`)

    Input: `longtext` : string

    Returns: `summary` : string

- `generate_title` - takes a long text document (`string`) and returns a generated title (`string`)

    Input: `longtext` : string

    Returns: `title` : string

- `extractive_highlights` - takes a long text document (`string`) and returns a list of extracted highlights (`[string]`), a list of keywords (`[string]`) and key phrases (`[string]`)

    Input: `longtext` : string

    Returns: `highlights` : [string], `keywords` : [string], `keyphrases` : [string]

- `extract_images_from_file` - takes a pdf file name (`string`) and returns a list of image filenames (`[string]`)

    Input: `pdf_file` : string

    Returns: `images_files` : [string]

- `extract_tables_from_file` - takes a pdf file name (`string`) and returns a list of csv filenames (`[string]`)

    Input: `pdf_file` : string

    Returns: `images_files` : [string]

- `cluster_lines` - takes a list of lines (`[string]`) and returns the topic-clustered sections (`dict(generated_title: [cluster_abstract])`) and clustered lines (`dict(cluster_id: [cluster_lines])`)

    Input: `lines` : [string]

    Returns: `sections` : dict(generated_title: [cluster_abstract]), `clusters` : dict(cluster_id: [cluster_lines])

- `extract_headings` - *[for scientific texts - assumes an 'abstract' heading is present]* takes a text file name (`string`) and returns a list of headings (`[string]`) and refined lines (`[string]`)

    `[Tip 1]` : Use `extract_sections` as a wrapper (e.g. `extract_sections(extract_headings("/path/to/textfile"))`) to get heading-wise sectioned text with refined lines instead (`dict(heading: text)`)

    `[Tip 2]` : write the word 'abstract' at the start of the file text to get an extraction for non-scientific texts as well!

    Input: `text_file` : string

    Returns: `refined` : [string], `headings` : [string], `sectioned_doc` : dict(heading: text) (optional - wrapper case)

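For illustration, a minimal sketch of calling a few of the tools above. It assumes the tools are exposed as methods of a `Surveyor` instance (consistent with the init tip and Example run #2); the input paths are placeholders:

```python
from survey import Surveyor  # import path as used in Example run #2

surveyor = Surveyor()

# Placeholder input -- any long text document works.
long_text = open("arxiv_data/fulltext/some_paper.txt").read()

summary = surveyor.abstractive_summary(long_text)            # 1-paragraph abstractive summary
title = surveyor.generate_title(long_text)                   # generated title
highlights, keywords, keyphrases = surveyor.extractive_highlights(long_text)

# PDF-based helpers take a file name and return lists of extracted file names.
image_files = surveyor.extract_images_from_file("arxiv_data/tarpdfs/some_paper.pdf")

# Topic clustering over raw lines.
sections, clusters = surveyor.cluster_lines(long_text.splitlines())
```
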
155 |
+
## Access/Modify defaults:
|
156 |
+
|
157 |
+
- inside code
|
158 |
+
```
|
159 |
+
from survey.Surveyor import DEFAULTS
|
160 |
+
from pprint import pprint
|
161 |
+
|
162 |
+
pprint(DEFAULTS)
|
163 |
+
```
|
164 |
+
or,
|
165 |
+
|
166 |
+
- Modify static config file - `defaults.py`
|
167 |
+
|
168 |
+
or,
|
169 |
+
|
170 |
+
- At runtime (utility)
|
171 |
+
|
172 |
+
```
|
173 |
+
python survey.py --help
|
174 |
+
```
|
175 |
+
```
|
176 |
+
usage: survey.py [-h] [--max_search max_metadata_papers]
|
177 |
+
[--num_papers max_num_papers] [--pdf_dir pdf_dir]
|
178 |
+
[--txt_dir txt_dir] [--img_dir img_dir] [--tab_dir tab_dir]
|
179 |
+
[--dump_dir dump_dir] [--models_dir save_models_dir]
|
180 |
+
[--title_model_name title_model_name]
|
181 |
+
[--ex_summ_model_name extractive_summ_model_name]
|
182 |
+
[--ledmodel_name ledmodel_name]
|
183 |
+
[--embedder_name sentence_embedder_name]
|
184 |
+
[--nlp_name spacy_model_name]
|
185 |
+
[--similarity_nlp_name similarity_nlp_name]
|
186 |
+
[--kw_model_name kw_model_name]
|
187 |
+
[--refresh_models refresh_models] [--high_gpu high_gpu]
|
188 |
+
query_string
|
189 |
+
|
190 |
+
Generate a survey just from a query !!
|
191 |
+
|
192 |
+
positional arguments:
|
193 |
+
query_string your research query/keywords
|
194 |
+
|
195 |
+
optional arguments:
|
196 |
+
-h, --help show this help message and exit
|
197 |
+
--max_search max_metadata_papers
|
198 |
+
maximium number of papers to gaze at - defaults to 100
|
199 |
+
--num_papers max_num_papers
|
200 |
+
maximium number of papers to download and analyse -
|
201 |
+
defaults to 25
|
202 |
+
--pdf_dir pdf_dir pdf paper storage directory - defaults to
|
203 |
+
arxiv_data/tarpdfs/
|
204 |
+
--txt_dir txt_dir text-converted paper storage directory - defaults to
|
205 |
+
arxiv_data/fulltext/
|
206 |
+
--img_dir img_dir image storage directory - defaults to
|
207 |
+
arxiv_data/images/
|
208 |
+
--tab_dir tab_dir tables storage directory - defaults to
|
209 |
+
arxiv_data/tables/
|
210 |
+
--dump_dir dump_dir all_output_dir - defaults to arxiv_dumps/
|
211 |
+
--models_dir save_models_dir
|
212 |
+
directory to save models (> 5GB) - defaults to
|
213 |
+
saved_models/
|
214 |
+
--title_model_name title_model_name
|
215 |
+
title model name/tag in hugging-face, defaults to
|
216 |
+
'Callidior/bert2bert-base-arxiv-titlegen'
|
217 |
+
--ex_summ_model_name extractive_summ_model_name
|
218 |
+
extractive summary model name/tag in hugging-face,
|
219 |
+
defaults to 'allenai/scibert_scivocab_uncased'
|
220 |
+
--ledmodel_name ledmodel_name
|
221 |
+
led model(for abstractive summary) name/tag in
|
222 |
+
hugging-face, defaults to 'allenai/led-
|
223 |
+
large-16384-arxiv'
|
224 |
+
--embedder_name sentence_embedder_name
|
225 |
+
sentence embedder name/tag in hugging-face, defaults
|
226 |
+
to 'paraphrase-MiniLM-L6-v2'
|
227 |
+
--nlp_name spacy_model_name
|
228 |
+
spacy model name/tag in hugging-face (if changed -
|
229 |
+
needs to be spacy-installed prior), defaults to
|
230 |
+
'en_core_sci_scibert'
|
231 |
+
--similarity_nlp_name similarity_nlp_name
|
232 |
+
spacy downstream model(for similarity) name/tag in
|
233 |
+
hugging-face (if changed - needs to be spacy-installed
|
234 |
+
prior), defaults to 'en_core_sci_lg'
|
235 |
+
--kw_model_name kw_model_name
|
236 |
+
keyword extraction model name/tag in hugging-face,
|
237 |
+
defaults to 'distilbert-base-nli-mean-tokens'
|
238 |
+
--refresh_models refresh_models
|
239 |
+
Refresh model downloads with given names (needs
|
240 |
+
atleast one model name param above), defaults to False
|
241 |
+
--high_gpu high_gpu High GPU usage permitted, defaults to False
|
242 |
+
|
243 |
+
```
|
244 |
+
|
245 |
+
- At runtime (code)

> during surveyor object initialization with `surveyor_obj = Surveyor()`
- `pdf_dir`: String, pdf paper storage directory - defaults to `arxiv_data/tarpdfs/`
- `txt_dir`: String, text-converted paper storage directory - defaults to `arxiv_data/fulltext/`
- `img_dir`: String, image storage directory - defaults to `arxiv_data/images/`
- `tab_dir`: String, tables storage directory - defaults to `arxiv_data/tables/`
- `dump_dir`: String, directory for all generated outputs - defaults to `arxiv_dumps/`
- `models_dir`: String, directory to save the large models (> 5GB) - defaults to `saved_models/`
- `title_model_name`: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
- `ex_summ_model_name`: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
- `ledmodel_name`: String, LED model (for abstractive summary) name/tag in hugging-face, defaults to `allenai/led-large-16384-arxiv`
- `embedder_name`: String, sentence embedder name/tag in hugging-face, defaults to `paraphrase-MiniLM-L6-v2`
- `nlp_name`: String, spacy model name/tag in hugging-face (if changed - needs to be spacy-installed beforehand), defaults to `en_core_sci_scibert`
- `similarity_nlp_name`: String, spacy downstream trained model (for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed beforehand), defaults to `en_core_sci_lg`
- `kw_model_name`: String, keyword extraction model name/tag in hugging-face, defaults to `distilbert-base-nli-mean-tokens`
- `high_gpu`: Bool, high GPU usage permitted, defaults to `False`
- `refresh_models`: Bool, refresh model downloads with the given names (needs at least one model name param above), defaults to `False`

> during survey generation with `surveyor_obj.survey(query="my_research_query")` (a combined usage sketch follows this list)
- `max_search`: int, maximum number of papers to gaze at - defaults to `100`
- `num_papers`: int, maximum number of papers to download and analyse - defaults to `25`

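Putting the two together, a minimal programmatic sketch could look like the following; the query string and argument values are illustrative only, and the `Surveyor` import path mirrors the one referenced in `app.py`:

```python
from src.Surveyor import Surveyor

# Constructor arguments are optional and fall back to the defaults listed above.
surveyor_obj = Surveyor(models_dir='saved_models/', high_gpu=False)

# survey() returns the path of the zipped results and of the generated survey
# file, as consumed by app.py below.
zip_file_name, survey_file_name = surveyor_obj.survey(
    "survey generation from research keywords",  # hypothetical query
    max_search=100,
    num_papers=25,
)
print(zip_file_name, survey_file_name)
```
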
app.py
ADDED
@@ -0,0 +1,58 @@
import streamlit as st
import pandas as pd
import numpy as np

#from src.Surveyor import Surveyor

def run_survey(surveyor, research_keywords, max_search, num_papers):
    zip_file_name, survey_file_name = surveyor.survey(research_keywords,
                                                      max_search=max_search,
                                                      num_papers=num_papers
                                                      )

    with open(str(zip_file_name), "rb") as file:
        btn = st.download_button(
            label="Download extracted topic-clustered-highlights, images and tables as zip",
            data=file,
            file_name=str(zip_file_name)
        )

    with open(str(survey_file_name), "rb") as file:
        btn = st.download_button(
            label="Download detailed generated survey file",
            data=file,
            file_name=str(survey_file_name)  # offer the survey under its own file name
        )
        st.write(file.readlines())


def survey_space():

    st.title('Automated Survey generation from research keywords - Auto-Research V0.1')

    form = st.sidebar.form(key='survey_form')
    research_keywords = form.text_input("What would you like to research in today?")
    max_search = form.number_input("num_papers_to_search", help="maximum number of papers to glance through - defaults to 20",
                                   min_value=1, max_value=60, value=20, step=1, key='max_search')
    num_papers = form.number_input("num_papers_to_select", help="maximum number of papers to select and analyse - defaults to 8",
                                   min_value=1, max_value=25, value=8, step=1, key='num_papers')
    submit = form.form_submit_button('Submit')

    if submit:
        st.write("hello")
        #if surveyor_obj is None:
        #    surveyor_obj = Surveyor()
        #run_survey(surveyor_obj, research_keywords, max_search, num_papers)


if __name__ == '__main__':
    global surveyor_obj
    surveyor_obj = None
    survey_space()
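For local testing the app can typically be launched with `streamlit run app.py`; note that in this initial commit the `Surveyor` import and the `run_survey` call are still commented out, so submitting the form only writes a placeholder message.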
arxiv_public_data/__init__.py
ADDED
File without changes
|
arxiv_public_data/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (148 Bytes)
|
|
arxiv_public_data/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.44 kB)
|
|
arxiv_public_data/__pycache__/fixunicode.cpython-310.pyc
ADDED
Binary file (2.46 kB)
|
|
arxiv_public_data/__pycache__/fulltext.cpython-310.pyc
ADDED
Binary file (8.32 kB)
|
|
arxiv_public_data/__pycache__/internal_citations.cpython-310.pyc
ADDED
Binary file (4.27 kB)
|
|
arxiv_public_data/__pycache__/pdfstamp.cpython-310.pyc
ADDED
Binary file (1.73 kB)
|
|
arxiv_public_data/__pycache__/regex_arxiv.cpython-310.pyc
ADDED
Binary file (4.4 kB)
|
|
arxiv_public_data/authors.py
ADDED
@@ -0,0 +1,469 @@
1 |
+
# https://github.com/arXiv/arxiv-base@32e6ad0
|
2 |
+
"""
|
3 |
+
Copyright 2017 Cornell University
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6 |
+
this software and associated documentation files (the "Software"), to deal in
|
7 |
+
the Software without restriction, including without limitation the rights to
|
8 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
9 |
+
of the Software, and to permit persons to whom the Software is furnished to do
|
10 |
+
so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
22 |
+
"""
|
23 |
+
|
24 |
+
"""Parse Authors lines to extract author and affiliation data."""
|
25 |
+
import re
|
26 |
+
import os
|
27 |
+
import gzip
|
28 |
+
import json
|
29 |
+
from itertools import dropwhile
|
30 |
+
from typing import Dict, Iterator, List, Tuple
|
31 |
+
from multiprocessing import Pool, cpu_count
|
32 |
+
|
33 |
+
from arxiv_public_data.tex2utf import tex2utf
|
34 |
+
from arxiv_public_data.config import LOGGER, DIR_OUTPUT
|
35 |
+
|
36 |
+
logger = LOGGER.getChild('authorsplit')
|
37 |
+
|
38 |
+
PREFIX_MATCH = 'van|der|de|la|von|del|della|da|mac|ter|dem|di|vaziri'
|
39 |
+
|
40 |
+
"""
|
41 |
+
Takes data from an Author: line in the current arXiv abstract
|
42 |
+
file and returns a structured set of data:
|
43 |
+
|
44 |
+
author_list_ptr = [
|
45 |
+
[ author1_keyname, author1_firstnames, author1_suffix, affil1, affil2 ] ,
|
46 |
+
[ author2_keyname, author2_firstnames, author1_suffix, affil1 ] ,
|
47 |
+
[ author3_keyname, author3_firstnames, author1_suffix ]
|
48 |
+
]
|
49 |
+
|
50 |
+
Abstracted from Dienst software for OAI1 and other uses. This
|
51 |
+
routine should just go away when a better metadata structure is
|
52 |
+
adopted that deals with names and affiliations properly.
|
53 |
+
|
54 |
+
Must remember that there is at least one person one the archive
|
55 |
+
who has only one name, this should clearly be considered the key name.
|
56 |
+
|
57 |
+
Code originally written by Christina Scovel, Simeon Warner Dec99/Jan00
|
58 |
+
2000-10-16 - separated.
|
59 |
+
2000-12-07 - added support for suffix
|
60 |
+
2003-02-14 - get surname prefixes from arXiv::Filters::Index [Simeon]
|
61 |
+
2007-10-01 - created test script, some tidying [Simeon]
|
62 |
+
2018-05-25 - Translated from Perl to Python [Brian C.]
|
63 |
+
"""
|
64 |
+
|
65 |
+
|
66 |
+
def parse_author_affil(authors: str) -> List[List[str]]:
|
67 |
+
"""
|
68 |
+
Parse author line and returns an list of author and affiliation data.
|
69 |
+
|
70 |
+
The list for each author will have at least three elements for
|
71 |
+
keyname, firstname(s) and suffix. The keyname will always have content
|
72 |
+
but the other strings might be empty strings if there is no firstname
|
73 |
+
or suffix. Any additional elements after the first three are affiliations,
|
74 |
+
there may be zero or more.
|
75 |
+
|
76 |
+
Handling of prefix "XX collaboration" etc. is duplicated here and in
|
77 |
+
arXiv::HTML::AuthorLink -- it shouldn't be. Likely should just be here.
|
78 |
+
|
79 |
+
This routine is just a wrapper around the two parts that first split
|
80 |
+
the authors line into parts, and then back propagate the affiliations.
|
81 |
+
The first part is to be used along for display where we do not want
|
82 |
+
to back propagate affiliation information.
|
83 |
+
|
84 |
+
:param authors: string of authors from abs file or similar
|
85 |
+
:return:
|
86 |
+
Returns a structured set of data:
|
87 |
+
author_list_ptr = [
|
88 |
+
[ author1_keyname, author1_firstnames, author1_suffix, affil1, affil2 ],
|
89 |
+
[ author2_keyname, author2_firstnames, author1_suffix, affil1 ] ,
|
90 |
+
[ author3_keyname, author3_firstnames, author1_suffix ]
|
91 |
+
]
|
92 |
+
"""
|
93 |
+
return _parse_author_affil_back_propagate(
|
94 |
+
**_parse_author_affil_split(authors))
|
95 |
+
|
96 |
+
|
97 |
+
def _parse_author_affil_split(author_line: str) -> Dict:
|
98 |
+
"""
|
99 |
+
Split author line into author and affiliation data.
|
100 |
+
|
101 |
+
Take author line, tidy spacing and punctuation, and then split up into
|
102 |
+
individual author an affiliation data. Has special cases to avoid splitting
|
103 |
+
an initial collaboration name and records in $back_propagate_affiliation_to
|
104 |
+
the fact that affiliations should not be back propagated to collaboration
|
105 |
+
names.
|
106 |
+
|
107 |
+
Does not handle multiple collaboration names.
|
108 |
+
"""
|
109 |
+
if not author_line:
|
110 |
+
return {'author_list': [], 'back_prop': 0}
|
111 |
+
|
112 |
+
names: List[str] = split_authors(author_line)
|
113 |
+
if not names:
|
114 |
+
return {'author_list': [], 'back_prop': 0}
|
115 |
+
|
116 |
+
names = _remove_double_commas(names)
|
117 |
+
# get rid of commas at back
|
118 |
+
namesIter: Iterator[str] = reversed(
|
119 |
+
list(dropwhile(lambda x: x == ',', reversed(names))))
|
120 |
+
# get rid of commas at front
|
121 |
+
names = list(dropwhile(lambda x: x == ',', namesIter))
|
122 |
+
|
123 |
+
# Extract all names (all parts not starting with comma or paren)
|
124 |
+
names = list(map(_tidy_name, filter(
|
125 |
+
lambda x: re.match('^[^](,]', x), names)))
|
126 |
+
names = list(filter(lambda n: not re.match(
|
127 |
+
r'^\s*et\.?\s+al\.?\s*', n, flags=re.IGNORECASE), names))
|
128 |
+
|
129 |
+
(names, author_list,
|
130 |
+
back_propagate_affiliations_to) = _collaboration_at_start(names)
|
131 |
+
|
132 |
+
(enumaffils) = _enum_collaboration_at_end(author_line)
|
133 |
+
|
134 |
+
# Split name into keyname and firstnames/initials.
|
135 |
+
# Deal with different patterns in turn: prefixes, suffixes, plain
|
136 |
+
# and single name.
|
137 |
+
patterns = [('double-prefix',
|
138 |
+
r'^(.*)\s+(' + PREFIX_MATCH + r')\s(' +
|
139 |
+
PREFIX_MATCH + r')\s(\S+)$'),
|
140 |
+
('name-prefix-name',
|
141 |
+
r'^(.*)\s+(' + PREFIX_MATCH + r')\s(\S+)$'),
|
142 |
+
('name-name-prefix',
|
143 |
+
r'^(.*)\s+(\S+)\s(I|II|III|IV|V|Sr|Jr|Sr\.|Jr\.)$'),
|
144 |
+
('name-name',
|
145 |
+
r'^(.*)\s+(\S+)$'), ]
|
146 |
+
|
147 |
+
# Now go through names in turn and try to get affiliations
|
148 |
+
# to go with them
|
149 |
+
for name in names:
|
150 |
+
pattern_matches = ((mtype, re.match(m, name, flags=re.IGNORECASE))
|
151 |
+
for (mtype, m) in patterns)
|
152 |
+
|
153 |
+
(mtype, match) = next(((mtype, m)
|
154 |
+
for (mtype, m) in pattern_matches
|
155 |
+
if m is not None), ('default', None))
|
156 |
+
if match is None:
|
157 |
+
author_entry = [name, '', '']
|
158 |
+
elif mtype == 'double-prefix':
|
159 |
+
s = '{} {} {}'.format(match.group(
|
160 |
+
2), match.group(3), match.group(4))
|
161 |
+
author_entry = [s, match.group(1), '']
|
162 |
+
elif mtype == 'name-prefix-name':
|
163 |
+
s = '{} {}'.format(match.group(2), match.group(3))
|
164 |
+
author_entry = [s, match.group(1), '']
|
165 |
+
elif mtype == 'name-name-prefix':
|
166 |
+
author_entry = [match.group(2), match.group(1), match.group(3)]
|
167 |
+
elif mtype == 'name-name':
|
168 |
+
author_entry = [match.group(2), match.group(1), '']
|
169 |
+
else:
|
170 |
+
author_entry = [name, '', '']
|
171 |
+
|
172 |
+
# search back in author_line for affiliation
|
173 |
+
author_entry = _add_affiliation(
|
174 |
+
author_line, enumaffils, author_entry, name)
|
175 |
+
author_list.append(author_entry)
|
176 |
+
|
177 |
+
return {'author_list': author_list,
|
178 |
+
'back_prop': back_propagate_affiliations_to}
|
179 |
+
|
180 |
+
|
181 |
+
def parse_author_affil_utf(authors: str) -> List:
|
182 |
+
"""
|
183 |
+
Call parse_author_affil() and do TeX to UTF conversion.
|
184 |
+
|
185 |
+
Output structure is the same but should be in UTF and not TeX.
|
186 |
+
"""
|
187 |
+
if not authors:
|
188 |
+
return []
|
189 |
+
return list(map(lambda author: list(map(tex2utf, author)),
|
190 |
+
parse_author_affil(authors)))
|
191 |
+
|
192 |
+
|
193 |
+
def _remove_double_commas(items: List[str]) -> List[str]:
|
194 |
+
|
195 |
+
parts: List[str] = []
|
196 |
+
last = ''
|
197 |
+
for pt in items:
|
198 |
+
if pt == ',' and last == ',':
|
199 |
+
continue
|
200 |
+
else:
|
201 |
+
parts.append(pt)
|
202 |
+
last = pt
|
203 |
+
return parts
|
204 |
+
|
205 |
+
|
206 |
+
def _tidy_name(name: str) -> str:
|
207 |
+
name = re.sub(r'\s\s+', ' ', name) # also gets rid of CR
|
208 |
+
# add space after dot (except in TeX)
|
209 |
+
name = re.sub(r'(?<!\\)\.(\S)', r'. \g<1>', name)
|
210 |
+
return name
|
211 |
+
|
212 |
+
|
213 |
+
def _collaboration_at_start(names: List[str]) \
|
214 |
+
-> Tuple[List[str], List[List[str]], int]:
|
215 |
+
"""Perform special handling of collaboration at start."""
|
216 |
+
author_list = []
|
217 |
+
|
218 |
+
back_propagate_affiliations_to = 0
|
219 |
+
while len(names) > 0:
|
220 |
+
m = re.search(r'([a-z0-9\s]+\s+(collaboration|group|team))',
|
221 |
+
names[0], flags=re.IGNORECASE)
|
222 |
+
if not m:
|
223 |
+
break
|
224 |
+
|
225 |
+
# Add to author list
|
226 |
+
author_list.append([m.group(1), '', ''])
|
227 |
+
back_propagate_affiliations_to += 1
|
228 |
+
# Remove from names
|
229 |
+
names.pop(0)
|
230 |
+
# Also swallow and following comma or colon
|
231 |
+
if names and (names[0] == ',' or names[0] == ':'):
|
232 |
+
names.pop(0)
|
233 |
+
|
234 |
+
return names, author_list, back_propagate_affiliations_to
|
235 |
+
|
236 |
+
|
237 |
+
def _enum_collaboration_at_end(author_line: str)->Dict:
|
238 |
+
"""Get separate set of enumerated affiliations from end of author_line."""
|
239 |
+
# Now see if we have a separate set of enumerated affiliations
|
240 |
+
# This is indicated by finding '(\s*('
|
241 |
+
line_m = re.search(r'\(\s*\((.*)$', author_line)
|
242 |
+
if not line_m:
|
243 |
+
return {}
|
244 |
+
|
245 |
+
enumaffils = {}
|
246 |
+
affils = re.sub(r'\s*\)\s*$', '', line_m.group(1))
|
247 |
+
|
248 |
+
# Now expect to have '1) affil1 (2) affil2 (3) affil3'
|
249 |
+
for affil in affils.split('('):
|
250 |
+
# Now expect `1) affil1 ', discard if no match
|
251 |
+
m = re.match(r'^(\d+)\)\s*(\S.*\S)\s*$', affil)
|
252 |
+
if m:
|
253 |
+
enumaffils[m.group(1)] = re.sub(r'[\.,\s]*$', '', m.group(2))
|
254 |
+
|
255 |
+
return enumaffils
|
256 |
+
|
257 |
+
|
258 |
+
def _add_affiliation(author_line: str,
|
259 |
+
enumaffils: Dict,
|
260 |
+
author_entry: List[str],
|
261 |
+
name: str) -> List:
|
262 |
+
"""
|
263 |
+
Add author affiliation to author_entry if one is found in author_line.
|
264 |
+
|
265 |
+
This should deal with these cases
|
266 |
+
Smith B(labX) Smith B(1) Smith B(1, 2) Smith B(1 & 2) Smith B(1 and 2)
|
267 |
+
"""
|
268 |
+
en = re.escape(name)
|
269 |
+
namerex = r'{}\s*\(([^\(\)]+)'.format(en.replace(' ', 's*'))
|
270 |
+
m = re.search(namerex, author_line, flags=re.IGNORECASE)
|
271 |
+
if not m:
|
272 |
+
return author_entry
|
273 |
+
|
274 |
+
# Now see if we have enumerated references (just commas, digits, &, and)
|
275 |
+
affils = m.group(1).rstrip().lstrip()
|
276 |
+
affils = re.sub(r'(&|and)/,', ',', affils, flags=re.IGNORECASE)
|
277 |
+
|
278 |
+
if re.match(r'^[\d,\s]+$', affils):
|
279 |
+
for affil in affils.split(','):
|
280 |
+
if affil in enumaffils:
|
281 |
+
author_entry.append(enumaffils[affil])
|
282 |
+
else:
|
283 |
+
author_entry.append(affils)
|
284 |
+
|
285 |
+
return author_entry
|
286 |
+
|
287 |
+
|
288 |
+
def _parse_author_affil_back_propagate(author_list: List[List[str]],
|
289 |
+
back_prop: int) -> List[List[str]]:
|
290 |
+
"""Back propagate author affiliation.
|
291 |
+
|
292 |
+
Take the author list structure generated by parse_author_affil_split(..)
|
293 |
+
and propagate affiliation information backwards to preceeding author
|
294 |
+
entries where none was give. Stop before entry $back_prop to avoid
|
295 |
+
adding affiliation information to collaboration names.
|
296 |
+
|
297 |
+
given, eg:
|
298 |
+
a.b.first, c.d.second (affil)
|
299 |
+
implies
|
300 |
+
a.b.first (affil), c.d.second (affil)
|
301 |
+
and in more complex cases:
|
302 |
+
a.b.first, c.d.second (1), e.f.third, g.h.forth (2,3)
|
303 |
+
implies
|
304 |
+
a.b.first (1), c.d.second (1), e.f.third (2,3), g.h.forth (2,3)
|
305 |
+
"""
|
306 |
+
last_affil: List[str] = []
|
307 |
+
for x in range(len(author_list) - 1, max(back_prop - 1, -1), -1):
|
308 |
+
author_entry = author_list[x]
|
309 |
+
if len(author_entry) > 3: # author has affiliation,store
|
310 |
+
last_affil = author_entry
|
311 |
+
elif last_affil:
|
312 |
+
# author doesn't have affil but later one did => copy
|
313 |
+
author_entry.extend(last_affil[3:])
|
314 |
+
|
315 |
+
return author_list
|
316 |
+
|
317 |
+
|
318 |
+
def split_authors(authors: str) -> List:
|
319 |
+
"""
|
320 |
+
Split author string into authors entity lists.
|
321 |
+
|
322 |
+
Take an author line as a string and return a reference to a list of the
|
323 |
+
different name and affiliation blocks. While this does normalize spacing
|
324 |
+
and 'and', it is a key feature that the set of strings returned can be
|
325 |
+
concatenated to reproduce the original authors line. This code thus
|
326 |
+
provides a very graceful degredation for badly formatted authors lines, as
|
327 |
+
the text at least shows up.
|
328 |
+
"""
|
329 |
+
# split authors field into blocks with boundaries of ( and )
|
330 |
+
if not authors:
|
331 |
+
return []
|
332 |
+
aus = re.split(r'(\(|\))', authors)
|
333 |
+
aus = list(filter(lambda x: x != '', aus))
|
334 |
+
|
335 |
+
blocks = []
|
336 |
+
if len(aus) == 1:
|
337 |
+
blocks.append(authors)
|
338 |
+
else:
|
339 |
+
c = ''
|
340 |
+
depth = 0
|
341 |
+
for bit in aus:
|
342 |
+
if bit == '':
|
343 |
+
continue
|
344 |
+
if bit == '(': # track open parentheses
|
345 |
+
depth += 1
|
346 |
+
if depth == 1:
|
347 |
+
blocks.append(c)
|
348 |
+
c = '('
|
349 |
+
else:
|
350 |
+
c = c + bit
|
351 |
+
elif bit == ')': # track close parentheses
|
352 |
+
depth -= 1
|
353 |
+
c = c + bit
|
354 |
+
if depth == 0:
|
355 |
+
blocks.append(c)
|
356 |
+
c = ''
|
357 |
+
else: # haven't closed, so keep accumulating
|
358 |
+
continue
|
359 |
+
else:
|
360 |
+
c = c + bit
|
361 |
+
if c:
|
362 |
+
blocks.append(c)
|
363 |
+
|
364 |
+
listx = []
|
365 |
+
|
366 |
+
for block in blocks:
|
367 |
+
block = re.sub(r'\s+', ' ', block)
|
368 |
+
if re.match(r'^\(', block): # it is a comment
|
369 |
+
listx.append(block)
|
370 |
+
else: # it is a name
|
371 |
+
block = re.sub(r',?\s+(and|\&)\s', ',', block)
|
372 |
+
names = re.split(r'(,|:)\s*', block)
|
373 |
+
for name in names:
|
374 |
+
if not name:
|
375 |
+
continue
|
376 |
+
name = name.rstrip().lstrip()
|
377 |
+
if name:
|
378 |
+
listx.append(name)
|
379 |
+
|
380 |
+
# Recombine suffixes that were separated with a comma
|
381 |
+
parts: List[str] = []
|
382 |
+
for p in listx:
|
383 |
+
if re.match(r'^(Jr\.?|Sr\.?\[IV]{2,})$', p) \
|
384 |
+
and len(parts) >= 2 \
|
385 |
+
and parts[-1] == ',' \
|
386 |
+
and not re.match(r'\)$', parts[-2]):
|
387 |
+
separator = parts.pop()
|
388 |
+
last = parts.pop()
|
389 |
+
recomb = "{}{} {}".format(last, separator, p)
|
390 |
+
parts.append(recomb)
|
391 |
+
else:
|
392 |
+
parts.append(p)
|
393 |
+
|
394 |
+
return parts
|
395 |
+
|
396 |
+
def parse_authorline(authors: str) -> str:
|
397 |
+
"""
|
398 |
+
The external facing function from this module. Converts a complex authorline
|
399 |
+
into a simple one with only UTF-8.
|
400 |
+
|
401 |
+
Parameters
|
402 |
+
----------
|
403 |
+
authors : string
|
404 |
+
The raw author line from the metadata
|
405 |
+
|
406 |
+
Returns
|
407 |
+
-------
|
408 |
+
clean_authors : string
|
409 |
+
String represeting cleaned author line
|
410 |
+
|
411 |
+
Examples
|
412 |
+
--------
|
413 |
+
>>> parse_authorline('A. Losev, S. Shadrin, I. Shneiberg')
|
414 |
+
'Losev, A.; Shadrin, S.; Shneiberg, I.'
|
415 |
+
|
416 |
+
>>> parse_authorline("C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan")
|
417 |
+
'Balázs, C.; Berger, E. L.; Nadolsky, P. M.; Yuan, C. -P.'
|
418 |
+
|
419 |
+
>>> parse_authorline('Stephen C. Power (Lancaster University), Baruch Solel (Technion)')
|
420 |
+
'Power, Stephen C.; Solel, Baruch'
|
421 |
+
|
422 |
+
>>> parse_authorline("L. Scheck (1), H.-Th. Janka (1), T. Foglizzo (2), and K. Kifonidis (1)\n ((1) MPI for Astrophysics, Garching; (2) Service d'Astrophysique, CEA-Saclay)")
|
423 |
+
'Scheck, L.; Janka, H. -Th.; Foglizzo, T.; Kifonidis, K.'
|
424 |
+
"""
|
425 |
+
names = parse_author_affil_utf(authors)
|
426 |
+
return '; '.join([', '.join([q for q in n[:2] if q]) for n in names])
|
427 |
+
|
428 |
+
def _parse_article_authors(article_author):
|
429 |
+
try:
|
430 |
+
return [article_author[0], parse_author_affil_utf(article_author[1])]
|
431 |
+
except Exception as e:
|
432 |
+
msg = "Author split failed for article {}".format(article_author[0])
|
433 |
+
logger.error(msg)
|
434 |
+
logger.exception(e)
|
435 |
+
return [article_author[0], '']
|
436 |
+
|
437 |
+
def parse_authorline_parallel(article_authors, n_processes=None):
|
438 |
+
"""
|
439 |
+
Parallelize `parse_authorline`
|
440 |
+
Parameters
|
441 |
+
----------
|
442 |
+
article_authors : list
|
443 |
+
list of tuples (arXiv id, author strings from metadata)
|
444 |
+
(optional)
|
445 |
+
n_processes : int
|
446 |
+
number of processes
|
447 |
+
Returns
|
448 |
+
-------
|
449 |
+
authorsplit : list
|
450 |
+
list of author strings in standardized format
|
451 |
+
[
|
452 |
+
[ author1_keyname, author1_firstnames, author1_suffix, affil1,
|
453 |
+
affil2 ] ,
|
454 |
+
[ author2_keyname, author2_firstnames, author1_suffix, affil1 ] ,
|
455 |
+
[ author3_keyname, author3_firstnames, author1_suffix ]
|
456 |
+
]
|
457 |
+
"""
|
458 |
+
logger.info(
|
459 |
+
'Parsing author lines for {} articles...'.format(len(article_authors))
|
460 |
+
)
|
461 |
+
|
462 |
+
pool = Pool(n_processes)
|
463 |
+
parsed = pool.map(_parse_article_authors, article_authors)
|
464 |
+
outdict = {aid: auth for aid, auth in parsed}
|
465 |
+
|
466 |
+
filename = os.path.join(DIR_OUTPUT, 'authors-parsed.json.gz')
|
467 |
+
logger.info('Saving to {}'.format(filename))
|
468 |
+
with gzip.open(filename, 'wb') as fout:
|
469 |
+
fout.write(json.dumps(outdict).encode('utf-8'))
|
arxiv_public_data/config.py
ADDED
@@ -0,0 +1,55 @@
import os
import json
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s: %(message)s'
)
baselog = logging.getLogger('arxivdata')
logger = baselog.getChild('config')

DEFAULT_PATH = os.path.join(os.path.abspath('/'), 'arxiv-data')
JSONFILE = './config.json'
KEY = 'ARXIV_DATA'

def get_outdir():
    """
    Grab the outdir from:
        1) Environment
        2) config.json
        3) default ($PWD/arxiv-data)
    """
    if os.environ.get(KEY):
        out = os.environ.get(KEY)
    else:
        if os.path.exists(JSONFILE):
            js = json.load(open(JSONFILE))
            if not KEY in js:
                logger.warn('Configuration in "{}" invalid, using default'.format(JSONFILE))
                logger.warn("default output directory is {}".format(DEFAULT_PATH))
                out = DEFAULT_PATH
            else:
                out = js[KEY]
        else:
            logger.warn("default output directory is {}".format(DEFAULT_PATH))
            out = DEFAULT_PATH
    return out

try:
    DIR_BASE = get_outdir()
except Exception as e:
    logger.error(
        "Error attempting to get path from ENV or json conf, "
        "defaulting to current directory"
    )
    DIR_BASE = DEFAULT_PATH

DIR_FULLTEXT = os.path.join(DIR_BASE, 'fulltext')
DIR_PDFTARS = os.path.join(DIR_BASE, 'tarpdfs')
DIR_OUTPUT = os.path.join(DIR_BASE, 'output')
LOGGER = baselog

for dirs in [DIR_BASE, DIR_PDFTARS, DIR_FULLTEXT, DIR_OUTPUT]:
    if not os.path.exists(dirs):
        os.mkdir(dirs)
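Because everything in `arxiv_public_data` resolves its paths through this module at import time, the data root is easiest to change via the `ARXIV_DATA` environment variable before the first import; a small sketch (the directory path is only an example):

```python
import os

# Choose a writable data root before importing anything that loads
# arxiv_public_data.config; the path below is illustrative.
os.environ['ARXIV_DATA'] = '/tmp/arxiv-data'

from arxiv_public_data.config import DIR_BASE, DIR_FULLTEXT, DIR_PDFTARS, DIR_OUTPUT
print(DIR_BASE, DIR_FULLTEXT, DIR_PDFTARS, DIR_OUTPUT)
```
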
arxiv_public_data/embeddings/__init__.py
ADDED
File without changes
|
arxiv_public_data/embeddings/tf_hub.py
ADDED
@@ -0,0 +1,185 @@
1 |
+
"""
|
2 |
+
tf_hub.py
|
3 |
+
|
4 |
+
Find text embeddings using pre-trained TensorFlow Hub models
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import pickle
|
9 |
+
import numpy as np
|
10 |
+
|
11 |
+
from arxiv_public_data.config import DIR_OUTPUT, LOGGER
|
12 |
+
from arxiv_public_data.embeddings.util import batch_fulltext
|
13 |
+
|
14 |
+
logger = LOGGER.getChild('embds')
|
15 |
+
|
16 |
+
try:
|
17 |
+
import tensorflow as tf
|
18 |
+
import tensorflow_hub as hub
|
19 |
+
import sentencepiece as spm
|
20 |
+
except ImportError as e:
|
21 |
+
logger.warn("This module requires 'tensorflow', 'tensorflow-hub', and"
|
22 |
+
"'sentencepiece'\n"
|
23 |
+
'Please install these modules to use tf_hub.py')
|
24 |
+
|
25 |
+
|
26 |
+
UNIV_SENTENCE_ENCODER_URL = ('https://tfhub.dev/google/'
|
27 |
+
'universal-sentence-encoder/2')
|
28 |
+
|
29 |
+
ELMO_URL = "https://tfhub.dev/google/elmo/2"
|
30 |
+
ELMO_KWARGS = dict(signature='default', as_dict=True)
|
31 |
+
ELMO_MODULE_KWARGS = dict(trainable=True)
|
32 |
+
ELMO_DICTKEY = 'default'
|
33 |
+
|
34 |
+
DIR_EMBEDDING = os.path.join(DIR_OUTPUT, 'embeddings')
|
35 |
+
if not os.path.exists(DIR_EMBEDDING):
|
36 |
+
os.mkdir(DIR_EMBEDDING)
|
37 |
+
|
38 |
+
def elmo_strings(batches, filename, batchsize=32):
|
39 |
+
"""
|
40 |
+
Compute and save vector embeddings of lists of strings in batches
|
41 |
+
Parameters
|
42 |
+
----------
|
43 |
+
batches : iterable of strings to be embedded
|
44 |
+
filename : str
|
45 |
+
filename to store embeddings
|
46 |
+
(optional)
|
47 |
+
batchsize : int
|
48 |
+
size of batches
|
49 |
+
"""
|
50 |
+
g = tf.Graph()
|
51 |
+
with g.as_default():
|
52 |
+
module = hub.Module(ELMO_URL, **ELMO_MODULE_KWARGS)
|
53 |
+
text_input = tf.placeholder(dtype=tf.string, shape=[None])
|
54 |
+
embeddings = module(text_input, **ELMO_KWARGS)
|
55 |
+
init_op = tf.group([tf.global_variables_initializer(),
|
56 |
+
tf.tables_initializer()])
|
57 |
+
g.finalize()
|
58 |
+
|
59 |
+
with tf.Session(graph=g) as sess:
|
60 |
+
sess.run(init_op)
|
61 |
+
|
62 |
+
for i, batch in enumerate(batches):
|
63 |
+
# grab mean-pooling of contextualized word reps
|
64 |
+
logger.info("Computing/saving batch {}".format(i))
|
65 |
+
with open(filename, 'ab') as fout:
|
66 |
+
pickle.dump(sess.run(
|
67 |
+
embeddings, feed_dict={text_input: batch}
|
68 |
+
)[ELMO_DICTKEY], fout)
|
69 |
+
|
70 |
+
UNIV_SENTENCE_LITE = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
|
71 |
+
|
72 |
+
def get_sentence_piece_model():
|
73 |
+
with tf.Session() as sess:
|
74 |
+
module = hub.Module(UNIV_SENTENCE_LITE)
|
75 |
+
return sess.run(module(signature="spm_path"))
|
76 |
+
|
77 |
+
def process_to_IDs_in_sparse_format(sp, sentences):
|
78 |
+
"""
|
79 |
+
An utility method that processes sentences with the sentence piece
|
80 |
+
processor
|
81 |
+
'sp' and returns the results in tf.SparseTensor-similar format:
|
82 |
+
(values, indices, dense_shape)
|
83 |
+
"""
|
84 |
+
ids = [sp.EncodeAsIds(x) for x in sentences]
|
85 |
+
max_len = max(len(x) for x in ids)
|
86 |
+
dense_shape=(len(ids), max_len)
|
87 |
+
values=[item for sublist in ids for item in sublist]
|
88 |
+
indices=[[row,col] for row in range(len(ids)) for col in range(len(ids[row]))]
|
89 |
+
return (values, indices, dense_shape)
|
90 |
+
|
91 |
+
def universal_sentence_encoder_lite(batches, filename, spm_path, batchsize=32):
|
92 |
+
"""
|
93 |
+
Compute and save vector embeddings of lists of strings in batches
|
94 |
+
Parameters
|
95 |
+
----------
|
96 |
+
batches : iterable of strings to be embedded
|
97 |
+
filename : str
|
98 |
+
filename to store embeddings
|
99 |
+
spm_path : str
|
100 |
+
path to sentencepiece model from `get_sentence_piece_model`
|
101 |
+
(optional)
|
102 |
+
batchsize : int
|
103 |
+
size of batches
|
104 |
+
"""
|
105 |
+
sp = spm.SentencePieceProcessor()
|
106 |
+
sp.Load(spm_path)
|
107 |
+
|
108 |
+
g = tf.Graph()
|
109 |
+
with g.as_default():
|
110 |
+
module = hub.Module(UNIV_SENTENCE_LITE)
|
111 |
+
input_placeholder = tf.sparse_placeholder(
|
112 |
+
tf.int64, shape=(None, None)
|
113 |
+
)
|
114 |
+
embeddings = module(
|
115 |
+
inputs=dict(
|
116 |
+
values=input_placeholder.values, indices=input_placeholder.indices,
|
117 |
+
dense_shape=input_placeholder.dense_shape
|
118 |
+
)
|
119 |
+
)
|
120 |
+
init_op = tf.group([tf.global_variables_initializer(),
|
121 |
+
tf.tables_initializer()])
|
122 |
+
g.finalize()
|
123 |
+
|
124 |
+
with tf.Session(graph=g) as sess:
|
125 |
+
sess.run(init_op)
|
126 |
+
for i, batch in enumerate(batches):
|
127 |
+
values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, batch)
|
128 |
+
logger.info("Computing/saving batch {}".format(i))
|
129 |
+
emb = sess.run(
|
130 |
+
embeddings,
|
131 |
+
feed_dict={
|
132 |
+
input_placeholder.values: values,
|
133 |
+
input_placeholder.indices: indices,
|
134 |
+
input_placeholder.dense_shape: dense_shape
|
135 |
+
}
|
136 |
+
)
|
137 |
+
with open(filename, 'ab') as fout:
|
138 |
+
pickle.dump(emb, fout)
|
139 |
+
|
140 |
+
def create_save_embeddings(batches, filename, encoder, headers=[], encoder_args=(),
|
141 |
+
encoder_kwargs={}, savedir=DIR_EMBEDDING):
|
142 |
+
"""
|
143 |
+
Create vector embeddings of strings and save them to filename
|
144 |
+
Parameters
|
145 |
+
----------
|
146 |
+
batches : iterator of strings
|
147 |
+
filename: str
|
148 |
+
embeddings will be saved in DIR_EMBEDDING/embeddings/filename
|
149 |
+
encoder : function(batches, savename, *args, **kwargs)
|
150 |
+
encodes strings in batches into vectors and saves them
|
151 |
+
(optional)
|
152 |
+
headers : list of things to save in embeddings file first
|
153 |
+
|
154 |
+
Examples
|
155 |
+
--------
|
156 |
+
# For list of strings, create batched numpy array of objects
|
157 |
+
batches = np.array_split(
|
158 |
+
np.array(strings, dtype='object'), len(strings)//batchsize
|
159 |
+
)
|
160 |
+
headers = []
|
161 |
+
|
162 |
+
# For the fulltext which cannot fit in memory, use `util.batch_fulltext`
|
163 |
+
md_index, all_ids, batch_gen = batch_fulltext()
|
164 |
+
headers = [md_index, all_ids]
|
165 |
+
|
166 |
+
# Universal Sentence Encoder Lite:
|
167 |
+
spm_path = get_sentence_piece_model()
|
168 |
+
create_save_embeddings(batches, filename, universal_sentence_encoder_lite,
|
169 |
+
headers=headers, encoder_args=(spm_path,))
|
170 |
+
|
171 |
+
# ELMO:
|
172 |
+
create_save_embeddings(strings, filename, elmo_strings, headers=headers)
|
173 |
+
"""
|
174 |
+
if not os.path.exists(savedir):
|
175 |
+
os.makedirs(savedir)
|
176 |
+
|
177 |
+
savename = os.path.join(savedir, filename)
|
178 |
+
|
179 |
+
with open(savename, 'ab') as fout:
|
180 |
+
for h in headers:
|
181 |
+
pickle.dump(h, fout)
|
182 |
+
|
183 |
+
logger.info("Saving embeddings to {}".format(savename))
|
184 |
+
encoder(batches, savename, *encoder_args,
|
185 |
+
**encoder_kwargs)
|
arxiv_public_data/embeddings/util.py
ADDED
@@ -0,0 +1,151 @@
1 |
+
"""
|
2 |
+
util.py
|
3 |
+
|
4 |
+
author: Colin Clement
|
5 |
+
date: 2019-04-05
|
6 |
+
|
7 |
+
This module contains helper functions for loading embeddings and batch
|
8 |
+
loading the full text, since many computers cannot contain the whole
|
9 |
+
fulltext in memory.
|
10 |
+
"""
|
11 |
+
|
12 |
+
import os
|
13 |
+
import re
|
14 |
+
import numpy as np
|
15 |
+
import pickle
|
16 |
+
|
17 |
+
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT
|
18 |
+
from arxiv_public_data.oai_metadata import load_metadata
|
19 |
+
|
20 |
+
def id_to_pathname(aid):
|
21 |
+
"""
|
22 |
+
Make filename path for text document, matching the format of fulltext
|
23 |
+
creation in `s3_bulk_download`
|
24 |
+
Parameters
|
25 |
+
----------
|
26 |
+
aid : str
|
27 |
+
string of arXiv article id as found in metadata
|
28 |
+
Returns
|
29 |
+
-------
|
30 |
+
pathname : str
|
31 |
+
pathname in which to store the article following
|
32 |
+
Examples
|
33 |
+
--------
|
34 |
+
>>> id_to_pathname('hep-ph/0001001') #doctest: +ELLIPSIS
|
35 |
+
'.../hep-ph/0001/hep-ph0001001.txt'
|
36 |
+
|
37 |
+
>>> id_to_pathname('1501.13851') #doctest: +ELLIPSIS
|
38 |
+
'.../arxiv/1501/1501.13851.txt'
|
39 |
+
"""
|
40 |
+
if '.' in aid: # new style ArXiv ID
|
41 |
+
yymm = aid.split('.')[0]
|
42 |
+
return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')
|
43 |
+
|
44 |
+
# old style ArXiv ID
|
45 |
+
cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
|
46 |
+
yymm = arxiv_id[:4]
|
47 |
+
return os.path.join(DIR_FULLTEXT, cat, yymm, aid.replace('/', '') + '.txt')
|
48 |
+
|
49 |
+
def load_generator(paths, batchsize):
|
50 |
+
"""
|
51 |
+
Creates a generator object for batch loading files from paths
|
52 |
+
Parameters
|
53 |
+
----------
|
54 |
+
paths : list of filepaths
|
55 |
+
batchsize : int
|
56 |
+
Returns
|
57 |
+
-------
|
58 |
+
file_contents : list of strings of contents of files in path
|
59 |
+
"""
|
60 |
+
assert type(paths) is list, 'Requires a list of paths'
|
61 |
+
assert type(batchsize) is int, 'batchsize must be an int'
|
62 |
+
assert batchsize > 0, 'batchsize must be positive'
|
63 |
+
|
64 |
+
out = []
|
65 |
+
for p in paths:
|
66 |
+
with open(p, 'r') as fin:
|
67 |
+
out.append(fin.read())
|
68 |
+
if len(out) == batchsize:
|
69 |
+
yield np.array(out, dtype='object')
|
70 |
+
out = []
|
71 |
+
yield out
|
72 |
+
|
73 |
+
def batch_fulltext(batchsize=32, maxnum=None):
|
74 |
+
"""
|
75 |
+
Read metadata and find corresponding files in the fulltext
|
76 |
+
Parameters
|
77 |
+
----------
|
78 |
+
(optional)
|
79 |
+
batchsize : int
|
80 |
+
number of fulltext files to load into a batch
|
81 |
+
maxnum : int
|
82 |
+
the maximum number of paths to feed the generator, for
|
83 |
+
testing purposes
|
84 |
+
Returns
|
85 |
+
-------
|
86 |
+
md_index, all_ids, load_gen : tuple of (list, list, generator)
|
87 |
+
md_index is a mapping of existing fulltext files, in order
|
88 |
+
of their appearance, and containing the index of corresponding
|
89 |
+
metadata. all_ids is a list of all arXiv IDs in the metadata.
|
90 |
+
load_gen is a generator which allows batched loading of the
|
91 |
+
full-text, as defined by `load_generator`
|
92 |
+
"""
|
93 |
+
all_ids = [m['id'] for m in load_metadata()]
|
94 |
+
all_paths = [id_to_pathname(aid) for aid in all_ids]
|
95 |
+
exists = [os.path.exists(p) for p in all_paths]
|
96 |
+
existing_paths = [p for p, e in zip(all_paths, exists) if e][:maxnum]
|
97 |
+
md_index = [i for i, e in enumerate(exists) if e]
|
98 |
+
return md_index, all_ids, load_generator(existing_paths, batchsize)
|
99 |
+
|
100 |
+
def load_embeddings(filename, headers=0):
|
101 |
+
"""
|
102 |
+
Loads vector embeddings
|
103 |
+
Parameters
|
104 |
+
----------
|
105 |
+
filename : str
|
106 |
+
path to vector embeddings saved by `create_save_embeddings`
|
107 |
+
(optional)
|
108 |
+
headers : int
|
109 |
+
number of pickle calls containing metadata separate from the graphs
|
110 |
+
Returns
|
111 |
+
-------
|
112 |
+
embeddings : dict
|
113 |
+
keys 'embeddings' containing vector embeddings and
|
114 |
+
'headers' containining metadata
|
115 |
+
"""
|
116 |
+
out = {'embeddings': [], 'headers': []}
|
117 |
+
N = 0
|
118 |
+
with open(filename, 'rb') as fin:
|
119 |
+
while True:
|
120 |
+
try:
|
121 |
+
if N < headers:
|
122 |
+
out['headers'].append(pickle.load(fin))
|
123 |
+
else:
|
124 |
+
out['embeddings'].extend(pickle.load(fin))
|
125 |
+
except EOFError as e:
|
126 |
+
break
|
127 |
+
N += 1
|
128 |
+
out['embeddings'] = np.array(out['embeddings'])
|
129 |
+
return out
|
130 |
+
|
131 |
+
def fill_zeros(loaded_embedding):
|
132 |
+
"""
|
133 |
+
Fill out zeros in the full-text embedding where full-text is missing
|
134 |
+
Parameters
|
135 |
+
----------
|
136 |
+
loaded_embedding : dict
|
137 |
+
dict as saved from with `load_embeddings` with 2 headers
|
138 |
+
of the list of the metadata_index each embedding vector corresponds
|
139 |
+
to, the list of all article ids
|
140 |
+
Returns
|
141 |
+
-------
|
142 |
+
embeddings : array_like
|
143 |
+
vector embeddings of shape (number of articles, embedding dimension)
|
144 |
+
"""
|
145 |
+
md_index = loaded_embedding['headers'][0]
|
146 |
+
all_ids = loaded_embedding['headers'][1]
|
147 |
+
vectors = loaded_embedding['embeddings']
|
148 |
+
output = np.zeros((len(all_ids), vectors.shape[1]))
|
149 |
+
for idx, v in zip(md_index, vectors):
|
150 |
+
output[idx,:] = v
|
151 |
+
return output
|
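As a quick illustration of how the helpers above fit together, embeddings written by `create_save_embeddings` with the two `batch_fulltext` headers can be reloaded and aligned to the metadata like this (the filename is hypothetical):

```python
from arxiv_public_data.embeddings.util import load_embeddings, fill_zeros

# 'fulltext-embeddings.pkl' is a placeholder name for a file produced by
# create_save_embeddings with headers=[md_index, all_ids].
loaded = load_embeddings('fulltext-embeddings.pkl', headers=2)

# One row per article id; rows stay zero where no full text was available.
vectors = fill_zeros(loaded)
print(vectors.shape)
```
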
arxiv_public_data/fixunicode.py
ADDED
@@ -0,0 +1,108 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import re
|
3 |
+
import unicodedata
|
4 |
+
|
5 |
+
"""
|
6 |
+
List of ligatures: https://en.wikipedia.org/wiki/Typographic_ligature
|
7 |
+
MKB removed the following elements from the list:
|
8 |
+
- et 🙰 U+1F670 🙰
|
9 |
+
- ſs, ſz ẞ, ß U+00DF ß
|
10 |
+
|
11 |
+
Additional notes:
|
12 |
+
* Some classes of characters were listed in the original utf8 fixes but I'm not
|
13 |
+
sure they don't belong elsewhere (end user processing). In these cases, pass
|
14 |
+
through unidecode should normalize them to proper ascii. They are listed here
|
15 |
+
with reasoning:
|
16 |
+
|
17 |
+
- Ditch combining diacritics http://unicode.org/charts/PDF/U0300.pdf
|
18 |
+
r'[\u0300-\u036F]': ''
|
19 |
+
|
20 |
+
- Ditch chars that sometimes (incorrectly?) appear as combining diacritics
|
21 |
+
r'(?:\xa8|[\u02C0-\u02DF])': ''
|
22 |
+
|
23 |
+
* Should we run ftfy?
|
24 |
+
"""
|
25 |
+
|
26 |
+
ligature_table = """
|
27 |
+
AA, aa Ꜳ, ꜳ U+A732, U+A733 Ꜳ ꜳ
|
28 |
+
AE, ae Æ, æ U+00C6, U+00E6 Æ æ
|
29 |
+
AO, ao Ꜵ, ꜵ U+A734, U+A735 Ꜵ ꜵ
|
30 |
+
AU, au Ꜷ, ꜷ U+A736, U+A737 Ꜷ ꜷ
|
31 |
+
AV, av Ꜹ, ꜹ U+A738, U+A739 Ꜹ ꜹ
|
32 |
+
AV, av Ꜻ, ꜻ U+A73A, U+A73B Ꜻ ꜻ
|
33 |
+
AY, ay Ꜽ, ꜽ U+A73C, U+A73D Ꜽ ꜽ
|
34 |
+
ff ff U+FB00 ff
|
35 |
+
ffi ffi U+FB03 ffi
|
36 |
+
ffl ffl U+FB04 ffl
|
37 |
+
fi fi U+FB01 fi
|
38 |
+
fl fl U+FB02 fl
|
39 |
+
OE, oe Œ, œ U+0152, U+0153 Œ œ
|
40 |
+
OO, oo Ꝏ, ꝏ U+A74E, U+A74F Ꝏ ꝏ
|
41 |
+
st st U+FB06 st
|
42 |
+
ſt ſt U+FB05 ſt
|
43 |
+
TZ, tz Ꜩ, ꜩ U+A728, U+A729 Ꜩ ꜩ
|
44 |
+
ue ᵫ U+1D6B ᵫ
|
45 |
+
VY, vy Ꝡ, ꝡ U+A760, U+A761 Ꝡ ꝡ
|
46 |
+
db ȸ U+0238 ȸ
|
47 |
+
dz ʣ U+02A3 ʣ
|
48 |
+
dʑ ʥ U+02A5 ʥ
|
49 |
+
dʒ ʤ U+02A4 ʤ
|
50 |
+
fŋ ʩ U+02A9 ʩ
|
51 |
+
IJ, ij IJ, ij U+0132, U+0133 IJ ij
|
52 |
+
ls ʪ U+02AA ʪ
|
53 |
+
lz ʫ U+02AB ʫ
|
54 |
+
lʒ ɮ U+026E ɮ
|
55 |
+
qp ȹ U+0239 ȹ
|
56 |
+
tɕ ʨ U+02A8 ʨ
|
57 |
+
ts ʦ U+02A6 ʦ
|
58 |
+
tʃ ʧ U+02A7 ʧ
|
59 |
+
ui ꭐ U+AB50 ꭐ
|
60 |
+
ui ꭑ U+AB51 ꭐ
|
61 |
+
"""
|
62 |
+
|
63 |
+
unicode_mapping = {}
|
64 |
+
|
65 |
+
for row in ligature_table.split('\n'):
|
66 |
+
if row.count('\t') <= 1:
|
67 |
+
continue
|
68 |
+
|
69 |
+
unicode_mapping.update(
|
70 |
+
{
|
71 |
+
u.strip(): unicodedata.normalize('NFKC', a.strip())
|
72 |
+
for a, u in zip(*[c.split(',') for c in row.split('\t')[:2]])
|
73 |
+
}
|
74 |
+
)
|
75 |
+
|
76 |
+
unicode_mapping.update({
|
77 |
+
# 'ẞ, ß': careful, some use this for \beta
|
78 |
+
r'(\B)\u00DF': r'\1ss',
|
79 |
+
|
80 |
+
# Additions (manual normalization that we feel is important)
|
81 |
+
# unicode space u'\xa0' (not \x{0c} = ^L keep!)
|
82 |
+
'\xa0': ' ',
|
83 |
+
|
84 |
+
# single + double quotes, dash, and asterisk
|
85 |
+
r'[\u2018\u2019]': r"'",
|
86 |
+
r'[\u201C\u201D]': r'"',
|
87 |
+
r'[\xad\u2014]': r'-',
|
88 |
+
r'\xb7': r'*'
|
89 |
+
})
|
90 |
+
|
91 |
+
|
92 |
+
def fix_unicode(txt: str) -> str:
|
93 |
+
"""
|
94 |
+
Given UTF-8 encoded text, remove typographical ligatures (normalize to true
|
95 |
+
non-display character set) and do a general normalization of the unicode
|
96 |
+
so that possible redundant characters and simplified to a single set.
|
97 |
+
|
98 |
+
Parameters
|
99 |
+
----------
|
100 |
+
txt : unicode string
|
101 |
+
|
102 |
+
Returns
|
103 |
+
-------
|
104 |
+
output : unicode string
|
105 |
+
"""
|
106 |
+
for search, replace in unicode_mapping.items():
|
107 |
+
txt = re.subn(search, replace, txt)[0]
|
108 |
+
return unicodedata.normalize('NFKC', txt)
|
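A tiny example of the normalization performed by `fix_unicode` (the input string is made up):

```python
from arxiv_public_data.fixunicode import fix_unicode

# Ligatures are expanded and typographic quotes/dashes are simplified.
print(fix_unicode('The “eﬃcient” baseline—see §2'))
# expected to print something like: The "efficient" baseline-see §2
```
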
arxiv_public_data/fulltext.py
ADDED
@@ -0,0 +1,349 @@
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import sys
|
4 |
+
import glob
|
5 |
+
import shlex
|
6 |
+
from functools import partial
|
7 |
+
|
8 |
+
from multiprocessing import Pool
|
9 |
+
from subprocess import check_call, CalledProcessError, TimeoutExpired, PIPE
|
10 |
+
|
11 |
+
from arxiv_public_data.config import LOGGER
|
12 |
+
from arxiv_public_data import fixunicode, pdfstamp
|
13 |
+
|
14 |
+
log = LOGGER.getChild('fulltext')
|
15 |
+
TIMELIMIT = 2*60
|
16 |
+
STAMP_SEARCH_LIMIT = 1000
|
17 |
+
|
18 |
+
PDF2TXT = 'pdf2txt.py'
|
19 |
+
PDFTOTEXT = 'pdftotext'
|
20 |
+
|
21 |
+
RE_REPEATS = r'(\(cid:\d+\)|lllll|\.\.\.\.\.|\*\*\*\*\*)'
|
22 |
+
|
23 |
+
|
24 |
+
def reextension(filename: str, extension: str) -> str:
|
25 |
+
""" Give a filename a new extension """
|
26 |
+
name, _ = os.path.splitext(filename)
|
27 |
+
return '{}.{}'.format(name, extension)
|
28 |
+
|
29 |
+
|
30 |
+
def average_word_length(txt):
|
31 |
+
"""
|
32 |
+
Gather statistics about the text, primarily the average word length
|
33 |
+
|
34 |
+
Parameters
|
35 |
+
----------
|
36 |
+
txt : str
|
37 |
+
|
38 |
+
Returns
|
39 |
+
-------
|
40 |
+
word_length : float
|
41 |
+
Average word length in the text
|
42 |
+
"""
|
43 |
+
#txt = re.subn(RE_REPEATS, '', txt)[0]
|
44 |
+
nw = len(txt.split())
|
45 |
+
nc = len(txt)
|
46 |
+
avgw = nc / (nw + 1)
|
47 |
+
return avgw
|
48 |
+
|
49 |
+
|
50 |
+
def process_timeout(cmd, timeout):
|
51 |
+
return check_call(cmd, timeout=timeout, stdout=PIPE, stderr=PIPE)
|
52 |
+
|
53 |
+
|
54 |
+
# ============================================================================
|
55 |
+
# functions for calling the text extraction services
|
56 |
+
# ============================================================================
|
57 |
+
def run_pdf2txt(pdffile: str, timelimit: int=TIMELIMIT, options: str=''):
|
58 |
+
"""
|
59 |
+
Run pdf2txt to extract full text
|
60 |
+
|
61 |
+
Parameters
|
62 |
+
----------
|
63 |
+
pdffile : str
|
64 |
+
Path to PDF file
|
65 |
+
|
66 |
+
timelimit : int
|
67 |
+
Amount of time to wait for the process to complete
|
68 |
+
|
69 |
+
Returns
|
70 |
+
-------
|
71 |
+
output : str
|
72 |
+
Full plain text output
|
73 |
+
"""
|
74 |
+
log.debug('Running {} on {}'.format(PDF2TXT, pdffile))
|
75 |
+
tmpfile = reextension(pdffile, 'pdf2txt')
|
76 |
+
|
77 |
+
cmd = '{cmd} {options} -o "{output}" "{pdf}"'.format(
|
78 |
+
cmd=PDF2TXT, options=options, output=tmpfile, pdf=pdffile
|
79 |
+
)
|
80 |
+
cmd = shlex.split(cmd)
|
81 |
+
output = process_timeout(cmd, timeout=timelimit)
|
82 |
+
|
83 |
+
with open(tmpfile) as f:
|
84 |
+
return f.read()
|
85 |
+
|
86 |
+
|
87 |
+
def run_pdftotext(pdffile: str, timelimit: int = TIMELIMIT) -> str:
|
88 |
+
"""
|
89 |
+
Run pdftotext on PDF file for extracted plain text
|
90 |
+
|
91 |
+
Parameters
|
92 |
+
----------
|
93 |
+
pdffile : str
|
94 |
+
Path to PDF file
|
95 |
+
|
96 |
+
timelimit : int
|
97 |
+
Amount of time to wait for the process to complete
|
98 |
+
|
99 |
+
Returns
|
100 |
+
-------
|
101 |
+
output : str
|
102 |
+
Full plain text output
|
103 |
+
"""
|
104 |
+
log.debug('Running {} on {}'.format(PDFTOTEXT, pdffile))
|
105 |
+
tmpfile = reextension(pdffile, 'pdftotxt')
|
106 |
+
|
107 |
+
cmd = '{cmd} "{pdf}" "{output}"'.format(
|
108 |
+
cmd=PDFTOTEXT, pdf=pdffile, output=tmpfile
|
109 |
+
)
|
110 |
+
cmd = shlex.split(cmd)
|
111 |
+
output = process_timeout(cmd, timeout=timelimit)
|
112 |
+
|
113 |
+
with open(tmpfile) as f:
|
114 |
+
return f.read()
|
115 |
+
|
116 |
+
|
117 |
+
def run_pdf2txt_A(pdffile: str, **kwargs) -> str:
|
118 |
+
"""
|
119 |
+
Run pdf2txt with the -A option which runs 'positional analysis on images'
|
120 |
+
and can return better results when pdf2txt combines many words together.
|
121 |
+
|
122 |
+
Parameters
|
123 |
+
----------
|
124 |
+
pdffile : str
|
125 |
+
Path to PDF file
|
126 |
+
|
127 |
+
kwargs : dict
|
128 |
+
Keyword arguments to :func:`run_pdf2txt`
|
129 |
+
|
130 |
+
Returns
|
131 |
+
-------
|
132 |
+
output : str
|
133 |
+
Full plain text output
|
134 |
+
"""
|
135 |
+
return run_pdf2txt(pdffile, options='-A', **kwargs)
|
136 |
+
|
137 |
+
|
138 |
+
# ============================================================================
|
139 |
+
# main function which extracts text
|
140 |
+
# ============================================================================
|
141 |
+
def fulltext(pdffile: str, timelimit: int = TIMELIMIT):
|
142 |
+
"""
|
143 |
+
Given a pdf file, extract the unicode text and run through very basic
|
144 |
+
unicode normalization routines. Determine the best extracted text and
|
145 |
+
return as a string.
|
146 |
+
|
147 |
+
Parameters
|
148 |
+
----------
|
149 |
+
pdffile : str
|
150 |
+
Path to PDF file from which to extract text
|
151 |
+
|
152 |
+
timelimit : int
|
153 |
+
Time in seconds to allow the extraction routines to run
|
154 |
+
|
155 |
+
Returns
|
156 |
+
-------
|
157 |
+
fulltext : str
|
158 |
+
The full plain text of the PDF
|
159 |
+
"""
|
160 |
+
if not os.path.isfile(pdffile):
|
161 |
+
raise FileNotFoundError(pdffile)
|
162 |
+
|
163 |
+
if os.stat(pdffile).st_size == 0: # file is empty
|
164 |
+
raise RuntimeError('"{}" is an empty file'.format(pdffile))
|
165 |
+
|
166 |
+
try:
|
167 |
+
output = run_pdftotext(pdffile, timelimit=timelimit)
|
168 |
+
#output = run_pdf2txt(pdffile, timelimit=timelimit)
|
169 |
+
except (TimeoutExpired, CalledProcessError, RuntimeError) as e:
|
170 |
+
output = run_pdf2txt(pdffile, timelimit=timelimit)
|
171 |
+
#output = run_pdftotext(pdffile, timelimit=timelimit)
|
172 |
+
|
173 |
+
output = fixunicode.fix_unicode(output)
|
174 |
+
#output = stamp.remove_stamp(output, split=STAMP_SEARCH_LIMIT)
|
175 |
+
wordlength = average_word_length(output)
|
176 |
+
|
177 |
+
if wordlength <= 45:
|
178 |
+
try:
|
179 |
+
os.remove(reextension(pdffile, 'pdftotxt')) # remove the tempfile
|
180 |
+
except OSError:
|
181 |
+
pass
|
182 |
+
|
183 |
+
return output
|
184 |
+
|
185 |
+
output = run_pdf2txt_A(pdffile, timelimit=timelimit)
|
186 |
+
output = fixunicode.fix_unicode(output)
|
187 |
+
#output = stamp.remove_stamp(output, split=STAMP_SEARCH_LIMIT)
|
188 |
+
wordlength = average_word_length(output)
|
189 |
+
|
190 |
+
if wordlength > 45:
|
191 |
+
raise RuntimeError(
|
192 |
+
'No accurate text could be extracted from "{}"'.format(pdffile)
|
193 |
+
)
|
194 |
+
|
195 |
+
    try:
        os.remove(reextension(pdffile, 'pdftotxt'))  # remove the tempfile
    except OSError:
        pass

    return output


def sorted_files(globber: str):
    """
    Give a globbing expression of files to find. They will be sorted upon
    return. This function is most useful when plain string sorting does not
    give numerical order,

    e.g.:
        9 -> 12 returned as 10 11 12 9 by string sort

    Instead, files are sorted by the numbers found in the string, then by the
    string itself.

    Parameters
    ----------
    globber : str
        Expression on which to search for files (bash glob expression)
    """
    files = glob.glob(globber, recursive=True)  # a list of paths, including subdirectories
    files.sort()

    allfiles = []

    for fn in files:
        nums = re.findall(r'\d+', fn)  # regular expression, find numbers in path names
        data = [str(int(n)) for n in nums] + [fn]
        # a list of [first number, second number, ..., filename] in string
        # format, otherwise the sort will fail
        allfiles.append(data)  # list of lists

    allfiles = sorted(allfiles)
    return [f[-1] for f in allfiles]  # sorted filenames


def convert_directory(path: str, timelimit: int = TIMELIMIT):
    """
    Convert all pdfs in a given `path` to full plain text. For each pdf, a file
    of the same name but extension .txt will be created. If that file exists,
    it will be skipped.

    Parameters
    ----------
    path : str
        Directory in which to search for pdfs and convert to text

    Returns
    -------
    output : list of str
        List of converted files
    """
    outlist = []

    globber = os.path.join(path, '*.pdf')
    pdffiles = sorted_files(globber)

    log.info('Searching "{}"...'.format(globber))
    log.info('Found: {} pdfs'.format(len(pdffiles)))

    for pdffile in pdffiles:
        txtfile = reextension(pdffile, 'txt')

        if os.path.exists(txtfile):
            continue

        # we don't want this function to stop half way because of one failed
        # file so just charge onto the next one
        try:
            text = fulltext(pdffile, timelimit)
            with open(txtfile, 'w') as f:
                f.write(text)
        except Exception as e:
            log.error("Conversion failed for '{}'".format(pdffile))
            log.exception(e)
            continue

        outlist.append(pdffile)
    return outlist


def convert_directory_parallel(path: str, processes: int, timelimit: int = TIMELIMIT):
    """
    Convert all pdfs in a given `path` to full plain text. For each pdf, a file
    of the same name but extension .txt will be created. If that file exists,
    it will be skipped.

    Parameters
    ----------
    path : str
        Directory in which to search for pdfs and convert to text

    Returns
    -------
    output : list of str
        List of converted files
    """
    globber = os.path.join(path, '**/*.pdf')  # search expression for glob.glob
    pdffiles = sorted_files(globber)  # a list of paths

    log.info('Searching "{}"...'.format(globber))
    log.info('Found: {} pdfs'.format(len(pdffiles)))

    pool = Pool(processes=processes)
    result = pool.map(partial(convert_safe, timelimit=timelimit), pdffiles)
    pool.close()
    pool.join()


def convert_safe(pdffile: str, timelimit: int = TIMELIMIT):
    """ Conversion function that never fails """
    try:
        convert(pdffile, timelimit=timelimit)
    except Exception as e:
        log.error('File conversion failed for {}: {}'.format(pdffile, e))


def convert(path: str, skipconverted=True, timelimit: int = TIMELIMIT) -> str:
    """
    Convert a single PDF to text.

    Parameters
    ----------
    path : str
        Location of a PDF file.

    skipconverted : boolean
        Skip conversion when there is a text file already

    Returns
    -------
    str
        Location of text file.
    """
    if not os.path.exists(path):
        raise RuntimeError('No such path: %s' % path)
    outpath = reextension(path, 'txt')

    if os.path.exists(outpath):
        return outpath

    try:
        content = fulltext(path, timelimit)
        with open(outpath, 'w') as f:
            f.write(content)
    except Exception as e:
        msg = "Conversion failed for '%s': %s"
        log.error(msg, path, e)
        raise RuntimeError(msg % (path, e)) from e
    return outpath
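
The conversion helpers above can be driven directly; a minimal sketch (the directory path is an example, the package must be importable, and the external `pdftotext`/`pdf2txt` tools that `fulltext` shells out to must be installed):

```
# Sketch: numerically sort a glob of PDFs and convert a directory to plain text
from arxiv_public_data import fulltext

pdfs = fulltext.sorted_files('/data/pdfs/**/*.pdf')   # e.g. 9.pdf sorts before 12.pdf
converted = fulltext.convert_directory('/data/pdfs')  # writes <name>.txt beside each PDF
print('converted {} files'.format(len(converted)))
```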
arxiv_public_data/internal_citations.py
ADDED
@@ -0,0 +1,128 @@
#! /usr/bin/env python
import time
import re
import sys
import glob
import os
import gzip
import json
import math
from multiprocessing import Pool, cpu_count

from arxiv_public_data.regex_arxiv import REGEX_ARXIV_FLEXIBLE, clean
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT, LOGGER

log = LOGGER.getChild('fulltext')
RE_FLEX = re.compile(REGEX_ARXIV_FLEXIBLE)
RE_OLDNAME_SPLIT = re.compile(r"([a-z\-]+)(\d+)")


def path_to_id(path):
    """ Convert filepath name of ArXiv file to ArXiv ID """
    name = os.path.splitext(os.path.basename(path))[0]
    if '.' in name:  # new ID
        return name
    split = [a for a in RE_OLDNAME_SPLIT.split(name) if a]
    return "/".join(split)


def all_articles(directory=DIR_FULLTEXT):
    """ Find all *.txt files in directory """
    out = []
    # make sure the path is absolute for os.walk
    directory = os.path.abspath(os.path.expanduser(directory))

    for root, dirs, files in os.walk(directory):
        for f in files:
            if 'txt' in f:
                out.append(os.path.join(root, f))

    return out

def extract_references(filename, pattern=RE_FLEX):
    """
    Parameters
    ----------
    filename : str
        name of file to search for pattern
    pattern : re pattern object
        compiled regex pattern

    Returns
    -------
    citations : list
        list of found arXiv IDs
    """
    out = []
    with open(filename, 'r') as fn:
        txt = fn.read()

    for matches in pattern.findall(txt):
        out.extend([clean(a) for a in matches if a])
    return list(set(out))

def citation_list_inner(articles):
    """ Find references in all the input articles
    Parameters
    ----------
    articles : list of str
        list of paths to article text
    Returns
    -------
    citations : dict[arXiv ID] = list of arXiv IDs
        dictionary of articles and their references
    """
    cites = {}
    for i, article in enumerate(articles):
        if i > 0 and i % 1000 == 0:
            log.info('Completed {} articles'.format(i))
        try:
            refs = extract_references(article)
            cites[path_to_id(article)] = refs
        except:
            log.error("Error in {}".format(article))
            continue
    return cites


def citation_list_parallel(N=cpu_count(), directory=DIR_FULLTEXT):
    """
    Split the task of checking for citations across some number of processes
    Parameters
    ----------
    N : int
        number of processes
    directory: str
        directory where full text files are stored
    Returns
    -------
    citations : dict[arXiv ID] = list of arXiv IDs
        all arXiv citations in all articles
    """
    articles = all_articles(directory)
    log.info('Calculating citation network for {} articles'.format(len(articles)))

    pool = Pool(N)

    A = len(articles)
    divs = list(range(0, A, math.ceil(A/N))) + [A]
    chunks = [articles[s:e] for s, e in zip(divs[:-1], divs[1:])]

    cites = pool.map(citation_list_inner, chunks)

    allcites = {}
    for c in cites:
        allcites.update(c)
    return allcites


def default_filename():
    return os.path.join(DIR_OUTPUT, 'internal-citations.json.gz')


def save_to_default_location(citations):
    filename = default_filename()

    log.info('Saving to "{}"'.format(filename))
    with gzip.open(filename, 'wb') as fn:
        fn.write(json.dumps(citations).encode('utf-8'))
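
A minimal sketch of how `path_to_id` and `extract_references` combine (the text file path is hypothetical):

```
# Sketch: list the arXiv IDs cited by one converted article
from arxiv_public_data.internal_citations import path_to_id, extract_references

txtfile = '/data/fulltext/arxiv/1501/1501.13851.txt'   # hypothetical converted article
print(path_to_id(txtfile))          # -> '1501.13851'
print(extract_references(txtfile))  # -> deduplicated list of cited arXiv IDs
```

For the whole corpus, `citation_list_parallel()` runs the same extraction over every file under `DIR_FULLTEXT` and returns one dictionary keyed by article ID.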
arxiv_public_data/oai_metadata.py
ADDED
@@ -0,0 +1,282 @@
"""
oai_metadata.py

authors: Matt Bierbaum and Colin Clement
date: 2019-02-25

This module interacts with the Open Archive Initiative API, downloading
the metadata for all Arxiv articles.

Usage
=====

python oai_metadata.py data/<savefile>.json

Notes
=====
The save file is not technically JSON, but individual streamed lines of JSON,
each of which is compressed by gzip. Use the helper function load_metadata
to be sure to open it without error.

Resources
=========
* http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm
* https://arxiv.org/help/oa/index
"""

import os
import gzip
import glob
import json
import time
import hashlib
import datetime
import requests
import xml.etree.ElementTree as ET

from arxiv_public_data.config import LOGGER, DIR_BASE

log = LOGGER.getChild('metadata')

URL_ARXIV_OAI = 'https://export.arxiv.org/oai2'
URL_CITESEER_OAI = 'http://citeseerx.ist.psu.edu/oai2'
OAI_XML_NAMESPACES = {
    'OAI': 'http://www.openarchives.org/OAI/2.0/',
    'arXiv': 'http://arxiv.org/OAI/arXivRaw/'
}

def get_list_record_chunk(resumptionToken=None, harvest_url=URL_ARXIV_OAI,
                          metadataPrefix='arXivRaw'):
    """
    Query the OAI API for the metadata of 1000 arXiv articles

    Parameters
    ----------
    resumptionToken : str
        Token for the API which triggers the next 1000 articles

    Returns
    -------
    record_chunks : str
        metadata of 1000 arXiv articles as an XML string
    """
    parameters = {'verb': 'ListRecords'}

    if resumptionToken:
        parameters['resumptionToken'] = resumptionToken
    else:
        parameters['metadataPrefix'] = metadataPrefix

    response = requests.get(harvest_url, params=parameters)

    if response.status_code == 200:
        return response.text

    if response.status_code == 503:
        secs = int(response.headers.get('Retry-After', 20)) * 1.5
        log.info('Requested to wait, waiting {} seconds until retry...'.format(secs))

        time.sleep(secs)
        return get_list_record_chunk(resumptionToken=resumptionToken)
    else:
        raise Exception(
            'Unknown error in HTTP request {}, status code: {}'.format(
                response.url, response.status_code
            )
        )

def _record_element_text(elm, name):
    """ XML helper function for extracting text from leaf (single-node) elements """
    item = elm.find('arXiv:{}'.format(name), OAI_XML_NAMESPACES)
    return item.text if item is not None else None

def _record_element_all(elm, name):
    """ XML helper function for extracting text from queries with multiple nodes """
    return elm.findall('arXiv:{}'.format(name), OAI_XML_NAMESPACES)

def parse_record(elm):
    """
    Parse the XML element of a single ArXiv article into a dictionary of
    attributes

    Parameters
    ----------
    elm : xml.etree.ElementTree.Element
        Element of the record of a single ArXiv article

    Returns
    -------
    output : dict
        Attributes of the ArXiv article stored as a dict with the keys
        id, submitter, authors, title, comments, journal-ref, doi, abstract,
        report-no, categories, and versions
    """
    text_keys = [
        'id', 'submitter', 'authors', 'title', 'comments',
        'journal-ref', 'doi', 'abstract', 'report-no'
    ]
    output = {key: _record_element_text(elm, key) for key in text_keys}
    output['categories'] = [
        i.text for i in (_record_element_all(elm, 'categories') or [])
    ]
    output['versions'] = [
        i.attrib['version'] for i in _record_element_all(elm, 'version')
    ]
    return output

def parse_xml_listrecords(root):
    """
    Parse XML of one chunk of the metadata of 1000 ArXiv articles
    into a list of dictionaries

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element containing the records of an entire chunk of ArXiv queries

    Returns
    -------
    records, resumptionToken : list, str
        records is a list of 1000 dictionaries, each containing the
        attributes of a single arxiv article
        resumptionToken is a string which is fed into the subsequent query
    """
    resumptionToken = root.find(
        'OAI:ListRecords/OAI:resumptionToken',
        OAI_XML_NAMESPACES
    )
    resumptionToken = resumptionToken.text if resumptionToken is not None else ''

    records = root.findall(
        'OAI:ListRecords/OAI:record/OAI:metadata/arXiv:arXivRaw',
        OAI_XML_NAMESPACES
    )
    records = [parse_record(p) for p in records]

    return records, resumptionToken

def check_xml_errors(root):
    """ Check for, log, and raise any OAI service errors in the XML """
    error = root.find('OAI:error', OAI_XML_NAMESPACES)

    if error is not None:
        raise RuntimeError(
            'OAI service returned error: {}'.format(error.text)
        )

def find_default_locations():
    outfile = os.path.join(DIR_BASE, 'arxiv-metadata-oai-*.json.gz')
    resume = os.path.join(
        DIR_BASE, 'arxiv-metadata-oai-*.json.gz-resumptionToken.txt'
    )
    fn_outfile = sorted(glob.glob(outfile))
    fn_resume = sorted(glob.glob(resume))

    if len(fn_outfile) > 0:
        return fn_outfile[-1]
    return None

def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True):
    """
    Download the metadata for every article in the ArXiv via the OAI API

    Parameters
    ----------
    outfile : str (default './arxiv-metadata-oai-<date>.json')
        name of file where data is stored, appending each chunk of 1000
        articles.
    resumptionToken : str (default None)
        token which instructs the OAI server to continue feeding the next
        chunk
    autoresume : bool
        If true, it looks for a saved resumptionToken in the file
        <outfile>-resumptionToken.txt
    """
    date = str(datetime.datetime.now()).split(' ')[0]

    outfile = (
        outfile or                    # user-supplied
        find_default_locations() or   # already in progress
        os.path.join(
            DIR_BASE, 'arxiv-metadata-oai-{}.json.gz'.format(date)
        )                             # new file
    )

    directory = os.path.split(outfile)[0]
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    tokenfile = '{}-resumptionToken.txt'.format(outfile)
    chunk_index = 0
    total_records = 0

    log.info('Saving metadata to "{}"'.format(outfile))

    resumptionToken = None
    if autoresume:
        try:
            resumptionToken = open(tokenfile, 'r').read()
        except Exception as e:
            log.warn("No tokenfile found '{}'".format(tokenfile))
            log.info("Starting download from scratch...")

    while True:
        log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
            chunk_index, total_records, resumptionToken)
        )
        xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
        check_xml_errors(xml_root)
        records, resumptionToken = parse_xml_listrecords(xml_root)

        chunk_index = chunk_index + 1
        total_records = total_records + len(records)

        with gzip.open(outfile, 'at', encoding='utf-8') as fout:
            for rec in records:
                fout.write(json.dumps(rec) + '\n')
        if resumptionToken:
            with open(tokenfile, 'w') as fout:
                fout.write(resumptionToken)
        else:
            log.info('No resumption token, query finished')
            return

        time.sleep(12)  # OAI server usually requires a 10s wait

def load_metadata(infile=None):
    """
    Load metadata saved by all_of_arxiv, as a list of lines of gzip compressed
    json.

    Parameters
    ----------
    infile : str or None
        name of file saved by gzip. If None, one is attempted to be found
        in the expected location with the expected name.

    Returns
    -------
    article_attributes : list
        list of dicts, each of which contains the metadata attributes of
        the ArXiv articles
    """
    fname = infile or find_default_locations()
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        return [json.loads(line) for line in fin.readlines()]

def hash_abstracts(metadata):
    """ Replace abstracts with their MD5 hash for legal distribution """
    metadata_no_abstract = []
    for i in range(len(metadata)):
        m = metadata[i].copy()
        m['abstract_md5'] = hashlib.md5(m['abstract'].encode()).hexdigest()
        del m['abstract']
        metadata_no_abstract.append(m)
    return metadata_no_abstract

def validate_abstract_hashes(metadata, metadata_no_abstract):
    """ Validate that abstracts match the hashes """
    for m, n in zip(metadata, metadata_no_abstract):
        md5 = hashlib.md5(m['abstract'].encode()).hexdigest()
        if not md5 == n['abstract_md5']:
            return False
    return True
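
A sketch of the harvesting workflow (running it issues real OAI requests, observes the roughly 10-second rate limit, and takes many hours for the full arXiv):

```
# Sketch: harvest all arXiv metadata, then reload it as a list of dicts
from arxiv_public_data import oai_metadata

oai_metadata.all_of_arxiv()             # streams gzipped JSON lines into DIR_BASE
records = oai_metadata.load_metadata()  # list of dicts, one per article
print(records[0]['title'], records[0]['categories'])
```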
arxiv_public_data/pdfstamp.py
ADDED
@@ -0,0 +1,83 @@
import re

SPACE_DIGIT = r'\s*\d\s*'
SPACE_NUMBER = r'(?:{})+'.format(SPACE_DIGIT)
SPACE_CHAR = r'\s*[a-zA-Z\.-]\s*'
SPACE_WORD = r'(?:{})+'.format(SPACE_CHAR)

# old style ID, 7 digits in a row
RE_NUM_OLD = SPACE_DIGIT*7

# new style ID, 4 digits, ., 4,5 digits
RE_NUM_NEW = (
    SPACE_DIGIT*4 +
    r'\.' +
    SPACE_DIGIT*4 + r'(?:{})?'.format(SPACE_DIGIT)
)

# the version part v1 V2 v 1, etc
RE_VERSION = r'(?:\s*[vV]\s*\d+\s*)?'

# the word arxiv, as printed by the autotex, arXiv
RE_ARXIV = r'\s*a\s*r\s*X\s*i\s*v\s*:\s*'

# any words within square brackets [cs.A I]
RE_CATEGORIES = r'\[{}\]'.format(SPACE_WORD)

# two digit date, month, year "29 Jan 2012"
RE_DATE = SPACE_NUMBER + SPACE_WORD + r'(?:{}){}'.format(SPACE_DIGIT, '{2,4}')

# the full identifier for the banner
RE_ARXIV_ID = (
    RE_ARXIV +
    r'(?:' +
    r'(?:{})|(?:{})'.format(RE_NUM_NEW, RE_NUM_OLD) +
    r')' +
    RE_VERSION +
    RE_CATEGORIES +
    RE_DATE
)

REGEX_ARXIV_ID = re.compile(RE_ARXIV_ID)


def _extract_arxiv_stamp(txt):
    """
    Find location of stamp within the text and remove that section
    """
    match = REGEX_ARXIV_ID.search(txt)

    if not match:
        return txt, ''

    s, e = match.span()
    return '{} {}'.format(txt[:s].strip(), txt[e:].strip()), txt[s:e].strip()


def remove_stamp(txt, split=1000):
    """
    Given full text, remove the stamp placed in the pdf by arxiv itself. This
    deserves a bit of consideration since the stamp often becomes mangled by
    the text extraction tool (i.e. hard to find and replace) and can be
    reversed.

    Parameters
    ----------
    txt : string
        The full text of a document

    Returns
    -------
    out : string
        Full text without stamp
    """
    t0, t1 = txt[:split], txt[split:]
    txt0, stamp0 = _extract_arxiv_stamp(t0)
    txt1, stamp1 = _extract_arxiv_stamp(t0[::-1])

    if stamp0:
        return txt0 + t1
    elif stamp1:
        return txt1[::-1] + t1
    else:
        return txt
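
Because `remove_stamp` is pure string processing, it can be checked in isolation; the input below is a made-up example of a mangled banner:

```
# Sketch: strip a (possibly mangled) arXiv banner from extracted text
from arxiv_public_data.pdfstamp import remove_stamp

txt = 'a r X i v : 1 6 1 0 . 0 7 8 7 7 v1 [cs.A I] 2 5 Oct 2016 Title of the paper ...'
print(remove_stamp(txt))   # the banner portion should be gone
```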
arxiv_public_data/regex_arxiv.py
ADDED
@@ -0,0 +1,195 @@
"""
regex_arxiv.py

author: Matt Bierbaum
date: 2019-03-14

RegEx patterns for finding arXiv id citations in fulltext articles.
"""

import re

# These are all the primary categories present in the OAI ArXiv metadata
CATEGORIES = [
    "acc-phys", "adap-org", "alg-geom", "ao-sci", "astro-ph", "atom-ph",
    "bayes-an", "chao-dyn", "chem-ph", "cmp-lg", "comp-gas", "cond-mat", "cs",
    "dg-ga", "funct-an", "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th",
    "math", "math-ph", "mtrl-th", "nlin", "nucl-ex", "nucl-th", "patt-sol",
    "physics", "plasm-ph", "q-alg", "q-bio", "quant-ph", "solv-int",
    "supr-con", "eess", "econ", "q-fin", "stat"
]

# All subcategories with more than 2 capital letters (not SG, SI, SP, etc)
SUB_CATEGORIES = [
    'acc-ph', 'ao-ph', 'app-ph', 'atm-clus', 'atom-ph', 'bio-ph', 'chem-ph',
    'class-ph', 'comp-ph', 'data-an', 'dis-nn', 'ed-ph', 'flu-dyn', 'gen-ph',
    'geo-ph', 'hist-ph', 'ins-det', 'med-ph', 'mes-hall', 'mtrl-sci', 'optics',
    'other', 'plasm-ph', 'pop-ph', 'quant-gas', 'soc-ph', 'soft', 'space-ph',
    'stat-mech', 'str-el', 'supr-con'
]

__all__ = (
    'REGEX_ARXIV_SIMPLE',
    'REGEX_ARXIV_STRICT',
    'REGEX_ARXIV_FLEXIBLE'
)

dashdict = {c.replace('-', ''): c for c in CATEGORIES if '-' in c}
dashdict.update({c.replace('-', ''): c for c in SUB_CATEGORIES if '-' in c})

REGEX_VERSION_SPLITTER = re.compile(r'([vV][1-9]\d*)')

def strip_version(name):
    """ 1501.21981v1 -> 1501.21981 """
    return REGEX_VERSION_SPLITTER.split(name)[0]

def format_cat(name):
    """ Strip subcategory, add hyphen to category name if missing """
    if '/' in name:  # OLD ID, name contains subcategory
        catsubcat, aid = name.split('/')
        cat = catsubcat.split('.')[0]
        return dashdict.get(cat, cat) + "/" + aid
    else:
        return name

def zeropad_1501(name):
    """ Arxiv IDs after yymm=1501 are padded to 5 zeros """
    if not '/' in name:  # new ID
        yymm, num = name.split('.')
        if int(yymm) > 1500 and len(num) < 5:
            return yymm + ".0" + num
    return name

def clean(name):
    """ Correct common errors in ArXiv IDs to improve matching """
    funcs = [strip_version, format_cat, zeropad_1501]
    for func in funcs:
        name = func(name)
    return name

# A common typo is to exclude the hyphen in the category.
categories = list(set(CATEGORIES + [cat.replace('-', '') for cat in
                                    CATEGORIES]))
subcategories = list(set(SUB_CATEGORIES + [cat.replace('-', '') for cat in
                                           SUB_CATEGORIES]))

# capture possible minor categories
RE_CATEGORIES = r'(?:{})(?:(?:[.][A-Z]{{2}})|(?:{}))?'.format(
    r'|'.join(categories), r'|'.join(subcategories)
)

# valid YYMM date, NOT preceded by any digits
# NOTE: at the date of writing, it is 2019, so we do not allow
# proper dates for YY 20 or larger
RE_DATE = r'(?:(?:[0-1][0-9])|(?:9[1-9]))(?:0[1-9]|1[0-2])'
RE_VERSION = r'(?:[vV][1-9]\d*)?'

# =============================================================================
RE_NUM_NEW = RE_DATE + r'(?:[.]\d{4,5})' + RE_VERSION
RE_NUM_OLD = RE_DATE + r'(?:\d{3})' + RE_VERSION

# matches: 1612.00001 1203.0023v2
RE_ID_NEW = r'(?:{})'.format(RE_NUM_NEW)

# matches: hep-th/11030234 cs/0112345v2 cs.AI/0112345v2
RE_ID_OLD = r'(?:{}/{})'.format(RE_CATEGORIES, RE_NUM_OLD)

# =============================================================================
# matches: https://arxiv.org/abs/ abs/ arxiv.org/abs/
#          3. e-print: eprints
RE_PREFIX_URL = (
    r'(?:'
    r'(?i:http[s]?\://)?'  # we could have a url prefix
    r'(?i:arxiv\.org/)?'   # maybe with the arxiv.org bit
    r'(?i:abs/|pdf/)'      # at least it has the abs/ part
    r')'
)

# matches: arXiv: arxiv/ arxiv
RE_PREFIX_ARXIV = r'(?i:arxiv\s*[:/\s,.]*\s*)'

# matches: cs.AI/ cs.AI nucl-th
RE_PREFIX_CATEGORIES = r'(?i:{})'.format(RE_CATEGORIES)

# matches: e-prints: e-print eprints:
RE_PREFIX_EPRINT = r'(?i:e[-]?print[s]?.{1,3})'

# =============================================================================
# matches simple old or new identifiers, no fancy business
REGEX_ARXIV_SIMPLE = r'(?:{}|{})'.format(RE_ID_OLD, RE_ID_NEW)

# this one follows the guide set forth by:
#   https://arxiv.org/help/arxiv_identifier
REGEX_ARXIV_STRICT = (
    r'(?:{})'.format(RE_PREFIX_ARXIV) +
    r'(?:'
    r'({})'.format(RE_ID_OLD) +
    r'|'
    r'({})'.format(RE_ID_NEW) +
    r')'
)

# this regex essentially accepts anything that looks like an arxiv id and has
# the slightest smell of being one as well. that is, if it is an id and
# mentions anything about the arxiv beforehand, then it is an id.
REGEX_ARXIV_FLEXIBLE = (
    r'(?:'
    r'({})'.format(REGEX_ARXIV_SIMPLE) +  # capture
    r')|(?:'
    r'(?:'
    r'(?:{})?'.format(RE_PREFIX_URL) +
    r'(?:{})?'.format(RE_PREFIX_EPRINT) +
    r'(?:'
    r'(?:{})?'.format(RE_PREFIX_ARXIV) +
    r'({})'.format(RE_ID_OLD) +  # capture
    r'|'
    r'(?:{})'.format(RE_PREFIX_ARXIV) +
    r'(?:{}/)?'.format(RE_CATEGORIES) +
    r'({})'.format(RE_ID_NEW) +  # capture
    r')'
    r')'
    r'|'
    r'(?:'
    r'(?:{})|'.format(RE_PREFIX_URL) +
    r'(?:{})|'.format(RE_PREFIX_EPRINT) +
    r'(?:{})|'.format(RE_PREFIX_CATEGORIES) +
    r'(?:{})'.format(RE_PREFIX_ARXIV) +
    r')'
    r'.*?'
    r'({})'.format(REGEX_ARXIV_SIMPLE) +  # capture
    r')|(?:'
    r'(?:[\[\(]\s*)'
    r'({})'.format(REGEX_ARXIV_SIMPLE) +  # capture
    r'(?:\s*[\]\)])'
    r')'
)

TEST_POSITIVE = [
    'arXiv:quant-ph 1503.01017v3',
    'math. RT/0903.2992',
    'arXiv, 1511.03262',
    'tions. arXiv preprint arXiv:1607.00021, 2016',
    'Math. Phys. 255, 577 (2005), hep-th/0306165',
    'Kuzovlev, arXiv:cond-mat/9903350 ',
    'arXiv:math.RT/1206.5933,',
    'arXiv e-prints 1306.1595',
    'ays, JHEP 07 (2009) 055, [ 0903.0883]',
    ' Rev. D71 (2005) 063534, [ astro-ph/0501562]',
    'e-print arXiv:1506.02215v1',
    'available at: http://arxiv.org/abs/1511.08977',
    'arXiv e-print: 1306.2144',
    'Preprint arXiv:math/0612139',
    'Vertices in a Digraph. arXiv preprint 1602.02129 ',
    'cond-mat/0309488.'
    'decays, 1701.01871 LHCB-PAPE',
    'Distribution. In: 1404.2485v3 (2015)',
    '113005 (2013), 1307.4331,',
    'scalar quantum 1610.07877v1',
    'cond-mat/0309488.'
    'cond-mat/0309488.8383'
]

TEST_NEGATIVE = [
    'doi: 10.1145/ 321105.321114 ',
    'doi: 10.1145/ 1105.321114 ',
]
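
The bundled test strings make it easy to exercise the flexible pattern; a sketch that mirrors what `internal_citations.extract_references` does per line:

```
# Sketch: run the flexible pattern over the positive test cases
import re
from arxiv_public_data.regex_arxiv import REGEX_ARXIV_FLEXIBLE, TEST_POSITIVE, clean

pattern = re.compile(REGEX_ARXIV_FLEXIBLE)
for line in TEST_POSITIVE:
    # findall returns one tuple of capture groups per match; keep non-empty groups
    ids = {clean(g) for match in pattern.findall(line) for g in match if g}
    print(line, '->', ids)
```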
arxiv_public_data/s3_bulk_download.py
ADDED
@@ -0,0 +1,397 @@
"""
s3_bulk_download.py

authors: Matt Bierbaum and Colin Clement
date: 2019-02-27

This module uses AWS to request a signed key url, which requests files
from the ArXiv S3 bucket. It then unpacks and converts the pdfs into text.

Note that at the time of writing the ArXiv manifest, it contains 1.15 TB
of PDFs, which would cost $103 to receive from AWS S3.

see: https://arxiv.org/help/bulk_data_s3

Usage
-----

Set DIR_FULLTEXT as the directory where the text parsed from pdfs should be placed.
Set DIR_PDFTARS as the directory where the raw pdf tars should be placed.

```
import arxiv_public_data.s3_bulk_download as s3

# Download manifest file (or load if already downloaded)
>>> manifest = s3.get_manifest()

# Download tar files and convert pdf to text
# Costs money! Will only download if it does not find files
>>> s3.process_manifest_files(manifest)

# If you just want to download the PDFs and not convert to text use
>>> s3.download_check_tarfiles(manifest)
```
"""

import os
import re
import gzip
import json
import glob
import shlex
import shutil
import tarfile
import boto3
import hashlib
import requests
import subprocess

from functools import partial
from multiprocessing import Pool
from collections import defaultdict
import xml.etree.ElementTree as ET

from arxiv_public_data import fulltext
from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER

logger = LOGGER.getChild('s3')

CHUNK_SIZE = 2**20  # 1MB
BUCKET_NAME = 'arxiv'
S3_PDF_MANIFEST = 'pdf/arXiv_pdf_manifest.xml'
S3_TEX_MANIFEST = 'src/arXiv_src_manifest.xml'
HEADERS = {'x-amz-request-payer': 'requester'}

s3 = boto3.client('s3', region_name='us-east-1')

def download_file(filename, outfile, chunk_size=CHUNK_SIZE, redownload=False,
                  dryrun=False):
    """
    Downloads filename from the ArXiv AWS S3 bucket, and returns streaming md5
    sum of the content
    Parameters
    ----------
    filename : str
        KEY corresponding to AWS bucket file
    outfile : str
        name and path of local file in which downloaded file will be stored
    (optional)
    chunk_size : int
        requests byte streaming size (so 500MB are not stored in memory
        prior to processing)
    redownload : bool
        Look to see if file is already downloaded, and simply return md5sum
        if it exists, unless redownload is True
    dryrun : bool
        If True, only log activity
    Returns
    -------
    md5sum : str
        md5 checksum of the contents of filename
    """
    if os.path.exists(outfile) and not redownload:
        md5 = hashlib.md5()
        md5.update(gzip.open(outfile, 'rb').read())
        return md5.hexdigest()

    md5 = hashlib.md5()
    url = s3.generate_presigned_url(
        "get_object",
        Params={
            "Bucket": BUCKET_NAME, "Key": filename, "RequestPayer": 'requester'
        }
    )
    if not dryrun:
        logger.info('Requesting "{}" (costs money!)'.format(filename))
        request = requests.get(url, stream=True)
        response_iter = request.iter_content(chunk_size=chunk_size)
        logger.info("\t Writing {}".format(outfile))
        with gzip.open(outfile, 'wb') as fout:
            for i, chunk in enumerate(response_iter):
                fout.write(chunk)
                md5.update(chunk)
    else:
        logger.info('Requesting "{}" (free!)'.format(filename))
        logger.info("\t Writing {}".format(outfile))
    return md5.hexdigest()

def default_manifest_filename():
    return os.path.join(DIR_PDFTARS, 'arxiv-manifest.xml.gz')

def get_manifest(filename=None, redownload=False):
    """
    Get the file manifest for the ArXiv
    Parameters
    ----------
    redownload : bool
        If true, forces redownload of manifest even if it exists
    Returns
    -------
    file_information : list of dicts
        each dict contains the file metadata
    """
    manifest_file = filename or default_manifest_filename()
    md5 = download_file(
        S3_PDF_MANIFEST, manifest_file, redownload=redownload, dryrun=False
    )
    manifest = gzip.open(manifest_file, 'rb').read()
    return parse_manifest(manifest)

def parse_manifest(manifest):
    """
    Parse the XML of the ArXiv manifest file.

    Parameters
    ----------
    manifest : str
        xml string from the ArXiv manifest file

    Returns
    -------
    file_information : list of dicts
        One dict for each file, containing the filename, size, md5sum,
        and other metadata
    """
    root = ET.fromstring(manifest)
    return [
        {c.tag: f.find(c.tag).text for c in f.getchildren()}
        for f in root.findall('file')
    ]

def _tar_to_filename(filename):
    return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz'

def download_check_tarfile(filename, md5_expected, dryrun=False, redownload=False):
    """ Download filename, check its md5sum, and form the output path """
    outname = _tar_to_filename(filename)
    md5_downloaded = download_file(
        filename, outname, dryrun=dryrun, redownload=redownload
    )

    if not dryrun:
        if md5_expected != md5_downloaded:
            msg = "MD5 '{}' does not match expected '{}' for file '{}'".format(
                md5_downloaded, md5_expected, filename
            )
            raise AssertionError(msg)

    return outname

def download_check_tarfiles(list_of_fileinfo, dryrun=False):
    """
    Download tar files from the ArXiv manifest and check that their MD5sums
    match

    Parameters
    ----------
    list_of_fileinfo : list
        Some elements of results of get_manifest
    (optional)
    dryrun : bool
        If True, only log activity
    """
    for fileinfo in list_of_fileinfo:
        download_check_tarfile(fileinfo['filename'], fileinfo['md5sum'], dryrun=dryrun)

def _call(cmd, dryrun=False, debug=False):
    """ Spawn a subprocess and execute the string in cmd """
    if dryrun:
        logger.info(cmd)
        return 0
    else:
        return subprocess.check_call(
            shlex.split(cmd), stderr=None if debug else open(os.devnull, 'w')
        )

def _make_pathname(filename):
    """
    Make filename path for text document, sorted like on arXiv servers.
    Parameters
    ----------
    filename : str
        string filename of arXiv article
    (optional)
    Returns
    -------
    pathname : str
        pathname in which to store the article following
            * Old ArXiv IDs: e.g. hep-ph0001001.txt returns
                DIR_FULLTEXT/hep-ph/0001/hep-ph0001001.txt
            * New ArXiv IDs: e.g. 1501.13851.txt returns
                DIR_FULLTEXT/arxiv/1501/1501.13851.txt
    """
    basename = os.path.basename(filename)
    fname = os.path.splitext(basename)[0]
    if '.' in fname:  # new style ArXiv ID
        yearmonth = fname.split('.')[0]
        return os.path.join(DIR_FULLTEXT, 'arxiv', yearmonth, basename)
    # old style ArXiv ID
    cat, aid = re.split(r'(\d+)', fname)[:2]
    yearmonth = aid[:4]
    return os.path.join(DIR_FULLTEXT, cat, yearmonth, basename)

def process_tarfile_inner(filename, pdfnames=None, processes=1, dryrun=False,
                          timelimit=fulltext.TIMELIMIT):
    outname = _tar_to_filename(filename)

    if not os.path.exists(outname):
        msg = 'Tarfile from manifest not found {}, skipping...'.format(outname)
        logger.error(msg)
        return

    # unpack tar file
    if pdfnames:
        namelist = ' '.join(pdfnames)
        cmd = 'tar --one-top-level -C {} -xf {} {}'
        cmd = cmd.format(DIR_PDFTARS, outname, namelist)
    else:
        cmd = 'tar --one-top-level -C {} -xf {}'.format(DIR_PDFTARS, outname)
    _call(cmd, dryrun)

    basename = os.path.splitext(os.path.basename(filename))[0]
    pdfdir = os.path.join(DIR_PDFTARS, basename, basename.split('_')[2])

    # Run fulltext to convert pdfs in tardir into *.txt
    converts = fulltext.convert_directory_parallel(
        pdfdir, processes=processes, timelimit=timelimit
    )

    # move txt into final file structure
    txtfiles = glob.glob('{}/*.txt'.format(pdfdir))
    for tf in txtfiles:
        mvfn = _make_pathname(tf)
        dirname = os.path.dirname(mvfn)
        if not os.path.exists(dirname):
            _call('mkdir -p {}'.format(dirname), dryrun)

        if not dryrun:
            shutil.move(tf, mvfn)

    # clean up pdfs
    _call('rm -rf {}'.format(os.path.join(DIR_PDFTARS, basename)), dryrun)

def process_tarfile(fileinfo, pdfnames=None, dryrun=False, debug=False, processes=1):
    """
    Download and process one of the tar files from the ArXiv manifest.
    Download, unpack, and spawn the Docker image for converting pdf2text.
    It will only try to download the file if it does not already exist.

    The tar file will be stored in DIR_PDFTARS/<fileinfo[filename](tar)> and the
    resulting arXiv articles will be stored in the subdirectory
    DIR_FULLTEXT/arxiv/<yearmonth>/<aid>.txt for new style arXiv IDs and
    DIR_FULLTEXT/<category>/<yearmonth>/<aid>.txt for old style arXiv IDs.

    Parameters
    ----------
    fileinfo : dict
        dictionary of file information from parse_manifest
    (optional)
    dryrun : bool
        If True, only log activity
    debug : bool
        Silence stderr of Docker _call if debug is False
    """
    filename = fileinfo['filename']
    md5sum = fileinfo['md5sum']

    if check_if_any_processed(fileinfo):
        logger.info('Tar file appears processed, skipping {}...'.format(filename))
        return

    logger.info('Processing tar "{}" ...'.format(filename))
    process_tarfile_inner(filename, pdfnames=pdfnames, processes=processes, dryrun=dryrun)

def process_manifest_files(list_of_fileinfo, processes=1, dryrun=False):
    """
    Download PDFs from the ArXiv AWS S3 bucket and convert each pdf to text.
    If files are already downloaded, it will only process them.

    Parameters
    ----------
    list_of_fileinfo : list
        Some elements of results of get_manifest
    (optional)
    processes : int
        number of parallel workers to spawn (roughly as many CPUs as you have)
    dryrun : bool
        If True, only log activity
    """
    for fileinfo in list_of_fileinfo:
        process_tarfile(fileinfo, dryrun=dryrun, processes=processes)

def check_if_any_processed(fileinfo):
    """
    Spot check a tarfile to see if the pdfs have been converted to text,
    given an element of the s3 manifest
    """
    first = _make_pathname(fileinfo['first_item'] + '.txt')
    last = _make_pathname(fileinfo['last_item'] + '.txt')
    return os.path.exists(first) and os.path.exists(last)

def generate_tarfile_indices(manifest):
    """
    Go through the manifest and for every tarfile, get a list of the PDFs
    that should be contained within it. This is a separate function because
    even checking the tars is rather slow.

    Returns
    -------
    index : dictionary
        keys: tarfile, values: list of pdfs
    """
    index = {}

    for fileinfo in manifest:
        name = fileinfo['filename']
        logger.info("Indexing {}...".format(name))

        tarname = os.path.join(DIR_PDFTARS, os.path.basename(name)) + '.gz'
        files = [i for i in tarfile.open(tarname).getnames() if i.endswith('.pdf')]

        index[name] = files
    return index

def check_missing_txt_files(index):
    """
    Use the index file from `generate_tarfile_indices` to check which pdf->txt
    conversions are outstanding.
    """
    missing = defaultdict(list)
    for tar, pdflist in index.items():
        logger.info("Checking {}...".format(tar))
        for pdf in pdflist:
            txt = _make_pathname(pdf).replace('.pdf', '.txt')

            if not os.path.exists(txt):
                missing[tar].append(pdf)

    return missing

def rerun_missing(missing, processes=1):
    """
    Use the output of `check_missing_txt_files` to attempt to rerun the text
    files which are missing from the conversion. There are various reasons
    that they can fail.
    """
    sort = list(reversed(
        sorted([(k, v) for k, v in missing.items()], key=lambda x: len(x[1]))
    ))

    for tar, names in sort:
        logger.info("Running {} ({} to do)...".format(tar, len(names)))
        process_tarfile_inner(
            tar, pdfnames=names, processes=processes,
            timelimit=5 * fulltext.TIMELIMIT
        )

def process_missing(manifest, processes=1):
    """
    Do the full process of figuring what is missing and running them
    """
    indexfile = os.path.join(DIR_PDFTARS, 'manifest-index.json')

    if not os.path.exists(indexfile):
        index = generate_tarfile_indices(manifest)
        json.dump(index, open(indexfile, 'w'))

    index = json.load(open(indexfile))
    missing = check_missing_txt_files(index)
    rerun_missing(missing, processes=processes)
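
Since the S3 bucket is requester-pays, a dry run is a sensible first step; a sketch (it assumes AWS credentials are configured, fetching the manifest itself is a small paid download, and the `size` field comes from the arXiv manifest entries):

```
# Sketch: inspect the manifest and dry-run a couple of tar downloads
import arxiv_public_data.s3_bulk_download as s3

manifest = s3.get_manifest()    # downloaded once, then read from the local cache
total_gb = sum(int(f['size']) for f in manifest) / 1e9
print('{} tar files, ~{:.0f} GB of PDFs'.format(len(manifest), total_gb))

s3.download_check_tarfiles(manifest[:2], dryrun=True)   # log-only, nothing is fetched
```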
arxiv_public_data/slice_pdfs.py
ADDED
@@ -0,0 +1,93 @@
import os
import json
import subprocess
import shlex
from collections import defaultdict

from arxiv_public_data.config import DIR_FULLTEXT, DIR_PDFTARS, LOGGER

def id_to_tarpdf(n):
    if '.' in n:
        ym = n.split('.')[0]
        return '{}/{}.pdf'.format(ym, n)
    else:
        ym = n.split('/')[1][:4]
        return '{}/{}.pdf'.format(ym, n.replace('/', ''))

def _call(cmd, dryrun=False, debug=False):
    """ Spawn a subprocess and execute the string in cmd """
    return subprocess.check_call(
        shlex.split(cmd), stderr=None if debug else open(os.devnull, 'w')
    )

def _tar_to_filename(filename):
    return os.path.join(DIR_PDFTARS, os.path.basename(filename)) + '.gz'

def extract_files(tarfile, pdfs, outdir):
    """
    Extract the list of `pdfs` filenames from `tarfile` into the `outdir`
    """
    filename = tarfile
    namelist = ' '.join([id_to_tarpdf(i) for i in pdfs])

    outname = _tar_to_filename(filename)
    basename = os.path.splitext(os.path.basename(filename))[0]
    tdir = os.path.join(DIR_PDFTARS, basename)
    outpdfs = ' '.join([os.path.join(tdir, id_to_tarpdf(i)) for i in pdfs])

    cmd0 = 'tar --one-top-level -C {} -xf {} {}'.format(DIR_PDFTARS, outname, namelist)
    cmd1 = 'cp -a {} {}'.format(outpdfs, outdir)
    cmd2 = 'rm -rf {}'.format(tdir)

    _call(cmd0)
    _call(cmd1)
    _call(cmd2)

def call_list(ai, manifest):
    """
    Convert a list of articles and the tar manifest into a dictionary
    of the tarfiles and the pdfs needed from them.
    """
    inv = {}
    for tar, pdfs in manifest.items():
        for pdf in pdfs:
            inv[pdf] = tar

    tars = defaultdict(list)
    num = 0
    for i in ai:
        aid = i.get('id')

        tar = id_to_tarpdf(aid)
        if not tar in inv:
            continue
        tars[inv[id_to_tarpdf(aid)]].append(aid)

    return tars

def extract_by_filter(oai, tarmanifest, func, outdir):
    """
    User-facing function that extracts a slice of articles from
    the entire arxiv.

    Parameters
    ----------
    oai : list of dicts
        The OAI metadata from `oai_metadata.load_metadata`

    tarmanifest : dict
        Mapping of tarfile -> list of contained PDFs, as produced by
        `s3_bulk_download.generate_tarfile_indices`

    func : function
        Filter to apply to OAI metadata to get list of articles

    outdir : string
        Directory in which to place the PDFs and metadata for the slice
    """
    articles = func(oai)
    tarmap = call_list(articles, tarmanifest)

    for tar, pdfs in tarmap.items():
        extract_files(tar, pdfs, outdir=outdir)

    with open(os.path.join(outdir, 'metadata.json'), 'w') as f:
        json.dump(articles, f)
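
A sketch of slicing out one category, assuming the metadata has been harvested and the PDF tar files are already on disk; the filter function and output directory below are illustrative only (note that `call_list` iterates a tarfile -> PDF-list mapping, i.e. the index from `s3_bulk_download.generate_tarfile_indices`, not the raw manifest list):

```
# Sketch: extract the PDFs of one category from the bulk tar files
from arxiv_public_data import oai_metadata, s3_bulk_download, slice_pdfs

oai = oai_metadata.load_metadata()
index = s3_bulk_download.generate_tarfile_indices(s3_bulk_download.get_manifest())

def hep_th_only(records):
    return [r for r in records
            if any(c and c.startswith('hep-th') for c in (r.get('categories') or []))]

slice_pdfs.extract_by_filter(oai, index, hep_th_only, '/data/hep-th-slice')
```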
arxiv_public_data/tex2utf.py
ADDED
@@ -0,0 +1,206 @@
1 |
+
# https://github.com/arXiv/arxiv-base@32e6ad0
|
2 |
+
"""
|
3 |
+
Copyright 2017 Cornell University
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6 |
+
this software and associated documentation files (the "Software"), to deal in
|
7 |
+
the Software without restriction, including without limitation the rights to
|
8 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
9 |
+
of the Software, and to permit persons to whom the Software is furnished to do
|
10 |
+
so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
22 |
+
"""
|
23 |
+
|
24 |
+
"""Convert between TeX escapes and UTF8."""
|
25 |
+
import re
|
26 |
+
from typing import Pattern, Dict, Match
|
27 |
+
|
28 |
+
accents = {
|
29 |
+
# first accents with non-letter prefix, e.g. \'A
|
30 |
+
"'A": 0x00c1, "'C": 0x0106, "'E": 0x00c9, "'I": 0x00cd,
|
31 |
+
"'L": 0x0139, "'N": 0x0143, "'O": 0x00d3, "'R": 0x0154,
|
32 |
+
"'S": 0x015a, "'U": 0x00da, "'Y": 0x00dd, "'Z": 0x0179,
|
33 |
+
"'a": 0x00e1, "'c": 0x0107, "'e": 0x00e9, "'i": 0x00ed,
|
34 |
+
"'l": 0x013a, "'n": 0x0144, "'o": 0x00f3, "'r": 0x0155,
|
35 |
+
"'s": 0x015b, "'u": 0x00fa, "'y": 0x00fd, "'z": 0x017a,
|
36 |
+
'"A': 0x00c4, '"E': 0x00cb, '"I': 0x00cf, '"O': 0x00d6,
|
37 |
+
'"U': 0x00dc, '"Y': 0x0178, '"a': 0x00e4, '"e': 0x00eb,
|
38 |
+
'"i': 0x00ef, '"o': 0x00f6, '"u': 0x00fc, '"y': 0x00ff,
|
39 |
+
'.A': 0x0226, '.C': 0x010a, '.E': 0x0116, '.G': 0x0120,
|
40 |
+
'.I': 0x0130, '.O': 0x022e, '.Z': 0x017b, '.a': 0x0227,
|
41 |
+
'.c': 0x010b, '.e': 0x0117, '.g': 0x0121, '.o': 0x022f,
|
42 |
+
'.z': 0x017c, '=A': 0x0100, '=E': 0x0112, '=I': 0x012a,
|
43 |
+
'=O': 0x014c, '=U': 0x016a, '=Y': 0x0232, '=a': 0x0101,
|
44 |
+
'=e': 0x0113, '=i': 0x012b, '=o': 0x014d, '=u': 0x016b,
|
45 |
+
'=y': 0x0233, '^A': 0x00c2, '^C': 0x0108, '^E': 0x00ca,
|
46 |
+
'^G': 0x011c, '^H': 0x0124, '^I': 0x00ce, '^J': 0x0134,
|
47 |
+
'^O': 0x00d4, '^S': 0x015c, '^U': 0x00db, '^W': 0x0174,
|
48 |
+
'^Y': 0x0176, '^a': 0x00e2, '^c': 0x0109, '^e': 0x00ea,
|
49 |
+
'^g': 0x011d, '^h': 0x0125, '^i': 0x00ee, '^j': 0x0135,
|
50 |
+
'^o': 0x00f4, '^s': 0x015d, '^u': 0x00fb, '^w': 0x0175,
|
51 |
+
    '^y': 0x0177, '`A': 0x00c0, '`E': 0x00c8, '`I': 0x00cc,
    '`O': 0x00d2, '`U': 0x00d9, '`a': 0x00e0, '`e': 0x00e8,
    '`i': 0x00ec, '`o': 0x00f2, '`u': 0x00f9, '~A': 0x00c3,
    '~I': 0x0128, '~N': 0x00d1, '~O': 0x00d5, '~U': 0x0168,
    '~a': 0x00e3, '~i': 0x0129, '~n': 0x00f1, '~o': 0x00f5,
    '~u': 0x0169,
    # and now ones with letter prefix \c{c} etc..
    'HO': 0x0150, 'HU': 0x0170, 'Ho': 0x0151, 'Hu': 0x0171,
    'cC': 0x00c7, 'cE': 0x0228,
    'cG': 0x0122, 'cK': 0x0136, 'cL': 0x013b, 'cN': 0x0145,
    'cR': 0x0156, 'cS': 0x015e, 'cT': 0x0162, 'cc': 0x00e7,
    'ce': 0x0229, 'cg': 0x0123, 'ck': 0x0137, 'cl': 0x013c,
    # Commented out due to ARXIVDEV-2322 (bug reported by PG)
    # 'ci' : 'i\x{0327}' = chr(0x69).ch(0x327) # i with combining cedilla
    'cn': 0x0146, 'cr': 0x0157, 'cs': 0x015f, 'ct': 0x0163,
    'kA': 0x0104, 'kE': 0x0118, 'kI': 0x012e, 'kO': 0x01ea,
    'kU': 0x0172, 'ka': 0x0105, 'ke': 0x0119, 'ki': 0x012f,
    'ko': 0x01eb, 'ku': 0x0173, 'rA': 0x00c5, 'rU': 0x016e,
    'ra': 0x00e5, 'ru': 0x016f, 'uA': 0x0102, 'uE': 0x0114,
    'uG': 0x011e, 'uI': 0x012c, 'uO': 0x014e, 'uU': 0x016c,
    'ua': 0x0103, 'ue': 0x0115, 'ug': 0x011f,
    'ui': 0x012d, 'uo': 0x014f, 'uu': 0x016d,
    'vA': 0x01cd, 'vC': 0x010c, 'vD': 0x010e,
    'vE': 0x011a, 'vG': 0x01e6, 'vH': 0x021e, 'vI': 0x01cf,
    'vK': 0x01e8, 'vL': 0x013d, 'vN': 0x0147, 'vO': 0x01d1,
    'vR': 0x0158, 'vS': 0x0160, 'vT': 0x0164, 'vU': 0x01d3,
    'vZ': 0x017d, 'va': 0x01ce, 'vc': 0x010d, 'vd': 0x010f,
    've': 0x011b, 'vg': 0x01e7, 'vh': 0x021f, 'vi': 0x01d0,
    'vk': 0x01e9, 'vl': 0x013e, 'vn': 0x0148, 'vo': 0x01d2,
    'vr': 0x0159, 'vs': 0x0161, 'vt': 0x0165, 'vu': 0x01d4,
    'vz': 0x017e
}
r"""
Hash to look up TeX markup and convert it to Unicode.

macron: a line above a character (overbar \={} in TeX)
caron: v-shape above a character (\v{ } in TeX)
See: http://www.unicode.org/charts/
"""

textlet = {
    'AA': 0x00c5, 'AE': 0x00c6, 'DH': 0x00d0, 'DJ': 0x0110,
    'ETH': 0x00d0, 'L': 0x0141, 'NG': 0x014a, 'O': 0x00d8,
    'oe': 0x0153, 'OE': 0x0152, 'TH': 0x00de, 'aa': 0x00e5,
    'ae': 0x00e6,
    'dh': 0x00f0, 'dj': 0x0111, 'eth': 0x00f0, 'i': 0x0131,
    'l': 0x0142, 'ng': 0x014b, 'o': 0x00f8, 'ss': 0x00df,
    'th': 0x00fe,
    # Greek (upper)
    'Gamma': 0x0393, 'Delta': 0x0394, 'Theta': 0x0398,
    'Lambda': 0x039b, 'Xi': 0x039E, 'Pi': 0x03a0,
    'Sigma': 0x03a3, 'Upsilon': 0x03a5, 'Phi': 0x03a6,
    'Psi': 0x03a8, 'Omega': 0x03a9,
    # Greek (lower)
    'alpha': 0x03b1, 'beta': 0x03b2, 'gamma': 0x03b3,
    'delta': 0x03b4, 'epsilon': 0x03b5, 'zeta': 0x03b6,
    'eta': 0x03b7, 'theta': 0x03b8, 'iota': 0x03b9,
    'kappa': 0x03ba, 'lambda': 0x03bb, 'mu': 0x03bc,
    'nu': 0x03bd, 'xi': 0x03be, 'omicron': 0x03bf,
    'pi': 0x03c0, 'rho': 0x03c1, 'varsigma': 0x03c2,
    'sigma': 0x03c3, 'tau': 0x03c4, 'upsilon': 0x03c5,
    'varphi': 0x03C6,  # φ
    'phi': 0x03D5,  # ϕ
    'chi': 0x03c7, 'psi': 0x03c8, 'omega': 0x03c9,
}


def _p_to_match(tex_to_chr: Dict[str, int]) -> Pattern:
    # textsym and textlet both use the same sort of regex pattern.
    keys = r'\\(' + '|'.join(tex_to_chr.keys()) + ')'
    pstr = r'({)?' + keys + r'(\b|(?=_))(?(1)}|(\\(?= )| |{}|)?)'
    return re.compile(pstr)


textlet_pattern = _p_to_match(textlet)

textsym = {
    'P': 0x00b6, 'S': 0x00a7, 'copyright': 0x00a9,
    'guillemotleft': 0x00ab, 'guillemotright': 0x00bb,
    'pounds': 0x00a3, 'dag': 0x2020, 'ddag': 0x2021,
    'div': 0x00f7, 'deg': 0x00b0}

textsym_pattern = _p_to_match(textsym)


def _textlet_sub(match: Match) -> str:
    return chr(textlet[match.group(2)])


def _textsym_sub(match: Match) -> str:
    return chr(textsym[match.group(2)])


def texch2UTF(acc: str) -> str:
    """Convert single-character TeX accents to UTF-8.

    Strips non-word characters from any sequence that is not recognized
    (hence it can return an empty string if there are no word characters
    in the input string).

    chr(num) will automatically create a UTF-8 string for a big num.
    """
    if acc in accents:
        return chr(accents[acc])
    else:
        return re.sub(r'[^\w]+', '', acc, flags=re.IGNORECASE)


def tex2utf(tex: str, letters: bool = True) -> str:
    r"""Convert some TeX accents and Greek symbols to UTF-8 characters.

    :param tex: Text to filter.

    :param letters: If False, do not convert Greek letters or
        ligatures. Greek symbols can cause problems, e.g. \phi is not
        supposed to look like φ; φ looks like \varphi. See ARXIVNG-1612.

    :returns: string, possibly with some TeX replaced with UTF-8
    """
    # Do dotless i,j -> plain i,j where they are part of an accented i or j
    utf = re.sub(r"/(\\['`\^\"\~\=\.uvH])\{\\([ij])\}", r"\g<1>\{\g<2>\}", tex)

    # Now work on the TeX sequences, first those with letter-only matches
    if letters:
        utf = textlet_pattern.sub(_textlet_sub, utf)

    utf = textsym_pattern.sub(_textsym_sub, utf)

    utf = re.sub(r'\{\\j\}|\\j\s', 'j', utf)  # not in Unicode?

    # reduce {{x}}, {{{x}}}, ... down to {x}
    while re.search(r'\{\{([^\}]*)\}\}', utf):
        utf = re.sub(r'\{\{([^\}]*)\}\}', r'{\g<1>}', utf)

    # Accents which have a non-letter prefix in TeX, first the \'e form
    utf = re.sub(r'\\([\'`^"~=.][a-zA-Z])',
                 lambda m: texch2UTF(m.group(1)), utf)

    # then the \'{e} form:
    utf = re.sub(r'\\([\'`^"~=.])\{([a-zA-Z])\}',
                 lambda m: texch2UTF(m.group(1) + m.group(2)), utf)

    # Accents which have a letter prefix in TeX
    # \u{x} u above (breve), \v{x} v above (caron), \H{x} double acute...
    utf = re.sub(r'\\([Hckoruv])\{([a-zA-Z])\}',
                 lambda m: texch2UTF(m.group(1) + m.group(2)), utf)

    # Don't do \t{oo} yet,
    utf = re.sub(r'\\t{([^\}])\}', r'\g<1>', utf)

    # bdc34: commented out in original Perl
    # $utf =~ s/\{(.)\}/$1/g; # remove { } from around {x}

    return utf
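A quick usage sketch for the converter above (not part of the committed file). It assumes the package is importable from the repo root and that the accents table, whose start lies outside the shown hunk, contains the usual `\"o` and similar entries alongside the forms visible here:

```
from arxiv_public_data.tex2utf import tex2utf

# Accents map to single Unicode code points: \"o -> ö (accents table),
# \H{o} -> ő ('Ho' above); Greek macros go through the textlet table.
print(tex2utf(r'Schr\"odinger and Erd\H{o}s, $\alpha$-stability'))
# expected: Schrödinger and Erdős, $α$-stability

# letters=False skips the Greek/ligature table, so \alpha is left as TeX.
print(tex2utf(r'$\alpha$-stability', letters=False))
# expected: $\alpha$-stability
```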
logo.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,22 @@
boto3==1.9.118
requests==2.20.0
unicodedata2
https://github.com/jaepil/pdfminer3k/archive/1.0.4.zip
sentence-transformers
pdftotext
arxiv
arxiv2bib
scholarly
PyMuPDF
Pillow
tabula-py
sentencepiece
keybert
spacy[all]
scispacy
amrlib
transformers # >2.2.0
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
bert-extractive-summarizer
streamlit
setup.py
ADDED
@@ -0,0 +1,89 @@
import setuptools

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="Auto-Research",
    version="1.0",
    author="Sidharth Pal",
    author_email="sidharth.pal1992@gmail.com",
    description="Generate a scientific survey with just a query",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/sidphbot/Auto-Research",
    project_urls={
        "Docs": "https://github.com/example/example/README.md",
        "Bug Tracker": "https://github.com/sidphbot/Auto-Research/issues",
        "Demo": "https://www.kaggle.com/sidharthpal/auto-research-generate-survey-from-query",
    },
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
        "Environment :: Other Environment",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Other Audience",
        "Topic :: Education",
        "Topic :: Education :: Computer Aided Instruction (CAI)",
        "Topic :: Scientific/Engineering",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Scientific/Engineering :: Information Analysis",
        "Topic :: Scientific/Engineering :: Medical Science Apps.",
        "Topic :: Scientific/Engineering :: Physics",
        "Natural Language :: English",
        "License :: OSI Approved :: GNU General Public License (GPL)",
        "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)",
        "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: POSIX :: Linux",
        "Operating System :: MacOS :: MacOS X",
        "Environment :: GPU",
        "Environment :: GPU :: NVIDIA CUDA",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.6",
    ],
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    python_requires=">=3.7",
    install_requires=[
        "pip",
        "boto3==1.9.118",
        "requests==2.20.0",
        "unicodedata2",
        "pdfminer3k",
        "sentence-transformers",
        "pdftotext",
        "arxiv",
        "arxiv2bib",
        "scholarly",
        "PyMuPDF",
        "Pillow",
        "tabula-py",
        "sentencepiece",
        "keybert",
        "scispacy",
        "amrlib",
        "transformers",
        "en_core_sci_scibert",
        "bert-extractive-summarizer",
        "en_core_sci_lg",
    ],
    extras_require={
        "spacy": ["all"],
    },
    dependency_links=[
        "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz#egg=en_core_sci_scibert-0.5.0",
        "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz#egg=en_core_sci_lg-0.5.0"
    ],
    tests_require=["pytest"],
    entry_points={
        'console_scripts': [
            'cursive = src.Surveyor:main',
        ],
    },

)
src/Auto_Research.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,313 @@
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: Auto-Research
|
3 |
+
Version: 1.0
|
4 |
+
Summary: Generate a scientific survey with just a query
|
5 |
+
Home-page: https://github.com/sidphbot/Auto-Research
|
6 |
+
Author: Sidharth Pal
|
7 |
+
Author-email: sidharth.pal1992@gmail.com
|
8 |
+
License: UNKNOWN
|
9 |
+
Project-URL: Docs, https://github.com/example/example/README.md
|
10 |
+
Project-URL: Bug Tracker, https://github.com/sidphbot/Auto-Research/issues
|
11 |
+
Project-URL: Demo, https://www.kaggle.com/sidharthpal/auto-research-generate-survey-from-query
|
12 |
+
Platform: UNKNOWN
|
13 |
+
Classifier: Development Status :: 5 - Production/Stable
|
14 |
+
Classifier: Environment :: Console
|
15 |
+
Classifier: Environment :: Other Environment
|
16 |
+
Classifier: Intended Audience :: Developers
|
17 |
+
Classifier: Intended Audience :: Education
|
18 |
+
Classifier: Intended Audience :: Science/Research
|
19 |
+
Classifier: Intended Audience :: Other Audience
|
20 |
+
Classifier: Topic :: Education
|
21 |
+
Classifier: Topic :: Education :: Computer Aided Instruction (CAI)
|
22 |
+
Classifier: Topic :: Scientific/Engineering
|
23 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
24 |
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
25 |
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
26 |
+
Classifier: Topic :: Scientific/Engineering :: Physics
|
27 |
+
Classifier: Natural Language :: English
|
28 |
+
Classifier: License :: OSI Approved :: GNU General Public License (GPL)
|
29 |
+
Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)
|
30 |
+
Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)
|
31 |
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
32 |
+
Classifier: Operating System :: POSIX :: Linux
|
33 |
+
Classifier: Operating System :: MacOS :: MacOS X
|
34 |
+
Classifier: Environment :: GPU
|
35 |
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
36 |
+
Classifier: Programming Language :: Python
|
37 |
+
Classifier: Programming Language :: Python :: 3
|
38 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
39 |
+
Classifier: Programming Language :: Python :: 3.6
|
40 |
+
Requires-Python: >=3.7
|
41 |
+
Description-Content-Type: text/markdown
|
42 |
+
Provides-Extra: spacy
|
43 |
+
License-File: LICENSE
|
44 |
+
|
45 |
+
# Auto-Research
|
46 |
+
![Auto-Research][logo]
|
47 |
+
|
48 |
+
[logo]: https://github.com/sidphbot/Auto-Research/blob/main/logo.png
|
49 |
+
A no-code utility to generate a detailed well-cited survey with topic clustered sections (draft paper format) and other interesting artifacts from a single research query.
|
50 |
+
|
51 |
+
Data Provider: [arXiv](https://arxiv.org/) Open Archive Initiative OAI
|
52 |
+
|
53 |
+
Requirements:
|
54 |
+
- python 3.7 or above
|
55 |
+
- poppler-utils - `sudo apt-get install build-essential libpoppler-cpp-dev pkg-config python-dev`
|
56 |
+
- list of requirements in requirements.txt - `cat requirements.txt | xargs pip install`
|
57 |
+
- 8GB disk space
|
58 |
+
- 13GB CUDA (GPU) memory - for a survey of 100 searched papers (max_search) and 25 selected papers (num_papers)
|
59 |
+
|
60 |
+
#### Demo :
|
61 |
+
|
62 |
+
Video Demo : https://drive.google.com/file/d/1-77J2L10lsW-bFDOGdTaPzSr_utY743g/view?usp=sharing
|
63 |
+
|
64 |
+
Kaggle Re-usable Demo : https://www.kaggle.com/sidharthpal/auto-research-generate-survey-from-query
|
65 |
+
|
66 |
+
(`[TIP]` click 'edit and run' to run the demo for your custom queries on a free GPU)
|
67 |
+
|
68 |
+
|
69 |
+
#### Steps to run (pip coming soon):
|
70 |
+
```
|
71 |
+
apt install -y poppler-utils libpoppler-cpp-dev
|
72 |
+
git clone https://github.com/sidphbot/Auto-Research.git
|
73 |
+
|
74 |
+
cd Auto-Research/
|
75 |
+
pip install -r requirements.txt
|
76 |
+
python survey.py [options] <your_research_query>
|
77 |
+
```
|
78 |
+
|
79 |
+
#### Artifacts generated (zipped):
|
80 |
+
- Detailed survey draft paper as txt file
|
81 |
+
- A curated list of top 25+ papers as pdfs and txts
|
82 |
+
- Images extracted from above papers as jpegs, bmps etc
|
83 |
+
- Heading/Section wise highlights extracted from above papers as a re-usable pure python joblib dump
|
84 |
+
- Tables extracted from papers(optional)
|
85 |
+
- Corpus of metadata highlights/text of top 100 papers as a re-usable pure python joblib dump (see the loading sketch after this list)
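
The joblib dumps above are plain Python lists of paper dicts and can be reloaded directly. A minimal sketch (the dump filename comes from the pdf route in `src/Surveyor.py`; the path is a placeholder for wherever the zipped output was extracted):

```
import joblib

papers = joblib.load('arxiv_dumps/papers_extracted_pdf_route.dmp')
for p in papers[:3]:
    # each entry carries the arxiv id plus section-wise highlights
    print(p['id'], [s['heading'] for s in p.get('sections', [])])
```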
|
86 |
+
|
87 |
+
## Example run #1 - python utility
|
88 |
+
|
89 |
+
```
|
90 |
+
python survey.py 'multi-task representation learning'
|
91 |
+
```
|
92 |
+
|
93 |
+
## Example run #2 - python class
|
94 |
+
|
95 |
+
```
|
96 |
+
from survey import Surveyor
|
97 |
+
mysurveyor = Surveyor()
|
98 |
+
mysurveyor.survey('quantum entanglement')
|
99 |
+
```
|
100 |
+
|
101 |
+
### Research tools:
|
102 |
+
|
103 |
+
These are independent tools for your research or document text handling needs (a combined usage sketch follows the list below).
|
104 |
+
|
105 |
+
```
|
106 |
+
*[Tip]* : models can be changed in defaults or passed on during init along with `refresh_models=True`
|
107 |
+
```
|
108 |
+
|
109 |
+
- `abstractive_summary` - takes a long text document (`string`) and returns a 1-paragraph abstract or “abstractive” summary (`string`)
|
110 |
+
|
111 |
+
Input:
|
112 |
+
|
113 |
+
`longtext` : string
|
114 |
+
|
115 |
+
Returns:
|
116 |
+
|
117 |
+
`summary` : string
|
118 |
+
|
119 |
+
- `extractive_summary` - takes a long text document (`string`) and returns a 1-paragraph “extractive” summary of extracted highlights (`string`)
|
120 |
+
|
121 |
+
Input:
|
122 |
+
|
123 |
+
`longtext` : string
|
124 |
+
|
125 |
+
Returns:
|
126 |
+
|
127 |
+
`summary` : string
|
128 |
+
|
129 |
+
- `generate_title` - takes a long text document (`string`) and returns a generated title (`string`)
|
130 |
+
|
131 |
+
Input:
|
132 |
+
|
133 |
+
`longtext` : string
|
134 |
+
|
135 |
+
Returns:
|
136 |
+
|
137 |
+
`title` : string
|
138 |
+
|
139 |
+
- `extractive_highlights` - takes a long text document (`string`) and returns a list of extracted highlights (`[string]`), a list of keywords (`[string]`) and key phrases (`[string]`)
|
140 |
+
|
141 |
+
Input:
|
142 |
+
|
143 |
+
`longtext` : string
|
144 |
+
|
145 |
+
Returns:
|
146 |
+
|
147 |
+
`highlights` : [string]
|
148 |
+
`keywords` : [string]
|
149 |
+
`keyphrases` : [string]
|
150 |
+
|
151 |
+
- `extract_images_from_file` - takes a pdf file name (`string`) and returns a list of image filenames (`[string]`).
|
152 |
+
|
153 |
+
Input:
|
154 |
+
|
155 |
+
`pdf_file` : string
|
156 |
+
|
157 |
+
Returns:
|
158 |
+
|
159 |
+
`images_files` : [string]
|
160 |
+
|
161 |
+
- `extract_tables_from_file` - takes a pdf file name (`string`) and returns a list of csv filenames (`[string]`).
|
162 |
+
|
163 |
+
Input:
|
164 |
+
|
165 |
+
`pdf_file` : string
|
166 |
+
|
167 |
+
Returns:
|
168 |
+
|
169 |
+
`images_files` : [string]
|
170 |
+
|
171 |
+
- `cluster_lines` - takes a list of lines (`string`) and returns the topic-clustered sections (`dict(generated_title: [cluster_abstract])`) and clustered lines (`dict(cluster_id: [cluster_lines])`)
|
172 |
+
|
173 |
+
Input:
|
174 |
+
|
175 |
+
`lines` : [string]
|
176 |
+
|
177 |
+
Returns:
|
178 |
+
|
179 |
+
`sections` : dict(generated_title: [cluster_abstract])
|
180 |
+
`clusters` : dict(cluster_id: [cluster_lines])
|
181 |
+
|
182 |
+
- `extract_headings` - *[for scientific texts - Assumes an ‘abstract’ heading present]* takes a text file name (`string`) and returns a list of headings (`[string]`) and refined lines (`[string]`).
|
183 |
+
|
184 |
+
`[Tip 1]` : Use `extract_sections` as a wrapper (e.g. `extract_sections(extract_headings(“/path/to/textfile”)`) to get heading-wise sectioned text with refined lines instead (`dict( heading: text)`)
|
185 |
+
|
186 |
+
`[Tip 2]` : write the word ‘abstract’ at the start of the file text to get an extraction for non-scientific texts as well !!
|
187 |
+
|
188 |
+
Input:
|
189 |
+
|
190 |
+
`text_file` : string
|
191 |
+
|
192 |
+
Returns:
|
193 |
+
|
194 |
+
`refined` : [string],
|
195 |
+
`headings` : [string]
|
196 |
+
`sectioned_doc` : dict( heading: text) (Optional - Wrapper case)
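
A combined usage sketch for the tools above (import path as in Example run #2; file paths are placeholders, and the return orders follow the Input/Returns notes listed for each tool):

```
from survey import Surveyor

surveyor = Surveyor()                    # models are downloaded/loaded on first use
longtext = open('my_paper.txt').read()   # placeholder path

summary = surveyor.abstractive_summary(longtext)
extract = surveyor.extractive_summary(longtext)
title = surveyor.generate_title(longtext)
highlights, keywords, keyphrases = surveyor.extractive_highlights(longtext)

image_files = surveyor.extract_images_from_file('my_paper.pdf')
table_files = surveyor.extract_tables_from_file('my_paper.pdf')

refined, headings = surveyor.extract_headings('my_paper.txt')
sections, clusters = surveyor.cluster_lines(refined)
```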
|
197 |
+
|
198 |
+
|
199 |
+
## Access/Modify defaults:
|
200 |
+
|
201 |
+
- inside code
|
202 |
+
```
|
203 |
+
from survey.Surveyor import DEFAULTS
|
204 |
+
from pprint import pprint
|
205 |
+
|
206 |
+
pprint(DEFAULTS)
|
207 |
+
```
|
208 |
+
or,
|
209 |
+
|
210 |
+
- Modify static config file - `defaults.py`
|
211 |
+
|
212 |
+
or,
|
213 |
+
|
214 |
+
- At runtime (utility)
|
215 |
+
|
216 |
+
```
|
217 |
+
python survey.py --help
|
218 |
+
```
|
219 |
+
```
|
220 |
+
usage: survey.py [-h] [--max_search max_metadata_papers]
|
221 |
+
[--num_papers max_num_papers] [--pdf_dir pdf_dir]
|
222 |
+
[--txt_dir txt_dir] [--img_dir img_dir] [--tab_dir tab_dir]
|
223 |
+
[--dump_dir dump_dir] [--models_dir save_models_dir]
|
224 |
+
[--title_model_name title_model_name]
|
225 |
+
[--ex_summ_model_name extractive_summ_model_name]
|
226 |
+
[--ledmodel_name ledmodel_name]
|
227 |
+
[--embedder_name sentence_embedder_name]
|
228 |
+
[--nlp_name spacy_model_name]
|
229 |
+
[--similarity_nlp_name similarity_nlp_name]
|
230 |
+
[--kw_model_name kw_model_name]
|
231 |
+
[--refresh_models refresh_models] [--high_gpu high_gpu]
|
232 |
+
query_string
|
233 |
+
|
234 |
+
Generate a survey just from a query !!
|
235 |
+
|
236 |
+
positional arguments:
|
237 |
+
query_string your research query/keywords
|
238 |
+
|
239 |
+
optional arguments:
|
240 |
+
-h, --help show this help message and exit
|
241 |
+
--max_search max_metadata_papers
|
242 |
+
maximum number of papers to gaze at - defaults to 100
|
243 |
+
--num_papers max_num_papers
|
244 |
+
maximum number of papers to download and analyse -
|
245 |
+
defaults to 25
|
246 |
+
--pdf_dir pdf_dir pdf paper storage directory - defaults to
|
247 |
+
arxiv_data/tarpdfs/
|
248 |
+
--txt_dir txt_dir text-converted paper storage directory - defaults to
|
249 |
+
arxiv_data/fulltext/
|
250 |
+
--img_dir img_dir image storage directory - defaults to
|
251 |
+
arxiv_data/images/
|
252 |
+
--tab_dir tab_dir tables storage directory - defaults to
|
253 |
+
arxiv_data/tables/
|
254 |
+
--dump_dir dump_dir all_output_dir - defaults to arxiv_dumps/
|
255 |
+
--models_dir save_models_dir
|
256 |
+
directory to save models (> 5GB) - defaults to
|
257 |
+
saved_models/
|
258 |
+
--title_model_name title_model_name
|
259 |
+
title model name/tag in hugging-face, defaults to
|
260 |
+
'Callidior/bert2bert-base-arxiv-titlegen'
|
261 |
+
--ex_summ_model_name extractive_summ_model_name
|
262 |
+
extractive summary model name/tag in hugging-face,
|
263 |
+
defaults to 'allenai/scibert_scivocab_uncased'
|
264 |
+
--ledmodel_name ledmodel_name
|
265 |
+
led model(for abstractive summary) name/tag in
|
266 |
+
hugging-face, defaults to 'allenai/led-
|
267 |
+
large-16384-arxiv'
|
268 |
+
--embedder_name sentence_embedder_name
|
269 |
+
sentence embedder name/tag in hugging-face, defaults
|
270 |
+
to 'paraphrase-MiniLM-L6-v2'
|
271 |
+
--nlp_name spacy_model_name
|
272 |
+
spacy model name/tag in hugging-face (if changed -
|
273 |
+
needs to be spacy-installed prior), defaults to
|
274 |
+
'en_core_sci_scibert'
|
275 |
+
--similarity_nlp_name similarity_nlp_name
|
276 |
+
spacy downstream model(for similarity) name/tag in
|
277 |
+
hugging-face (if changed - needs to be spacy-installed
|
278 |
+
prior), defaults to 'en_core_sci_lg'
|
279 |
+
--kw_model_name kw_model_name
|
280 |
+
keyword extraction model name/tag in hugging-face,
|
281 |
+
defaults to 'distilbert-base-nli-mean-tokens'
|
282 |
+
--refresh_models refresh_models
|
283 |
+
Refresh model downloads with given names (needs
|
284 |
+
at least one model name param above), defaults to False
|
285 |
+
--high_gpu high_gpu High GPU usage permitted, defaults to False
|
286 |
+
|
287 |
+
```
|
288 |
+
|
289 |
+
- At runtime (code)
|
290 |
+
|
291 |
+
> during surveyor object initialization with `surveyor_obj = Surveyor()`
|
292 |
+
- `pdf_dir`: String, pdf paper storage directory - defaults to `arxiv_data/tarpdfs/`
|
293 |
+
- `txt_dir`: String, text-converted paper storage directory - defaults to `arxiv_data/fulltext/`
|
294 |
+
- `img_dir`: String, image storage directory - defaults to `arxiv_data/images/`
|
295 |
+
- `tab_dir`: String, tables storage directory - defaults to `arxiv_data/tables/`
|
296 |
+
- `dump_dir`: String, all_output_dir - defaults to `arxiv_dumps/`
|
297 |
+
- `models_dir`: String, directory to save to huge models, defaults to `saved_models/`
|
298 |
+
- `title_model_name`: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
|
299 |
+
- `ex_summ_model_name`: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
|
300 |
+
- `ledmodel_name`: String, led model(for abstractive summary) name/tag in hugging-face, defaults to `allenai/led-large-16384-arxiv`
|
301 |
+
- `embedder_name`: String, sentence embedder name/tag in hugging-face, defaults to `paraphrase-MiniLM-L6-v2`
|
302 |
+
- `nlp_name`: String, spacy model name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to `en_core_sci_scibert`
|
303 |
+
- `similarity_nlp_name`: String, spacy downstream trained model(for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to `en_core_sci_lg`
|
304 |
+
- `kw_model_name`: String, keyword extraction model name/tag in hugging-face, defaults to `distilbert-base-nli-mean-tokens`
|
305 |
+
- `high_gpu`: Bool, High GPU usage permitted, defaults to `False`
|
306 |
+
- `refresh_models`: Bool, Refresh model downloads with given names (needs at least one model name param above), defaults to `False`
|
307 |
+
|
308 |
+
> during survey generation with `surveyor_obj.survey(query="my_research_query")`
|
309 |
+
- `max_search`: int, maximum number of papers to gaze at - defaults to `100`
|
310 |
+
- `num_papers`: int, maximum number of papers to download and analyse - defaults to `25` (a combined usage sketch follows below)
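
A short sketch combining the init-time and generation-time options above (the query and directories are placeholders; import path as in Example run #2):

```
from survey import Surveyor

surveyor = Surveyor(
    pdf_dir='arxiv_data/tarpdfs/',
    txt_dir='arxiv_data/fulltext/',
    models_dir='saved_models/',
    high_gpu=True,              # permit heavier GPU usage
)

# gaze at up to 50 papers, download and analyse the best 10
surveyor.survey('multi-task representation learning',
                max_search=50, num_papers=10)
```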
|
311 |
+
|
312 |
+
|
313 |
+
|
src/Auto_Research.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,10 @@
LICENSE
README.md
pyproject.toml
setup.py
src/Auto_Research.egg-info/PKG-INFO
src/Auto_Research.egg-info/SOURCES.txt
src/Auto_Research.egg-info/dependency_links.txt
src/Auto_Research.egg-info/entry_points.txt
src/Auto_Research.egg-info/requires.txt
src/Auto_Research.egg-info/top_level.txt
src/Auto_Research.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1,2 @@
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz#egg=en_core_sci_scibert-0.5.0
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz#egg=en_core_sci_lg-0.5.0
src/Auto_Research.egg-info/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
[console_scripts]
cursive = src.Surveyor:main
src/Auto_Research.egg-info/requires.txt
ADDED
@@ -0,0 +1,24 @@
pip
boto3==1.9.118
requests==2.20.0
unicodedata2
pdfminer3k
sentence-transformers
pdftotext
arxiv
arxiv2bib
scholarly
PyMuPDF
Pillow
tabula-py
sentencepiece
keybert
scispacy
amrlib
transformers
en_core_sci_scibert
bert-extractive-summarizer
en_core_sci_lg

[spacy]
all
src/Auto_Research.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
src/Surveyor.py
ADDED
@@ -0,0 +1,1518 @@
1 |
+
from arxiv_public_data.fulltext import convert_directory_parallel
|
2 |
+
from arxiv_public_data import internal_citations
|
3 |
+
import torch
|
4 |
+
import os
|
5 |
+
from summarizer import Summarizer
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
import spacy
|
8 |
+
import numpy as np
|
9 |
+
from keybert import KeyBERT
|
10 |
+
import shutil, joblib
|
11 |
+
from distutils.dir_util import copy_tree
|
12 |
+
|
13 |
+
try:
|
14 |
+
from transformers import *
|
15 |
+
except:
|
16 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, AutoModel, LEDTokenizer, \
|
17 |
+
LEDForConditionalGeneration
|
18 |
+
|
19 |
+
from src.defaults import DEFAULTS
|
20 |
+
|
21 |
+
|
22 |
+
class Surveyor:
|
23 |
+
'''
|
24 |
+
A class to abstract all nlp and data mining helper functions as well as workflows
|
25 |
+
required to generate the survey from a single query, with absolute configurability
|
26 |
+
'''
|
27 |
+
|
28 |
+
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
pdf_dir=None,
|
32 |
+
txt_dir=None,
|
33 |
+
img_dir=None,
|
34 |
+
tab_dir=None,
|
35 |
+
dump_dir=None,
|
36 |
+
models_dir=None,
|
37 |
+
title_model_name=None,
|
38 |
+
ex_summ_model_name=None,
|
39 |
+
ledmodel_name=None,
|
40 |
+
embedder_name=None,
|
41 |
+
nlp_name=None,
|
42 |
+
similarity_nlp_name=None,
|
43 |
+
kw_model_name=None,
|
44 |
+
high_gpu=False,
|
45 |
+
refresh_models=False,
|
46 |
+
no_save_models=False
|
47 |
+
):
|
48 |
+
'''
|
49 |
+
Initializes models and directory structure for the surveyor
|
50 |
+
|
51 |
+
Optional Params:
|
52 |
+
- pdf_dir: String, pdf paper storage directory - defaults to arxiv_data/tarpdfs/
|
53 |
+
- txt_dir: String, text-converted paper storage directory - defaults to arxiv_data/fulltext/
|
54 |
+
- img_dir: String, image storage directory - defaults to arxiv_data/images/
|
55 |
+
- tab_dir: String, tables storage directory - defaults to arxiv_data/tables/
|
56 |
+
- dump_dir: String, all_output_dir - defaults to arxiv_dumps/
|
57 |
+
- models_dir: String, directory to save the huge models (> 5GB)
|
58 |
+
- title_model_name: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
|
59 |
+
- ex_summ_model_name: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
|
60 |
+
- ledmodel_name: String, led model(for abstractive summary) name/tag in hugging-face, defaults to `allenai/led-large-16384-arxiv`
|
61 |
+
- embedder_name: String, sentence embedder name/tag in hugging-face, defaults to `paraphrase-MiniLM-L6-v2`
|
62 |
+
- nlp_name: String, spacy model name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to `en_core_sci_scibert`
|
63 |
+
- similarity_nlp_name: String, spacy downstream trained model(for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to `en_core_sci_lg`
|
64 |
+
- kw_model_name: String, keyword extraction model name/tag in hugging-face, defaults to `distilbert-base-nli-mean-tokens`
|
65 |
+
- high_gpu: Bool, High GPU usage permitted, defaults to False
|
66 |
+
- refresh_models: Bool, Refresh model downloads with given names (needs at least one model name param above), defaults to False
|
67 |
+
- no_save_models: Bool, do not save models to disk (forces a fresh model download), defaults to False
|
68 |
+
|
69 |
+
- max_search: int, maximum number of papers to gaze at - defaults to 100
|
70 |
+
- num_papers: int, maximum number of papers to download and analyse - defaults to 25
|
71 |
+
|
72 |
+
'''
|
73 |
+
self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
74 |
+
print("\nTorch_device: " + self.torch_device)
|
75 |
+
if 'cuda' in self.torch_device:
|
76 |
+
print("\nloading spacy for gpu")
|
77 |
+
spacy.require_gpu()
|
78 |
+
|
79 |
+
if not kw_model_name:
|
80 |
+
kw_model_name = DEFAULTS["kw_model_name"]
|
81 |
+
if not high_gpu:
|
82 |
+
self.high_gpu = DEFAULTS["high_gpu"]
|
83 |
+
else:
|
84 |
+
self.high_gpu = high_gpu
|
85 |
+
self.num_papers = DEFAULTS['num_papers']
|
86 |
+
self.max_search = DEFAULTS['max_search']
|
87 |
+
if not models_dir:
|
88 |
+
models_dir = DEFAULTS['models_dir']
|
89 |
+
|
90 |
+
models_found = False
|
91 |
+
if os.path.exists(models_dir) and not no_save_models:
|
92 |
+
if len(os.listdir(models_dir)) > 6:
|
93 |
+
models_found = True
|
94 |
+
|
95 |
+
if not title_model_name:
|
96 |
+
title_model_name = DEFAULTS["title_model_name"]
|
97 |
+
if not ex_summ_model_name:
|
98 |
+
ex_summ_model_name = DEFAULTS["ex_summ_model_name"]
|
99 |
+
if not ledmodel_name:
|
100 |
+
ledmodel_name = DEFAULTS["ledmodel_name"]
|
101 |
+
if not embedder_name:
|
102 |
+
embedder_name = DEFAULTS["embedder_name"]
|
103 |
+
if not nlp_name:
|
104 |
+
nlp_name = DEFAULTS["nlp_name"]
|
105 |
+
if not similarity_nlp_name:
|
106 |
+
similarity_nlp_name = DEFAULTS["similarity_nlp_name"]
|
107 |
+
|
108 |
+
if refresh_models or not models_found:
|
109 |
+
print(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
|
110 |
+
if not no_save_models:
|
111 |
+
self.clean_dirs([models_dir])
|
112 |
+
|
113 |
+
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
114 |
+
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(title_model_name).to(self.torch_device)
|
115 |
+
self.title_model.eval()
|
116 |
+
if not no_save_models:
|
117 |
+
self.title_model.save_pretrained(models_dir + "/title_model")
|
118 |
+
#self.title_tokenizer.save_pretrained(models_dir + "/title_tokenizer")
|
119 |
+
|
120 |
+
# summary model
|
121 |
+
self.custom_config = AutoConfig.from_pretrained(ex_summ_model_name)
|
122 |
+
self.custom_config.output_hidden_states = True
|
123 |
+
self.summ_tokenizer = AutoTokenizer.from_pretrained(ex_summ_model_name)
|
124 |
+
self.summ_model = AutoModel.from_pretrained(ex_summ_model_name, config=self.custom_config).to(
|
125 |
+
self.torch_device)
|
126 |
+
self.summ_model.eval()
|
127 |
+
if not no_save_models:
|
128 |
+
self.summ_model.save_pretrained(models_dir + "/summ_model")
|
129 |
+
#self.summ_tokenizer.save_pretrained(models_dir + "/summ_tokenizer")
|
130 |
+
self.model = Summarizer(custom_model=self.summ_model, custom_tokenizer=self.summ_tokenizer)
|
131 |
+
|
132 |
+
self.ledtokenizer = LEDTokenizer.from_pretrained(ledmodel_name)
|
133 |
+
self.ledmodel = LEDForConditionalGeneration.from_pretrained(ledmodel_name).to(self.torch_device)
|
134 |
+
self.ledmodel.eval()
|
135 |
+
if not no_save_models:
|
136 |
+
self.ledmodel.save_pretrained(models_dir + "/ledmodel")
|
137 |
+
#self.ledtokenizer.save_pretrained(models_dir + "/ledtokenizer")
|
138 |
+
|
139 |
+
self.embedder = SentenceTransformer(embedder_name)
|
140 |
+
self.embedder.eval()
|
141 |
+
if not no_save_models:
|
142 |
+
self.embedder.save(models_dir + "/embedder")
|
143 |
+
else:
|
144 |
+
print("\nInitializing from previously saved models at" + models_dir)
|
145 |
+
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
146 |
+
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
147 |
+
self.title_model.eval()
|
148 |
+
|
149 |
+
# summary model
|
150 |
+
#self.summ_config = AutoConfig.from_pretrained(ex_summ_model_name)
|
151 |
+
#self.summ_config.output_hidden_states = True
|
152 |
+
self.summ_tokenizer = AutoTokenizer.from_pretrained(ex_summ_model_name)
|
153 |
+
self.summ_model = AutoModel.from_pretrained(models_dir + "/summ_model").to(
|
154 |
+
self.torch_device)
|
155 |
+
self.summ_model.eval()
|
156 |
+
self.model = Summarizer(custom_model=self.summ_model, custom_tokenizer=self.summ_tokenizer)
|
157 |
+
|
158 |
+
self.ledtokenizer = LEDTokenizer.from_pretrained(ledmodel_name)
|
159 |
+
self.ledmodel = LEDForConditionalGeneration.from_pretrained(models_dir + "/ledmodel").to(self.torch_device)
|
160 |
+
self.ledmodel.eval()
|
161 |
+
|
162 |
+
self.embedder = SentenceTransformer(models_dir + "/embedder")
|
163 |
+
self.embedder.eval()
|
164 |
+
|
165 |
+
self.nlp = spacy.load(nlp_name)
|
166 |
+
self.similarity_nlp = spacy.load(similarity_nlp_name)
|
167 |
+
self.kw_model = KeyBERT(kw_model_name)
|
168 |
+
|
169 |
+
self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
|
170 |
+
|
171 |
+
def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
|
172 |
+
|
173 |
+
if pdf_dir:
|
174 |
+
self.pdf_dir = pdf_dir
|
175 |
+
else:
|
176 |
+
self.pdf_dir = DEFAULTS["pdf_dir"]
|
177 |
+
|
178 |
+
if txt_dir:
|
179 |
+
self.txt_dir = txt_dir
|
180 |
+
else:
|
181 |
+
self.txt_dir = DEFAULTS["txt_dir"]
|
182 |
+
|
183 |
+
if img_dir:
|
184 |
+
self.img_dir = img_dir
|
185 |
+
else:
|
186 |
+
self.img_dir = DEFAULTS["img_dir"]
|
187 |
+
|
188 |
+
if tab_dir:
|
189 |
+
self.tab_dir = tab_dir
|
190 |
+
else:
|
191 |
+
self.tab_dir = DEFAULTS["tab_dir"]
|
192 |
+
|
193 |
+
if dump_dir:
|
194 |
+
self.dump_dir = dump_dir
|
195 |
+
else:
|
196 |
+
self.dump_dir = DEFAULTS["dump_dir"]
|
197 |
+
|
198 |
+
dirs = [self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir]
|
199 |
+
if sum([True for dir in dirs if 'arxiv_data/' in dir]):
|
200 |
+
base = os.path.dirname("arxiv_data/")
|
201 |
+
if not os.path.exists(base):
|
202 |
+
os.mkdir(base)
|
203 |
+
self.clean_dirs(dirs)
|
204 |
+
|
205 |
+
def clean_dirs(self, dirs):
|
206 |
+
import shutil
|
207 |
+
for d in dirs:
|
208 |
+
if os.path.exists(d):
|
209 |
+
shutil.rmtree(d)
|
210 |
+
os.mkdir(d)
|
211 |
+
|
212 |
+
def pdf_route(self, pdf_dir, txt_dir, img_dir, tab_dir, dump_dir, papers_meta):
|
213 |
+
## Data prep
|
214 |
+
|
215 |
+
import joblib
|
216 |
+
# test full again - check images - check dfs !!
|
217 |
+
|
218 |
+
self.clean_dirs([pdf_dir, txt_dir, img_dir, tab_dir, dump_dir])
|
219 |
+
|
220 |
+
papers = papers_meta[:self.num_papers]
|
221 |
+
selected_papers = papers
|
222 |
+
print("\nFirst stage paper collection...")
|
223 |
+
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
224 |
+
print("\nFirst stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
225 |
+
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
226 |
+
_ = self.get_freq_cited(cites)
|
227 |
+
'''
|
228 |
+
filtered_idlist = []
|
229 |
+
for c in self.get_freq_cited(cites):
|
230 |
+
if c in
|
231 |
+
_, new_searched_papers = self.search(filtered_idlist)
|
232 |
+
new_papers.extend(new_searched_papers)
|
233 |
+
'''
|
234 |
+
selected_papers.extend(new_papers)
|
235 |
+
print("\nSecond stage paper collection...")
|
236 |
+
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
237 |
+
print("\nSecond stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
238 |
+
papers.extend(new_papers)
|
239 |
+
|
240 |
+
joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
|
241 |
+
copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
|
242 |
+
copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
|
243 |
+
|
244 |
+
print("\nExtracting section-wise highlights.. ")
|
245 |
+
papers = self.extract_highlights(papers)
|
246 |
+
|
247 |
+
return papers, selected_papers
|
248 |
+
|
249 |
+
|
250 |
+
def get_freq_cited(self, cites_dict, k=5):
|
251 |
+
cites_list = []
|
252 |
+
for k, v in cites_dict.items():
|
253 |
+
cites_list.append(k)
|
254 |
+
[cites_list.append(val) for val in v]
|
255 |
+
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
256 |
+
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
257 |
+
print("\nThe most cited paper ids are:\n" + str(sorted_cites))
|
258 |
+
|
259 |
+
return sorted_cites.keys()
|
260 |
+
|
261 |
+
|
262 |
+
def fetch_papers(self, dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir, repeat=False):
|
263 |
+
import tempfile
|
264 |
+
|
265 |
+
if repeat:
|
266 |
+
with tempfile.TemporaryDirectory() as dirpath:
|
267 |
+
print("\n- downloading extra pdfs.. ")
|
268 |
+
# full text preparation of selected papers
|
269 |
+
self.download_pdfs(papers, dirpath)
|
270 |
+
dirpath_pdfs = os.listdir(dirpath)
|
271 |
+
for file_name in dirpath_pdfs:
|
272 |
+
full_file_name = os.path.join(dirpath, file_name)
|
273 |
+
if os.path.isfile(full_file_name):
|
274 |
+
shutil.copy(full_file_name, pdf_dir)
|
275 |
+
print("\n- converting extra pdfs.. ")
|
276 |
+
self.convert_pdfs(dirpath, txt_dir)
|
277 |
+
else:
|
278 |
+
print("\n- downloading pdfs.. ")
|
279 |
+
# full text preparation of selected papers
|
280 |
+
self.download_pdfs(papers, pdf_dir)
|
281 |
+
print("\n- converting pdfs.. ")
|
282 |
+
self.convert_pdfs(pdf_dir, txt_dir)
|
283 |
+
# plugging citations to our papers object
|
284 |
+
print("\n- plugging in citation network.. ")
|
285 |
+
papers, cites = self.cocitation_network(papers, txt_dir)
|
286 |
+
joblib.dump(papers, dump_dir + 'papers_selected_pdf_route.dmp')
|
287 |
+
from distutils.dir_util import copy_tree
|
288 |
+
copy_tree(txt_dir, dump_dir + os.path.basename(txt_dir))
|
289 |
+
copy_tree(pdf_dir, dump_dir + os.path.basename(pdf_dir))
|
290 |
+
print("\n- extracting structure.. ")
|
291 |
+
papers, ids_none = self.extract_structure(papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir)
|
292 |
+
return ids_none, papers, cites
|
293 |
+
|
294 |
+
def tar_route(self, pdf_dir, txt_dir, img_dir, tab_dir, papers):
|
295 |
+
## Data prep
|
296 |
+
|
297 |
+
import joblib
|
298 |
+
# test full again - check images - check dfs !!
|
299 |
+
|
300 |
+
self.clean_dirs([pdf_dir, txt_dir, img_dir, tab_dir])
|
301 |
+
|
302 |
+
# full text preparation of selected papers
|
303 |
+
self.download_sources(papers, pdf_dir)
|
304 |
+
self.convert_pdfs(pdf_dir, txt_dir)
|
305 |
+
|
306 |
+
# plugging citations to our papers object
|
307 |
+
papers, cites = self.cocitation_network(papers, txt_dir)
|
308 |
+
|
309 |
+
joblib.dump(papers, 'papers_selected_tar_route.dmp')
|
310 |
+
|
311 |
+
papers = self.extract_structure(papers, pdf_dir, txt_dir, img_dir, tab_dir)
|
312 |
+
|
313 |
+
joblib.dump(papers, 'papers_extracted_tar_route.dmp')
|
314 |
+
|
315 |
+
return papers
|
316 |
+
|
317 |
+
def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
|
318 |
+
|
319 |
+
import arxiv2bib
|
320 |
+
print("\nbuilding bibliography entries.. ")
|
321 |
+
bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
|
322 |
+
bibentries = [r.bibtex() for r in bibentries]
|
323 |
+
|
324 |
+
print("\nbuilding final survey file .. at "+ filename)
|
325 |
+
file = open(filename, 'w+')
|
326 |
+
if query is None:
|
327 |
+
query = 'Internal(existing) research'
|
328 |
+
file.write("----------------------------------------------------------------------")
|
329 |
+
file.write("Title: A survey on " + query)
|
330 |
+
print("")
|
331 |
+
print("----------------------------------------------------------------------")
|
332 |
+
print("Title: A survey on " + query)
|
333 |
+
file.write("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
|
334 |
+
print("Author: Auto-Research (github.com/sidphbot/Auto-Research)")
|
335 |
+
file.write("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
|
336 |
+
print("Dev: Auto-Research (github.com/sidphbot/Auto-Research)")
|
337 |
+
file.write("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
|
338 |
+
"\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
|
339 |
+
"\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
|
340 |
+
"\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
|
341 |
+
"\nentries(only to avoid LaTex overhead). ")
|
342 |
+
print("Disclaimer: This survey is intended to be a research starter. This Survey is Machine-Summarized, "+
|
343 |
+
"\nhence some sentences might be wrangled or grammatically incorrect. However all sentences are "+
|
344 |
+
"\nmined with proper citations. As All of the text is practically quoted texted, hence to "+
|
345 |
+
"\nimprove visibility, all the papers are duly cited in the Bibiliography section. as bibtex "+
|
346 |
+
"\nentries(only to avoid LaTex overhead). ")
|
347 |
+
file.write("----------------------------------------------------------------------")
|
348 |
+
print("----------------------------------------------------------------------")
|
349 |
+
file.write("")
|
350 |
+
print("")
|
351 |
+
file.write('ABSTRACT')
|
352 |
+
print('ABSTRACT')
|
353 |
+
print("=================================================")
|
354 |
+
file.write("=================================================")
|
355 |
+
file.write("")
|
356 |
+
print("")
|
357 |
+
file.write(research_sections['abstract'])
|
358 |
+
print(research_sections['abstract'])
|
359 |
+
file.write("")
|
360 |
+
print("")
|
361 |
+
file.write('INTRODUCTION')
|
362 |
+
print('INTRODUCTION')
|
363 |
+
print("=================================================")
|
364 |
+
file.write("=================================================")
|
365 |
+
file.write("")
|
366 |
+
print("")
|
367 |
+
file.write(research_sections['introduction'])
|
368 |
+
print(research_sections['introduction'])
|
369 |
+
file.write("")
|
370 |
+
print("")
|
371 |
+
for k, v in research_sections.items():
|
372 |
+
if k not in ['abstract', 'introduction', 'conclusion']:
|
373 |
+
file.write(k.upper())
|
374 |
+
print(k.upper())
|
375 |
+
print("=================================================")
|
376 |
+
file.write("=================================================")
|
377 |
+
file.write("")
|
378 |
+
print("")
|
379 |
+
file.write(v)
|
380 |
+
print(v)
|
381 |
+
file.write("")
|
382 |
+
print("")
|
383 |
+
file.write('CONCLUSION')
|
384 |
+
print('CONCLUSION')
|
385 |
+
print("=================================================")
|
386 |
+
file.write("=================================================")
|
387 |
+
file.write("")
|
388 |
+
print("")
|
389 |
+
file.write(research_sections['conclusion'])
|
390 |
+
print(research_sections['conclusion'])
|
391 |
+
file.write("")
|
392 |
+
print("")
|
393 |
+
|
394 |
+
file.write('REFERENCES')
|
395 |
+
print('REFERENCES')
|
396 |
+
print("=================================================")
|
397 |
+
file.write("=================================================")
|
398 |
+
file.write("")
|
399 |
+
print("")
|
400 |
+
for entry in bibentries:
|
401 |
+
file.write(entry)
|
402 |
+
print(entry)
|
403 |
+
file.write("")
|
404 |
+
print("")
|
405 |
+
print("========================XXX=========================")
|
406 |
+
file.write("========================XXX=========================")
|
407 |
+
file.close()
|
408 |
+
|
409 |
+
def build_basic_blocks(self, corpus_known_sections, corpus):
|
410 |
+
|
411 |
+
research_blocks = {}
|
412 |
+
for head, textarr in corpus_known_sections.items():
|
413 |
+
torch.cuda.empty_cache()
|
414 |
+
# print(head.upper())
|
415 |
+
with torch.no_grad():
|
416 |
+
summtext = self.model(" ".join([l.lower() for l in textarr]), ratio=0.5)
|
417 |
+
res = self.nlp(summtext)
|
418 |
+
res = set([str(sent) for sent in list(res.sents)])
|
419 |
+
summtext = ''.join([line for line in res])
|
420 |
+
# pprint(summtext)
|
421 |
+
research_blocks[head] = summtext
|
422 |
+
|
423 |
+
return research_blocks
|
424 |
+
|
425 |
+
def abstractive_summary(self, longtext):
|
426 |
+
'''
|
427 |
+
faulty method
|
428 |
+
input_ids = ledtokenizer(longtext, return_tensors="pt").input_ids
|
429 |
+
global_attention_mask = torch.zeros_like(input_ids)
|
430 |
+
# set global_attention_mask on first token
|
431 |
+
global_attention_mask[:, 0] = 1
|
432 |
+
|
433 |
+
sequences = ledmodel.generate(input_ids, global_attention_mask=global_attention_mask).sequences
|
434 |
+
summary = ledtokenizer.batch_decode(sequences)
|
435 |
+
'''
|
436 |
+
torch.cuda.empty_cache()
|
437 |
+
inputs = self.ledtokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
|
438 |
+
return_tensors='pt').to(self.torch_device)
|
439 |
+
with torch.no_grad():
|
440 |
+
summary_ids = self.ledmodel.generate(**inputs)
|
441 |
+
summary = self.ledtokenizer.batch_decode(summary_ids, skip_special_tokens=True,
|
442 |
+
clean_up_tokenization_spaces=True)
|
443 |
+
res = self.nlp(summary[0])
|
444 |
+
res = set([str(sent) for sent in list(res.sents)])
|
445 |
+
summtext = ''.join([line for line in res])
|
446 |
+
#print("abstractive summary type:" + str(type(summary)))
|
447 |
+
return summtext
|
448 |
+
|
449 |
+
def get_abstract(self, abs_lines, corpus_known_sections, research_blocks):
|
450 |
+
|
451 |
+
# abs_lines = " ".join(abs_lines)
|
452 |
+
abs_lines = ""
|
453 |
+
abs_lines += " ".join([l.lower() for l in corpus_known_sections['abstract']])
|
454 |
+
abs_lines += research_blocks['abstract']
|
455 |
+
# print(abs_lines)
|
456 |
+
|
457 |
+
try:
|
458 |
+
return self.abstractive_summary(abs_lines)
|
459 |
+
except:
|
460 |
+
highlights = self.extractive_summary(abs_lines)
|
461 |
+
return self.abstractive_summary(highlights)
|
462 |
+
|
463 |
+
def get_corpus_lines(self, corpus):
|
464 |
+
abs_lines = []
|
465 |
+
types = set()
|
466 |
+
for k, v in corpus.items():
|
467 |
+
# print(v)
|
468 |
+
types.add(type(v))
|
469 |
+
abstext = k + '. ' + v.replace('\n', ' ')
|
470 |
+
abstext = self.nlp(abstext)
|
471 |
+
abs_lines.extend([str(sent).lower() for sent in list(abstext.sents)])
|
472 |
+
#print("unique corpus value types:" + str(types))
|
473 |
+
# abs_lines = '\n'.join([str(sent) for sent in abs_lines.sents])
|
474 |
+
return abs_lines
|
475 |
+
|
476 |
+
def get_sectioned_docs(self, papers, papers_meta):
|
477 |
+
import random
|
478 |
+
docs = []
|
479 |
+
for p in papers:
|
480 |
+
for section in p['sections']:
|
481 |
+
if len(section['highlights']) > 0:
|
482 |
+
if self.high_gpu:
|
483 |
+
content = self.generate_title(section['highlights'])
|
484 |
+
else:
|
485 |
+
content = self.extractive_summary(''.join(section['highlights']))
|
486 |
+
docs.append(content)
|
487 |
+
selected_pids = [p['id'] for p in papers]
|
488 |
+
meta_abs = []
|
489 |
+
for p in papers_meta:
|
490 |
+
if p['id'] not in selected_pids:
|
491 |
+
meta_abs.append(self.generate_title(p['abstract']))
|
492 |
+
docs.extend(meta_abs)
|
493 |
+
#print("meta_abs num"+str(len(meta_abs)))
|
494 |
+
#print("selected_pids num"+str(len(selected_pids)))
|
495 |
+
#print("papers_meta num"+str(len(papers_meta)))
|
496 |
+
#assert (len(meta_abs) + len(selected_pids) == len(papers_meta))
|
497 |
+
assert ('str' in str(type(random.sample(docs, 1)[0])))
|
498 |
+
return [doc for doc in docs if doc != '']
|
499 |
+
|
500 |
+
|
501 |
+
def cluster_lines(self, abs_lines):
|
502 |
+
from sklearn.cluster import KMeans
|
503 |
+
# from bertopic import BERTopic
|
504 |
+
# topic_model = BERTopic(embedding_model=embedder)
|
505 |
+
torch.cuda.empty_cache()
|
506 |
+
corpus_embeddings = self.embedder.encode(abs_lines)
|
507 |
+
# Normalize the embeddings to unit length
|
508 |
+
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
|
509 |
+
with torch.no_grad():
|
510 |
+
optimal_k = self.model.calculate_optimal_k(' '.join(abs_lines), k_max=10)
|
511 |
+
# Perform kmean clustering
|
512 |
+
|
513 |
+
clustering_model = KMeans(n_clusters=optimal_k, n_init=20, n_jobs=-1)
|
514 |
+
# clustering_model = AgglomerativeClustering(n_clusters=optimal_k, affinity='cosine', linkage='average') #, affinity='cosine', linkage='average', distance_threshold=0.4)
|
515 |
+
clustering_model.fit(corpus_embeddings)
|
516 |
+
cluster_assignment = clustering_model.labels_
|
517 |
+
|
518 |
+
clustered_sentences = {}
|
519 |
+
dummy_count = 0
|
520 |
+
for sentence_id, cluster_id in enumerate(cluster_assignment):
|
521 |
+
if cluster_id not in clustered_sentences:
|
522 |
+
clustered_sentences[cluster_id] = []
|
523 |
+
'''
|
524 |
+
if dummy_count < 5:
|
525 |
+
print("abs_line: "+abs_lines[sentence_id])
|
526 |
+
print("cluster_ID: "+str(cluster_id))
|
527 |
+
print("embedding: "+str(corpus_embeddings[sentence_id]))
|
528 |
+
dummy_count += 1
|
529 |
+
'''
|
530 |
+
clustered_sentences[cluster_id].append(abs_lines[sentence_id])
|
531 |
+
|
532 |
+
# for i, cluster in clustered_sentences.items():
|
533 |
+
# print("Cluster ", i+1)
|
534 |
+
# print(cluster)
|
535 |
+
# print("")
|
536 |
+
|
537 |
+
return self.get_clustered_sections(clustered_sentences), clustered_sentences
|
538 |
+
|
539 |
+
|
540 |
+
    def get_clusters(self, papers, papers_meta):
        from sklearn.cluster import KMeans
        # from bertopic import BERTopic
        # topic_model = BERTopic(embedding_model=embedder)
        torch.cuda.empty_cache()
        abs_lines = self.get_sectioned_docs(papers, papers_meta)
        corpus_embeddings = self.embedder.encode(abs_lines)
        # Normalize the embeddings to unit length
        corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
        with torch.no_grad():
            optimal_k = self.model.calculate_optimal_k(' '.join(abs_lines), k_max=10)
        # Perform kmean clustering

        clustering_model = KMeans(n_clusters=optimal_k, n_init=20, n_jobs=-1)
        # clustering_model = AgglomerativeClustering(n_clusters=optimal_k, affinity='cosine', linkage='average')  #, affinity='cosine', linkage='average', distance_threshold=0.4)
        clustering_model.fit(corpus_embeddings)
        cluster_assignment = clustering_model.labels_

        clustered_sentences = {}
        dummy_count = 0
        for sentence_id, cluster_id in enumerate(cluster_assignment):
            if cluster_id not in clustered_sentences:
                clustered_sentences[cluster_id] = []
            '''
            if dummy_count < 5:
                print("abs_line: "+abs_lines[sentence_id])
                print("cluster_ID: "+str(cluster_id))
                print("embedding: "+str(corpus_embeddings[sentence_id]))
                dummy_count += 1
            '''
            clustered_sentences[cluster_id].append(abs_lines[sentence_id])

        # for i, cluster in clustered_sentences.items():
        #     print("Cluster ", i+1)
        #     print(cluster)
        #     print("")

        return self.get_clustered_sections(clustered_sentences), clustered_sentences

    def generate_title(self, longtext):
        torch.cuda.empty_cache()

        inputs = self.title_tokenizer.prepare_seq2seq_batch(longtext, truncation=True, padding='longest',
                                                            return_tensors='pt').to(self.torch_device)
        with torch.no_grad():
            summary_ids = self.title_model.generate(**inputs)
            summary = self.title_tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
                                                        clean_up_tokenization_spaces=True)

        return str(summary[0])

    def get_clustered_sections(self, clustered_lines):
        clusters_dict = {}
        for i, cluster in clustered_lines.items():
            # print(cluster)
            try:
                clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary(
                    str(" ".join(cluster)).lower())
            except:
                clusters_dict[self.generate_title(str(" ".join(cluster)))] = self.abstractive_summary(
                    self.extractive_summary(str(" ".join(cluster)).lower()))

        return clusters_dict

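# --- Aside (not part of Surveyor.py): generate_title() above uses the older prepare_seq2seq_batch()
# helper; a hedged sketch of the same call through the plain tokenizer __call__ API, assuming the
# default 'Callidior/bert2bert-base-arxiv-titlegen' checkpoint loads via AutoModelForSeq2SeqLM.
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def generate_title_sketch(longtext, model_name="Callidior/bert2bert-base-arxiv-titlegen"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(longtext, truncation=True, padding='longest', return_tensors='pt')
    with torch.no_grad():
        summary_ids = model.generate(**inputs)
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
                                  clean_up_tokenization_spaces=True)[0]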
    def get_intro(self, corpus_known_sections, research_blocks):
        intro_lines = ""
        intro_lines += str(" ".join([l.lower() for l in corpus_known_sections['introduction']])) + str(
            " ".join([l.lower() for l in corpus_known_sections['conclusion']]))
        intro_lines += research_blocks['introduction'] + research_blocks['conclusion']
        try:
            return self.abstractive_summary(intro_lines)
        except:
            return self.abstractive_summary(self.extractive_summary(intro_lines))

    def get_conclusion(self, research_sections):
        paper_body = ""
        for k, v in research_sections.items():
            paper_body += v
        return self.abstractive_summary(paper_body)

    def build_corpus_sectionwise(self, papers):
        known = ['abstract', 'introduction', 'conclusion']
        corpus_known_sections = {}
        for kh in known:
            khtext = []
            for p in papers:
                for section in p['sections']:
                    if kh in section['heading']:
                        khtext.extend(section['highlights'])
            # print(khtext)
            corpus_known_sections[kh] = khtext
        return corpus_known_sections

    def standardize_headings(self, papers):
        known = ['abstract', 'introduction', 'discussion', 'relatedwork', 'contribution', 'analysis', 'experiments',
                 'conclusion']
        for p in papers:
            # print("================================")
            headings = [section['heading'] for section in p['sections'] if len(section['heading'].split()) < 3]
            # print("id: "+ str(p['id'])+"\nHeadings: \n"+str('\n'.join(headings)))
            for kh in known:
                for section in p['sections']:
                    if len(section['heading'].split()) < 3:
                        # print(section['heading'])
                        if kh in ''.join(filter(str.isalpha, section['heading'].replace(' ', '').lower())):
                            # print("orig head: "+ section['heading'] +", plain head:" + kh)
                            section['heading'] = kh
        return papers

    def build_corpus(self, papers, papers_meta):
        corpus = self.build_meta_corpus(papers_meta)
        for p in papers:
            ph = []
            for sid, section in enumerate(p['sections']):
                ph.extend(section['highlights'])
            for pid, ls in corpus.items():
                if pid == p['id']:
                    corpus[pid] = p['abstract'] + str(' '.join(ph))
        '''
        print("================== final corpus ====================")
        print('\n'.join([str("paper: "+ get_by_pid(pid, papers_meta)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in corpus.items()]))
        print("======== sample point ========")
        p = random.choice(list(papers))
        print("paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
        print("======== sample meta point ========")
        p = random.choice(list(papers_meta))
        print("meta paper: "+ p['title']+" \nhighlights: " + str(corpus[p['id']]))
        '''
        return corpus

    def get_by_pid(self, pid, papers):
        for p in papers:
            if p['id'] == pid:
                return p

    def build_meta_corpus(self, papers):
        meta_corpus = {}
        for p in papers:
            # pprint(p)
            pid = p['id']
            ptext = p['title'] + ". " + p['abstract']
            doc = self.nlp(ptext)
            phs, _, _ = self.extractive_highlights([str(sent) for sent in list(doc.sents)])
            meta_corpus[pid] = str(' '.join(phs))
        '''
        print("================== meta corpus ====================")
        print('\n'.join([str("paper: "+ get_by_pid(pid, papers)['title']+" \nhighlight count: " + str(len(phs))) for pid, phs in meta_corpus.items()]))
        print("======== sample point ========")
        p = random.choice(list(papers))
        print("paper: "+ p['title']+" \nhighlights: " + str(meta_corpus[p['id']]))
        '''
        return meta_corpus

    def select_papers(self, papers, query, num_papers=20):
        import numpy as np
        # print("paper sample: ")
        # print(papers)
        meta_corpus = self.build_meta_corpus(papers)
        scores = []
        pids = []
        for id, highlights in meta_corpus.items():
            score = self.text_para_similarity(query, highlights)
            scores.append(score)
            pids.append(id)
            print("corpus item: " + str(self.get_by_pid(id, papers)['title']))

        idx = np.argsort(scores)[::-1][:num_papers]  # keep the highest-scoring (most similar) papers
        # for i in range(len(scores)):
        #     print("paper: " + str(self.get_by_pid(pids[i], papers)['title']))
        #     print("score: " + str(scores[i]))
        # print("argsort ids("+str(num_papers)+" papers): "+ str(idx))
        idx = [pids[i] for i in idx]
        # print("argsort pids("+str(num_papers)+" papers): "+ str(idx))
        papers_selected = [p for p in papers if p['id'] in idx]
        # assert(len(papers_selected)==num_papers)
        print("num papers selected: " + str(len(papers_selected)))
        for p in papers_selected:
            print("Selected Paper: " + p['title'])

        print("contrast with natural selection: forward")
        for p in papers[:4]:
            print("Selected Paper: " + p['title'])
        print("contrast with natural selection: backward")
        for p in papers[-4:]:
            print("Selected Paper: " + p['title'])
        # arxiv search producing better relevance
        return papers_selected

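# --- Aside (not part of Surveyor.py): how select_papers() above would typically be called,
# assuming `surveyor` is an initialised Surveyor instance and `searched_papers` comes from search();
# this thin wrapper only renames the arguments for clarity and is illustrative.
def pick_relevant_papers(surveyor, searched_papers, query, k=10):
    # ranks metadata highlights against the query inside select_papers and keeps the top-k
    return surveyor.select_papers(searched_papers, query, num_papers=k)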
    def extractive_summary(self, text):
        torch.cuda.empty_cache()
        with torch.no_grad():
            res = self.model(text, ratio=0.5)
        res_doc = self.nlp(res)
        return " ".join(set([str(sent) for sent in list(res_doc.sents)]))

    def extractive_highlights(self, lines):
        # text = " ".join(lines)
        # text_doc = nlp(" ".join([l.lower() for l in lines]))
        # text = ' '.join([ str(sent) for sent in list(text_doc.sents)])
        torch.cuda.empty_cache()
        with torch.no_grad():
            res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
        res_doc = self.nlp(res)
        res_lines = set([str(sent) for sent in list(res_doc.sents)])
        # print("\n".join(res_sents))
        with torch.no_grad():
            keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
            keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
                                                        keyphrase_ngram_range=(4, 4),
                                                        stop_words='english', use_mmr=True, diversity=0.7)
        return res_lines, keywords, keyphrases

    def extract_highlights(self, papers):
        for p in papers:
            sid = 0
            p['sections'] = []
            for heading, lines in p['body_text'].items():
                hs, kws, kps = self.extractive_highlights(lines)
                p['sections'].append({
                    'sid': sid,
                    'heading': heading,
                    'text': lines,
                    'highlights': hs,
                    'keywords': kws,
                    'keyphrases': kps,
                })
                sid += 1
        return papers

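# --- Aside (not part of Surveyor.py): the kw_model calls above follow the KeyBERT API
# (extract_keywords with keyphrase_ngram_range / use_mmr / diversity); a minimal sketch,
# assuming kw_model is a KeyBERT instance built on the project's default keyword model.
from keybert import KeyBERT

def keywords_sketch(text):
    kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
    keywords = kw_model.extract_keywords(text, stop_words='english')
    keyphrases = kw_model.extract_keywords(text, keyphrase_ngram_range=(4, 4),
                                           stop_words='english', use_mmr=True, diversity=0.7)
    return keywords, keyphrases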
    def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
        print("\nextracting sections.. ")
        papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)

        print("\nextracting images.. for future correlation use-cases ")
        papers = self.extract_images(papers, pdf_dir, img_dir)

        if tables:
            print("\nextracting tables.. for future correlation use-cases ")
            papers = self.extract_tables(papers, pdf_dir, tab_dir)

        return papers, ids_none

    def extract_parts(self, papers, txt_dir, dump_dir):

        headings_all = {}
        # refined = []
        # model = build_summarizer()
        # for file in glob.glob(txt_dir + '/*.txt'):
        for p in papers:
            file = txt_dir + '/' + p['id'] + '.txt'
            refined, headings_extracted = self.extract_headings(file)
            sections = self.extract_sections(headings_extracted, refined)
            # highlights = {k: extract_highlights(model,v) for k, v in sections.items()}
            # p = self.get_by_file(file, papers)
            # if len(headings_extracted) > 3:
            p['body_text'] = sections
            # p['body_highlights'] = highlights
            headings_all[p['id']] = headings_extracted

        ids_none = {i: h for i, h in headings_all.items() if len(h) < 3}

        '''
        for f, h in headings_all.items():
            if len(h) < 4:
                print("=================headings almost undetected================")
                print(f)
                print(h)
        '''
        # from pprint import pprint
        # pprint({f: len(h) for f,h in headings_all.items()})
        papers_none = [p for p in papers if p['id'] in ids_none]
        for p in papers_none:
            os.remove(txt_dir + '/' + p['id'] + '.txt')
            papers.remove(p)

        return papers, ids_none

    def check_para(self, df):
        size = 0
        for col in df.columns:
            size += df[col].apply(lambda x: len(str(x))).median()
        return size / len(df.columns) > 25

    def scan_blocks(self, lines):
        lines_mod = [line.strip().replace('\n', '') for line in lines if len(line.strip().replace('\n', '')) > 3]
        for i in range(len(lines_mod)):
            yield lines_mod[i:i + 3]

    def extract_sections(self, headings, lines, min_part_length=2):
        sections = {}
        self.check_list_elems_in_list(headings, lines)
        head_len = len(headings)
        for i in range(len(headings) - 1):
            start = headings[i]
            end = headings[i + 1]
            section = self.get_section(start, end, lines)
            # print(start + " : "+ str(len(section)) +" lines")
            '''
            if i > 0:
                old = headings[i-1]
                if len(section) < min_part_length + 1:
                    sections[old].extend(start)
                    sections[old].extend(section)
                else:
                    sections[start] = section
            else:
                sections[start] = section
            '''
            sections[start] = section
        return {k: v for k, v in sections.items()}

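# --- Aside (not part of Surveyor.py): a tiny worked example of the heading/section split that
# extract_sections() above performs - each detected heading keys the lines strictly between it
# and the next heading, and the final heading (e.g. references) only closes the previous section.
lines_example = ['abstract', 'this paper studies x.', 'introduction', 'x matters because y.', 'references']
headings_example = ['abstract', 'introduction', 'references']
# expected mapping:
# {'abstract': ['this paper studies x.'], 'introduction': ['x matters because y.']}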
    def is_rubbish(self, s, rubbish_tolerance=0.2, min_char_len=4):
        # numbers = sum(c.isdigit() for c in s)
        letters = sum(c.isalpha() for c in s)
        spaces = sum(c.isspace() for c in s)
        # others = len(s) - numbers - letters - spaces
        if len(s) == 0:
            return False
        if ((len(s) - (letters + spaces)) / len(s) >= rubbish_tolerance) or self.alpha_length(s) < min_char_len:
            return True
        else:
            return False

    def get_section(self, first, last, lines):
        start, end = None, None
        try:
            assert (first in lines)
            assert (last in lines)
            # start = lines.index( first ) + len( first )
            # end = lines.index( last, start )
            start = [i for i in range(len(lines)) if first is lines[i]][0]
            end = [i for i in range(len(lines)) if last is lines[i]][0]
            section_lines = lines[start + 1:end]
            # print("heading: " + str(first))
            # print("section_lines: "+ str(section_lines))
            # print(section_lines)
            return section_lines
        except (AssertionError, IndexError, ValueError):
            print("section split error :")
            print("first heading :" + str(first) + ", second heading :" + str(last))
            print("first index :" + str(start) + ", second index :" + str(end))
            return ""

    def check_list_elems_in_list(self, headings, lines):
        import numpy as np
        # [print(head) for head in headings if head not in lines ]
        return np.all([True if head in lines else False for head in headings])

    def check_first_char_upper(self, text):
        for c in text:
            if c.isspace():
                continue
            elif c.isalpha():
                return c.isupper()

    def extract_headings(self, txt_file):
        import re

        fulltext = self.read_paper(txt_file)
        lines = self.clean_lines(fulltext)

        refined, headings = self.scan_text(lines)
        assert (self.check_list_elems_in_list(headings, refined))
        headings = self.check_duplicates(headings)

        # print('===========================================')
        # print(txt_file +": first scan: \n"+str(len(headings))+" headings")
        # print('\n'.join(headings))

        # scan_failed - rescan with first match for abstract hook
        if len(headings) == 0:
            # print('===================')
            # print("run 1 failed")
            abs_cans = [line for line in lines if 'abstract' in re.sub("\s+", "", line.strip().lower())]
            if len(abs_cans) != 0:
                abs_head = abs_cans[0]
                refined, headings = self.scan_text(lines, abs_head=abs_head)
                self.check_list_elems_in_list(headings, refined)
                headings = self.check_duplicates(headings)
                # print('===================')
                # print(txt_file +": second scan: \n"+str(len(headings))+" headings")

        # if len(headings) == 0:
        #     print("heading scan failed completely")

        return refined, headings

    def check_duplicates(self, my_list):
        # de-duplicate while preserving order (also returns the list unchanged when no duplicates exist)
        my_finallist = []
        for n in my_list:
            if n not in my_finallist:
                my_finallist.append(n)

        # print("original: "+str(len(my_list))+" new: "+str(len(my_finallist)))
        return my_finallist

    def clean_lines(self, text):
        import numpy as np
        import re
        # doc = nlp(text)
        # lines = [str(sent) for sent in doc.sents]
        lines = text.replace('\r', '').split('\n')
        lines = [line for line in lines if not self.is_rubbish(line)]
        lines = [line for line in lines if
                 re.match("^[a-zA-Z1-9\.\[\]\(\):\-,\"\"\s]*$", line) and not 'Figure' in line and not 'Table' in line]

        lengths_cleaned = [self.alpha_length(line) for line in lines]
        mean_length_cleaned = np.median(lengths_cleaned)
        lines_standardized = []
        for line in lines:
            if len(line) >= (1.8 * mean_length_cleaned):
                first_half = line[0:len(line) // 2]
                second_half = line[len(line) // 2 if len(line) % 2 == 0 else ((len(line) // 2) + 1):]
                lines_standardized.append(first_half)
                lines_standardized.append(second_half)
            else:
                lines_standardized.append(line)

        return lines

    def scan_text(self, lines, abs_head=None):
        import re
        # print('\n'.join(lines))
        record = False
        headings = []
        refined = []
        for i in range(1, len(lines) - 4):
            line = lines[i]
            line = line.replace('\n', '').strip()
            if 'abstract' in re.sub("\s+", "", line.strip().lower()) and len(line) - len('abstract') < 5 or (
                    abs_head is not None and abs_head in line):
                record = True
                headings.append(line)
                refined.append(line)
            if 'references' in re.sub("\s+", "", line.strip().lower()) and len(line) - len('references') < 5:
                headings.append(line)
                refined.append(line)
                break
            elif 'bibliography' in re.sub("\s+", "", line.strip().lower()) and len(line) - len('bibliography') < 5:
                headings.append(line)
                refined.append(line)
                break
            refined, headings = self.scanline(record, headings, refined, i, lines)
            # print('=========in scan_text loop i : '+str(i)+' heading count : '+str(len(headings))+' =========')
        return refined, headings

    def scanline(self, record, headings, refined, id, lines):
        import numpy as np
        import re
        line = lines[id]

        if not len(line) == 0:
            # print("in scanline")
            # print(line)
            if record:
                refined.append(line)
                if len(lines[id - 1]) == 0 or len(lines[id + 1]) == 0 or re.match(
                        "^[1-9XVIABCD]{0,4}(\.{0,1}[1-9XVIABCD]{0,4}){0,3}\s{0,2}[A-Z][a-zA-Z\:\-\s]*$",
                        line) and self.char_length(line) > 7:
                    # print("candidate")
                    # print(line)
                    if np.mean([len(s) for s in lines[id + 2:id + 6]]) > 40 and self.check_first_char_upper(
                            line) and re.match("^[a-zA-Z1-9\.\:\-\s]*$", line) and len(line.split()) < 10:
                        # if len(line) < 20 and np.mean([len(s) for s in lines[i+1:i+5]]) > 30 :
                        headings.append(line)
                        assert (line in refined)
                        # print("selected")
                        # print(line)
            else:
                known_headings = ['introduction', 'conclusion', 'abstract', 'references', 'bibliography']
                missing = [h for h in known_headings if not np.any([True for head in headings if h in head])]
                # for h in missing:
                head = [line for h in missing if h in re.sub("\s+", "", line.strip().lower())]
                # head = [line for known]
                if len(head) > 0:
                    headings.append(head[0])
                    assert (head[0] in refined)

        return refined, headings

    def char_length(self, s):
        # numbers = sum(c.isdigit() for c in s)
        letters = sum(c.isalpha() for c in s)
        # spaces = sum(c.isspace() for c in s)
        # others = len(s) - numbers - letters - spaces
        return letters

    def get_by_file(self, file, papers):
        import os
        pid = os.path.basename(file)
        pid = pid.replace('.txt', '').replace('.pdf', '')
        for p in papers:
            if p['id'] == pid:
                return p
        print("\npaper not found by file, \nfile: " + file + "\nall papers: " + ', '.join([p['id'] for p in papers]))

    def alpha_length(self, s):
        # numbers = sum(c.isdigit() for c in s)
        letters = sum(c.isalpha() for c in s)
        spaces = sum(c.isspace() for c in s)
        # others = len(s) - numbers - letters - spaces
        return letters + spaces

    def check_append(self, baselist, addstr):
        check = False
        for e in baselist:
            if addstr in e:
                check = True
        if not check:
            baselist.append(addstr)
        return baselist

    def extract_images(self, papers, pdf_dir, img_dir):
        import fitz
        # print("in images")
        for p in papers:
            file = pdf_dir + p['id'] + ".pdf"
            pdf_file = fitz.open(file)
            images = []
            for page_index in range(len(pdf_file)):
                page = pdf_file[page_index]
                images.extend(page.getImageList())
            images_files = [self.save_image(pdf_file.extractImage(img[0]), i, p['id'], img_dir) for i, img in
                            enumerate(set(images)) if img[0]]
            # print(len(images_per_paper))
            p['images'] = images_files
            # print(len(p.keys()))
        # print(papers[0].keys())
        return papers

    def extract_images_from_file(self, pdf_file_name, img_dir):
        import fitz
        pdf_file = fitz.open(pdf_file_name)
        images = []
        for page_index in range(len(pdf_file)):
            page = pdf_file[page_index]
            images.extend(page.getImageList())
        images_files = [self.save_image(pdf_file.extractImage(img[0]), i, pdf_file_name.replace('.pdf', ''), img_dir) for i, img in
                        enumerate(set(images)) if img[0]]
        return images_files

    def save_image(self, base_image, img_index, pid, img_dir):
        from PIL import Image
        import io
        image_bytes = base_image["image"]
        # get the image extension
        image_ext = base_image["ext"]
        # load it to PIL
        image = Image.open(io.BytesIO(image_bytes))
        # save it to local disk
        fname = img_dir + "/" + str(pid) + "_" + str(img_index + 1) + "." + image_ext
        image.save(open(f"{fname}", "wb"))
        # print(fname)
        return fname

    def save_tables(self, dfs, pid, tab_dir):
        # todo
        dfs = [df for df in dfs if not self.check_para(df)]
        files = []
        for df in dfs:
            filename = tab_dir + "/" + str(pid) + ".csv"
            files.append(filename)
            df.to_csv(filename, index=False)
        return files

    def extract_tables(self, papers, pdf_dir, tab_dir):
        import tabula
        check = True
        # for file in glob.glob(pdf_dir+'/*.pdf'):
        for p in papers:
            dfs = tabula.read_pdf(pdf_dir + p['id'] + ".pdf", pages='all', multiple_tables=True, silent=True)
            p['tables'] = self.save_tables(dfs, p['id'], tab_dir)
        # print(papers[0].keys())
        return papers

    def extract_tables_from_file(self, pdf_file_name, tab_dir):
        import tabula
        check = True
        # for file in glob.glob(pdf_dir+'/*.pdf'):
        dfs = tabula.read_pdf(pdf_file_name, pages='all', multiple_tables=True, silent=True)

        return self.save_tables(dfs, pdf_file_name.replace('.pdf', ''), tab_dir)

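# --- Aside (not part of Surveyor.py): extract_images() above uses the older camelCase PyMuPDF
# names (getImageList / extractImage); a hedged sketch of the same loop with the snake_case
# names that newer PyMuPDF releases expose (get_images / extract_image).
import fitz  # PyMuPDF

def list_images_sketch(pdf_path):
    doc = fitz.open(pdf_path)
    found = []
    for page in doc:
        for img in page.get_images():
            xref = img[0]
            if xref:
                base_image = doc.extract_image(xref)  # dict holding raw 'image' bytes and 'ext'
                found.append((xref, base_image["ext"]))
    return found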
    def search(self, query_text=None, id_list=None, max_search=100):
        import arxiv
        from urllib.parse import urlparse

        if query_text:
            search = arxiv.Search(
                query=query_text,
                max_results=max_search,
                sort_by=arxiv.SortCriterion.Relevance
            )
        else:
            id_list = [id for id in id_list if '.' in id]
            search = arxiv.Search(
                id_list=id_list
            )

        results = [result for result in search.get()]

        searched_papers = []
        discarded_ids = []
        for result in results:
            id = urlparse(result.entry_id).path.split('/')[-1].split('v')[0]
            if '.' in id:
                paper = {
                    'id': id,
                    'title': result.title,
                    'comments': result.comment if result.comment else "None",
                    'journal-ref': result.journal_ref if result.journal_ref else "None",
                    'doi': str(result.doi),
                    'primary_category': result.primary_category,
                    'categories': result.categories,
                    'license': None,
                    'abstract': result.summary,
                    'published': result.published,
                    'pdf_url': result.pdf_url,
                    'links': [str(l) for l in result.links],
                    'update_date': result.updated,
                    'authors': [str(a.name) for a in result.authors],
                }
                searched_papers.append(paper)
            else:
                discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])

        print("\nPapers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))

        return results, searched_papers

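# --- Aside (not part of Surveyor.py): the shape of the metadata dicts that search() above builds,
# shown on a standalone query; the query string is only an example, and newer releases of the
# arxiv package iterate results via Search.results() rather than the get() call used above.
import arxiv

def search_sketch(query="quantum entanglement", max_results=5):
    search = arxiv.Search(query=query, max_results=max_results,
                          sort_by=arxiv.SortCriterion.Relevance)
    return [{'id': r.entry_id, 'title': r.title, 'abstract': r.summary} for r in search.results()]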
    def download_pdfs(self, papers, pdf_dir):
        import arxiv
        from urllib.parse import urlparse
        ids = [p['id'] for p in papers]
        print("\ndownloading below selected papers: ")
        print(ids)
        # assert(False)
        papers_filtered = arxiv.Search(id_list=ids).get()
        for p in papers_filtered:
            p_id = str(urlparse(p.entry_id).path.split('/')[-1]).split('v')[0]
            download_file = pdf_dir + "/" + p_id + ".pdf"
            p.download_pdf(filename=download_file)

    def download_sources(self, papers, src_dir):
        import arxiv
        from urllib.parse import urlparse
        ids = [p['id'] for p in papers]
        print(ids)
        # assert(False)
        papers_filtered = arxiv.Search(id_list=ids).get()
        for p in papers_filtered:
            p_id = str(urlparse(p.entry_id).path.split('/')[-1]).split('v')[0]
            download_file = src_dir + "/" + p_id + ".tar.gz"
            p.download_source(filename=download_file)

    def convert_pdfs(self, pdf_dir, txt_dir):
        import glob, shutil

        import multiprocessing
        # import arxiv_public_data

        convert_directory_parallel(pdf_dir, multiprocessing.cpu_count())
        for file in glob.glob(pdf_dir + '/*.txt'):
            shutil.move(file, txt_dir)

    def read_paper(self, path):
        f = open(path, 'r', encoding="utf-8")
        text = str(f.read())
        f.close()
        return text

    def cocitation_network(self, papers, txt_dir):
        import multiprocessing

        cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
        print("\ncitation-network: ")
        print(cites)

        for p in papers:
            p['cites'] = cites[p['id']]
        return papers, cites

    def lookup_author(self, author_query):

        from scholarly import scholarly
        import operator
        # Retrieve the author's data, fill-in, and print
        print("Searching Author: " + author_query)
        search_result = next(scholarly.search_author(author_query), None)

        if search_result is not None:
            author = scholarly.fill(search_result)
            author_stats = {
                'name': author_query,
                'affiliation': author['affiliation'] if author['affiliation'] else None,
                'citedby': author['citedby'] if 'citedby' in author.keys() else 0,
                'most_cited_year': max(author['cites_per_year'].items(), key=operator.itemgetter(1))[0] if len(
                    author['cites_per_year']) > 0 else None,
                'coauthors': [c['name'] for c in author['coauthors']],
                'hindex': author['hindex'],
                'impact': author['i10index'],
                'interests': author['interests'],
                'publications': [{'title': p['bib']['title'], 'citations': p['num_citations']} for p in
                                 author['publications']],
                'url_picture': author['url_picture'],
            }
        else:
            print("author not found")
            author_stats = {
                'name': author_query,
                'affiliation': "",
                'citedby': 0,
                'most_cited_year': None,
                'coauthors': [],
                'hindex': 0,
                'impact': 0,
                'interests': [],
                'publications': [],
                'url_picture': "",
            }

        # pprint(author_stats)
        return author_stats

    def author_stats(self, papers):
        all_authors = []
        for p in papers:
            paper_authors = [a for a in p['authors']]
            all_authors.extend(paper_authors)

        searched_authors = [self.lookup_author(a) for a in set(all_authors)]

        return searched_authors

    def text_similarity(self, text1, text2):
        doc1 = self.similarity_nlp(text1)
        doc2 = self.similarity_nlp(text2)
        return doc1.similarity(doc2)

    def text_para_similarity(self, text, lines):
        doc1 = self.similarity_nlp(text)
        doc2 = self.similarity_nlp(" ".join(lines))
        return doc1.similarity(doc2)

    def para_para_similarity(self, lines1, lines2):
        doc1 = self.similarity_nlp(" ".join(lines1))
        doc2 = self.similarity_nlp(" ".join(lines2))
        return doc1.similarity(doc2)

    def text_image_similarity(self, text, image):
        pass

    def ask(self, corpus, question):
        text = " ".join(corpus)
        import torch
        inputs = self.qatokenizer(question, text, return_tensors='pt')
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = self.qamodel(**inputs, start_positions=start_positions, end_positions=end_positions)
        print("context: " + text)
        print("question: " + question)
        print("outputs: " + str(outputs))
        return outputs

    def zip_outputs(self, dump_dir, query):
        import zipfile

        def zipdir(path, ziph):
            # ziph is zipfile handle
            for root, dirs, files in os.walk(path):
                for file in files:
                    ziph.write(os.path.join(root, file),
                               os.path.relpath(os.path.join(root, file),
                                               os.path.join(path, '../..')))

        zip_name = 'arxiv_dumps_' + query.replace(' ', '_') + '.zip'
        zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
        zipdir(dump_dir, zipf)
        zipf.close()
        return zip_name

    def survey(self, query, max_search=None, num_papers=None, debug=False, weigh_authors=False):
        import joblib
        import os, shutil
        if not max_search:
            max_search = DEFAULTS['max_search']
        if not num_papers:
            num_papers = DEFAULTS['num_papers']
        # arxiv api relevance search and data preparation
        print("\nsearching arXiv for top " + str(max_search) + " papers.. ")
        results, searched_papers = self.search(query, max_search=max_search)
        joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
        print("\nfound " + str(len(searched_papers)) + " papers")

        # paper selection by scibert vector embedding relevance scores
        # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)

        papers_highlighted, papers_selected = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
                                                             searched_papers)

        if weigh_authors:
            authors = self.author_stats(papers_highlighted)

        joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')

        print("\nStandardizing known section headings per paper.. ")
        papers_standardized = self.standardize_headings(papers_highlighted)
        joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')

        print("\nBuilding paper-wise corpus.. ")
        corpus = self.build_corpus(papers_highlighted, searched_papers)
        joblib.dump(corpus, self.dump_dir + 'corpus.dmp')

        print("\nBuilding section-wise corpus.. ")
        corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
        joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')

        print("\nBuilding basic research highlights.. ")
        research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
        joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')

        print("\nReducing corpus to lines.. ")
        corpus_lines = self.get_corpus_lines(corpus)
        joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')

        # temp
        # searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp')
        '''
        papers_highlighted = joblib.load(dump_dir + 'papers_highlighted.dmp')
        corpus = joblib.load(dump_dir + 'corpus.dmp')
        papers_standardized = joblib.load(dump_dir + 'papers_standardized.dmp')
        corpus_sectionwise = joblib.load(dump_dir + 'corpus_sectionwise.dmp')
        research_blocks = joblib.load(dump_dir + 'research_blocks.dmp')
        corpus_lines = joblib.load(dump_dir + 'corpus_lines.dmp')
        '''

        '''
        print("papers_highlighted types:"+ str(np.unique([str(type(p['sections'][0]['highlights'])) for p in papers_highlighted])))
        print("papers_highlighted example:")
        print(random.sample(list(papers_highlighted), 1)[0]['sections'][0]['highlights'])
        print("corpus types:"+ str(np.unique([str(type(txt)) for k,txt in corpus.items()])))
        print("corpus example:")
        print(random.sample(list(corpus.items()), 1)[0])
        print("corpus_lines types:"+ str(np.unique([str(type(txt)) for txt in corpus_lines])))
        print("corpus_lines example:")
        print(random.sample(list(corpus_lines), 1)[0])
        print("corpus_sectionwise types:"+ str(np.unique([str(type(txt)) for k,txt in corpus_sectionwise.items()])))
        print("corpus_sectionwise example:")
        print(random.sample(list(corpus_sectionwise.items()), 1)[0])
        print("research_blocks types:"+ str(np.unique([str(type(txt)) for k,txt in research_blocks.items()])))
        print("research_blocks example:")
        print(random.sample(list(research_blocks.items()), 1)[0])
        '''
        # print("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))

        print("\nBuilding abstract.. ")
        abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
        joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
        '''
        print("abstract_block type:"+ str(type(abstract_block)))
        print("abstract_block:")
        print(abstract_block)
        '''

        print("\nBuilding introduction.. ")
        intro_block = self.get_intro(corpus_sectionwise, research_blocks)
        joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
        '''
        print("intro_block type:"+ str(type(intro_block)))
        print("intro_block:")
        print(intro_block)
        '''
        print("\nBuilding custom sections.. ")
        clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
        joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
        joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')

        '''
        print("clusters extracted")
        print("clustered_sentences types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sentences.items()])))
        print("clustered_sentences example:")
        print(random.sample(list(clustered_sections.items()), 1)[0])
        print("clustered_sections types:"+ str(np.unique([str(type(txt)) for k,txt in clustered_sections.items()])))
        print("clustered_sections example:")
        print(random.sample(list(clustered_sections.items()), 1)[0])
        '''
        clustered_sections['abstract'] = abstract_block
        clustered_sections['introduction'] = intro_block
        joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')

        print("\nBuilding conclusion.. ")
        conclusion_block = self.get_conclusion(clustered_sections)
        joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
        clustered_sections['conclusion'] = conclusion_block
        '''
        print("conclusion_block type:"+ str(type(conclusion_block)))
        print("conclusion_block:")
        print(conclusion_block)
        '''

        survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
        self.build_doc(clustered_sections, papers_standardized, query=query, filename=self.dump_dir + survey_file)

        shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
        shutil.copy(self.dump_dir + survey_file, survey_file)
        assert (os.path.exists(survey_file))
        output_zip = self.zip_outputs(self.dump_dir, query)
        print("\nSurvey complete.. \nSurvey file path :" + os.path.abspath(
            survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))

        return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
    parser.add_argument('query', metavar='query_string', type=str,
                        help='your research query/keywords')
    parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
                        help='maximum number of papers to gaze at - defaults to 100')
    parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
                        help='maximum number of papers to download and analyse - defaults to 20')
    parser.add_argument('--pdf_dir', metavar='pdf_dir', type=str, default=None,
                        help='pdf paper storage directory - defaults to arxiv_data/tarpdfs/')
    parser.add_argument('--txt_dir', metavar='txt_dir', type=str, default=None,
                        help='text-converted paper storage directory - defaults to arxiv_data/fulltext/')
    parser.add_argument('--img_dir', metavar='img_dir', type=str, default=None,
                        help='image storage directory - defaults to arxiv_data/images/')
    parser.add_argument('--tab_dir', metavar='tab_dir', type=str, default=None,
                        help='tables storage directory - defaults to arxiv_data/tables/')
    parser.add_argument('--dump_dir', metavar='dump_dir', type=str, default=None,
                        help='all_output_dir - defaults to arxiv_dumps/')
    parser.add_argument('--models_dir', metavar='save_models_dir', type=str, default=None,
                        help='directory to save models (> 5GB) - defaults to saved_models/')
    parser.add_argument('--title_model_name', metavar='title_model_name', type=str, default=None,
                        help='title model name/tag in hugging-face, defaults to \'Callidior/bert2bert-base-arxiv-titlegen\'')
    parser.add_argument('--ex_summ_model_name', metavar='extractive_summ_model_name', type=str, default=None,
                        help='extractive summary model name/tag in hugging-face, defaults to \'allenai/scibert_scivocab_uncased\'')
    parser.add_argument('--ledmodel_name', metavar='ledmodel_name', type=str, default=None,
                        help='led model(for abstractive summary) name/tag in hugging-face, defaults to \'allenai/led-large-16384-arxiv\'')
    parser.add_argument('--embedder_name', metavar='sentence_embedder_name', type=str, default=None,
                        help='sentence embedder name/tag in hugging-face, defaults to \'paraphrase-MiniLM-L6-v2\'')
    parser.add_argument('--nlp_name', metavar='spacy_model_name', type=str, default=None,
                        help='spacy model name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_scibert\'')
    parser.add_argument('--similarity_nlp_name', metavar='similarity_nlp_name', type=str, default=None,
                        help='spacy downstream model(for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_lg\'')
    parser.add_argument('--kw_model_name', metavar='kw_model_name', type=str, default=None,
                        help='keyword extraction model name/tag in hugging-face, defaults to \'distilbert-base-nli-mean-tokens\'')
    parser.add_argument('--refresh_models', metavar='refresh_models', type=str, default=None,
                        help='Refresh model downloads with given names (needs at least one model name param above), defaults to False')
    parser.add_argument('--high_gpu', metavar='high_gpu', type=str, default=None,
                        help='High GPU usage permitted, defaults to False')

    args = parser.parse_args()

    surveyor = Surveyor(
        pdf_dir=args.pdf_dir,
        txt_dir=args.txt_dir,
        img_dir=args.img_dir,
        tab_dir=args.tab_dir,
        dump_dir=args.dump_dir,
        models_dir=args.models_dir,
        title_model_name=args.title_model_name,
        ex_summ_model_name=args.ex_summ_model_name,
        ledmodel_name=args.ledmodel_name,
        embedder_name=args.embedder_name,
        nlp_name=args.nlp_name,
        similarity_nlp_name=args.similarity_nlp_name,
        kw_model_name=args.kw_model_name,
        refresh_models=args.refresh_models,
        high_gpu=args.high_gpu
    )

    surveyor.survey(args.query, max_search=args.max_search, num_papers=args.num_papers,
                    debug=False, weigh_authors=False)

src/__pycache__/Surveyor.cpython-310.pyc
ADDED
Binary file (47.8 kB).

src/__pycache__/defaults.cpython-310.pyc
ADDED
Binary file (835 Bytes).

src/defaults.py
ADDED
@@ -0,0 +1,20 @@
# defaults for arxiv
DEFAULTS = {
    "max_search": 100,
    "num_papers": 20,
    "high_gpu": False,
    "pdf_dir": "arxiv_data/tarpdfs/",
    "txt_dir": "arxiv_data/fulltext/",
    "img_dir": "arxiv_data/images/",
    "tab_dir": "arxiv_data/tables/",
    "dump_dir": "arxiv_dumps/",
    "models_dir": "saved_models/",
    "title_model_name": "Callidior/bert2bert-base-arxiv-titlegen",
    "ex_summ_model_name": "allenai/scibert_scivocab_uncased",
    "ledmodel_name": "allenai/led-large-16384-arxiv",
    "embedder_name": "paraphrase-MiniLM-L6-v2",
    "nlp_name": "en_core_sci_scibert",
    "similarity_nlp_name": "en_core_sci_lg",
    "kw_model_name": "distilbert-base-nli-mean-tokens",
}

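# --- Aside (not part of the commit): how these DEFAULTS act as fallbacks, mirroring the pattern
# used in Surveyor.survey(), where any argument left as None drops back to the dictionary value.
from src.defaults import DEFAULTS

def resolve(max_search=None, num_papers=None):
    max_search = max_search or DEFAULTS['max_search']
    num_papers = num_papers or DEFAULTS['num_papers']
    return max_search, num_papers
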
src/packages.txt
ADDED
File without changes
survey.py
ADDED
@@ -0,0 +1,72 @@
from src.Surveyor import Surveyor

import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
    parser.add_argument('query', metavar='query_string', type=str,
                        help='your research query/keywords')
    parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
                        help='maximum number of papers to gaze at - defaults to 100')
    parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
                        help='maximum number of papers to download and analyse - defaults to 20')
    parser.add_argument('--pdf_dir', metavar='pdf_dir', type=str, default=None,
                        help='pdf paper storage directory - defaults to arxiv_data/tarpdfs/')
    parser.add_argument('--txt_dir', metavar='txt_dir', type=str, default=None,
                        help='text-converted paper storage directory - defaults to arxiv_data/fulltext/')
    parser.add_argument('--img_dir', metavar='img_dir', type=str, default=None,
                        help='image storage directory - defaults to arxiv_data/images/')
    parser.add_argument('--tab_dir', metavar='tab_dir', type=str, default=None,
                        help='tables storage directory - defaults to arxiv_data/tables/')
    parser.add_argument('--dump_dir', metavar='dump_dir', type=str, default=None,
                        help='all_output_dir - defaults to arxiv_dumps/')
    parser.add_argument('--models_dir', metavar='save_models_dir', type=str, default=None,
                        help='directory to save models (> 5GB) - defaults to saved_models/')
    parser.add_argument('--title_model_name', metavar='title_model_name', type=str, default=None,
                        help='title model name/tag in hugging-face, defaults to \'Callidior/bert2bert-base-arxiv-titlegen\'')
    parser.add_argument('--ex_summ_model_name', metavar='extractive_summ_model_name', type=str, default=None,
                        help='extractive summary model name/tag in hugging-face, defaults to \'allenai/scibert_scivocab_uncased\'')
    parser.add_argument('--ledmodel_name', metavar='ledmodel_name', type=str, default=None,
                        help='led model(for abstractive summary) name/tag in hugging-face, defaults to \'allenai/led-large-16384-arxiv\'')
    parser.add_argument('--embedder_name', metavar='sentence_embedder_name', type=str, default=None,
                        help='sentence embedder name/tag in hugging-face, defaults to \'paraphrase-MiniLM-L6-v2\'')
    parser.add_argument('--nlp_name', metavar='spacy_model_name', type=str, default=None,
                        help='spacy model name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_scibert\'')
    parser.add_argument('--similarity_nlp_name', metavar='similarity_nlp_name', type=str, default=None,
                        help='spacy downstream model(for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_lg\'')
    parser.add_argument('--kw_model_name', metavar='kw_model_name', type=str, default=None,
                        help='keyword extraction model name/tag in hugging-face, defaults to \'distilbert-base-nli-mean-tokens\'')
    parser.add_argument('--refresh_models', metavar='refresh_models', type=str, default=None,
                        help='Refresh model downloads with given names (needs at least one model name param above), defaults to False')
    parser.add_argument('--high_gpu', metavar='high_gpu', type=str, default=None,
                        help='High GPU usage permitted, defaults to False')

    args = parser.parse_args()

    surveyor = Surveyor(
        pdf_dir=args.pdf_dir,
        txt_dir=args.txt_dir,
        img_dir=args.img_dir,
        tab_dir=args.tab_dir,
        dump_dir=args.dump_dir,
        models_dir=args.models_dir,
        title_model_name=args.title_model_name,
        ex_summ_model_name=args.ex_summ_model_name,
        ledmodel_name=args.ledmodel_name,
        embedder_name=args.embedder_name,
        nlp_name=args.nlp_name,
        similarity_nlp_name=args.similarity_nlp_name,
        kw_model_name=args.kw_model_name,
        refresh_models=args.refresh_models,
        high_gpu=args.high_gpu
    )

    surveyor.survey(args.query, max_search=args.max_search, num_papers=args.num_papers,
                    debug=False, weigh_authors=False)

tests/__init__.py
ADDED
File without changes

tests/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (136 Bytes).

tests/__pycache__/test_survey_files.cpython-310-pytest-7.1.2.pyc
ADDED
Binary file (1.21 kB).

tests/test_survey_files.py
ADDED
@@ -0,0 +1,10 @@
import os
from src.Surveyor import Surveyor

def test_files():
    surveyor = Surveyor()
    sample_query = 'quantum entanglement'
    zip_file, survey_file = surveyor.survey(sample_query, max_search=10, num_papers=6,
                                            debug=False, weigh_authors=False)
    assert os.path.exists(zip_file)
    assert os.path.exists(survey_file)