""" | |
util.py | |
author: Colin Clement | |
date: 2019-04-05 | |
This module contains helper functions for loading embeddings and batch | |
loading the full text, since many computers cannot contain the whole | |
fulltext in memory. | |
""" | |
import os | |
import re | |
import numpy as np | |
import pickle | |
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT | |
from arxiv_public_data.oai_metadata import load_metadata | |


def id_to_pathname(aid):
    """
    Make the filename path for a text document, matching the format of
    fulltext creation in `s3_bulk_download`

    Parameters
    ----------
    aid : str
        string of arXiv article id as found in metadata

    Returns
    -------
    pathname : str
        pathname in which to store the article

    Examples
    --------
    >>> id_to_pathname('hep-ph/0001001')  #doctest: +ELLIPSIS
    '.../hep-ph/0001/hep-ph0001001.txt'

    >>> id_to_pathname('1501.13851')  #doctest: +ELLIPSIS
    '.../arxiv/1501/1501.13851.txt'
    """
    if '.' in aid:  # new-style arXiv ID, e.g. '1501.13851'
        yymm = aid.split('.')[0]
        return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')

    # old-style arXiv ID, e.g. 'hep-ph/0001001': split into category and
    # number, re.split(r'(\d+)', 'hep-ph/0001001')[:2] -> ['hep-ph/', '0001001']
    cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
    yymm = arxiv_id[:4]
    return os.path.join(DIR_FULLTEXT, cat, yymm, aid.replace('/', '') + '.txt')


def load_generator(paths, batchsize):
    """
    Creates a generator object for batch loading files from paths

    Parameters
    ----------
    paths : list of str
        filepaths to load
    batchsize : int
        number of files to read per batch

    Yields
    ------
    file_contents : array_like of str
        object array containing the contents of up to `batchsize` files
    """
    assert type(paths) is list, 'Requires a list of paths'
    assert type(batchsize) is int, 'batchsize must be an int'
    assert batchsize > 0, 'batchsize must be positive'

    out = []
    for p in paths:
        with open(p, 'r') as fin:
            out.append(fin.read())
        if len(out) == batchsize:
            yield np.array(out, dtype='object')
            out = []
    if out:  # yield the final, possibly smaller, batch in the same format
        yield np.array(out, dtype='object')
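

# Illustrative sketch only: the two arXiv IDs below are arbitrary examples,
# and the corresponding .txt files must already exist under DIR_FULLTEXT
# for this to run.
def _example_load_generator():
    """Batch-load a couple of documents with `load_generator`."""
    paths = [id_to_pathname('hep-ph/0001001'), id_to_pathname('1501.13851')]
    for batch in load_generator(paths, batchsize=2):
        # each batch is a numpy object array of raw document strings
        print(batch.shape, type(batch[0]))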


def batch_fulltext(batchsize=32, maxnum=None):
    """
    Read metadata and find corresponding files in the fulltext

    Parameters
    ----------
    (optional)
    batchsize : int
        number of fulltext files to load into a batch
    maxnum : int
        the maximum number of paths to feed the generator, for
        testing purposes

    Returns
    -------
    md_index, all_ids, load_gen : tuple of (list, list, generator)
        md_index lists, for each existing fulltext file in order of
        appearance, the index of its corresponding metadata entry.
        all_ids is a list of all arXiv IDs in the metadata. load_gen
        is a generator which allows batched loading of the full text,
        as defined by `load_generator`
    """
    all_ids = [m['id'] for m in load_metadata()]
    all_paths = [id_to_pathname(aid) for aid in all_ids]
    exists = [os.path.exists(p) for p in all_paths]
    existing_paths = [p for p, e in zip(all_paths, exists) if e][:maxnum]
    # keep md_index aligned with existing_paths, including the maxnum cutoff
    md_index = [i for i, e in enumerate(exists) if e][:maxnum]
    return md_index, all_ids, load_generator(existing_paths, batchsize)
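

# Illustrative sketch only: it assumes the OAI metadata and at least some
# fulltext files have already been downloaded into DIR_FULLTEXT.
def _example_batch_fulltext():
    """Report fulltext coverage and iterate over a few small batches."""
    md_index, all_ids, load_gen = batch_fulltext(batchsize=8, maxnum=32)
    print('metadata records: %d, fulltext files indexed: %d'
          % (len(all_ids), len(md_index)))
    for batch in load_gen:
        print('loaded a batch of %d documents' % len(batch))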


def load_embeddings(filename, headers=0):
    """
    Loads vector embeddings

    Parameters
    ----------
    filename : str
        path to vector embeddings saved by `create_save_embeddings`
    (optional)
    headers : int
        number of pickle calls containing metadata separate from the
        embeddings

    Returns
    -------
    embeddings : dict
        dict with key 'embeddings' containing the vector embeddings and
        key 'headers' containing the metadata
    """
    out = {'embeddings': [], 'headers': []}
    N = 0
    with open(filename, 'rb') as fin:
        while True:
            try:
                # the first `headers` pickle calls hold metadata;
                # the rest hold batches of embedding vectors
                if N < headers:
                    out['headers'].append(pickle.load(fin))
                else:
                    out['embeddings'].extend(pickle.load(fin))
            except EOFError:
                break
            N += 1
    out['embeddings'] = np.array(out['embeddings'])
    return out


def fill_zeros(loaded_embedding):
    """
    Fill out zeros in the full-text embedding where full text is missing

    Parameters
    ----------
    loaded_embedding : dict
        dict as returned by `load_embeddings` with 2 headers: the first
        header is the list of metadata indices to which each embedding
        vector corresponds, the second is the list of all article ids

    Returns
    -------
    embeddings : array_like
        vector embeddings of shape (number of articles, embedding dimension),
        with zero rows for articles that have no full text
    """
    md_index = loaded_embedding['headers'][0]
    all_ids = loaded_embedding['headers'][1]
    vectors = loaded_embedding['embeddings']

    output = np.zeros((len(all_ids), vectors.shape[1]))
    # place each fulltext embedding at the row of its metadata index;
    # rows for articles without fulltext remain zero
    for idx, v in zip(md_index, vectors):
        output[idx, :] = v
    return output
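

# Illustrative sketch only: 'fulltext-embeddings.pkl' is a hypothetical
# filename standing in for a file written by `create_save_embeddings` with
# two header pickles (the metadata index list, then the list of all ids)
# stored ahead of the embedding vectors.
def _example_fill_zeros():
    """Load saved embeddings and expand them to one row per article."""
    emb_file = os.path.join(DIR_OUTPUT, 'fulltext-embeddings.pkl')
    loaded = load_embeddings(emb_file, headers=2)
    full = fill_zeros(loaded)
    print(full.shape)  # (number of articles, embedding dimension)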