""" | |
util.py | |
author: Colin Clement | |
date: 2019-04-05 | |
This module contains helper functions for loading embeddings and batch | |
loading the full text, since many computers cannot contain the whole | |
fulltext in memory. | |
""" | |
import os | |
import re | |
import numpy as np | |
import pickle | |
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT | |
from arxiv_public_data.oai_metadata import load_metadata | |


def id_to_pathname(aid):
    """
    Make the filename path for a text document, matching the format of
    fulltext creation in `s3_bulk_download`

    Parameters
    ----------
    aid : str
        string of arXiv article id as found in metadata

    Returns
    -------
    pathname : str
        pathname in which to store the article

    Examples
    --------
    >>> id_to_pathname('hep-ph/0001001')  #doctest: +ELLIPSIS
    '.../hep-ph/0001/hep-ph0001001.txt'

    >>> id_to_pathname('1501.13851')  #doctest: +ELLIPSIS
    '.../arxiv/1501/1501.13851.txt'
    """
    if '.' in aid:  # new-style arXiv ID, e.g. '1501.13851'
        yymm = aid.split('.')[0]
        return os.path.join(DIR_FULLTEXT, 'arxiv', yymm, aid + '.txt')

    # old-style arXiv ID, e.g. 'hep-ph/0001001': split into category and
    # number, re.split(r'(\d+)', 'hep-ph/0001001')[:2] -> ['hep-ph/', '0001001']
    cat, arxiv_id = re.split(r'(\d+)', aid)[:2]
    yymm = arxiv_id[:4]
    return os.path.join(DIR_FULLTEXT, cat, yymm, aid.replace('/', '') + '.txt')


def load_generator(paths, batchsize):
    """
    Creates a generator object for batch loading files from paths

    Parameters
    ----------
    paths : list of str
        filepaths to load
    batchsize : int
        number of files to read per batch

    Yields
    ------
    file_contents : array_like of str
        object array containing the contents of up to `batchsize` files
    """
    assert type(paths) is list, 'Requires a list of paths'
    assert type(batchsize) is int, 'batchsize must be an int'
    assert batchsize > 0, 'batchsize must be positive'

    out = []
    for p in paths:
        with open(p, 'r') as fin:
            out.append(fin.read())
        if len(out) == batchsize:
            yield np.array(out, dtype='object')
            out = []
    if out:  # yield the final, possibly smaller, batch in the same format
        yield np.array(out, dtype='object')
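

# Illustrative sketch only: the two arXiv IDs below are arbitrary examples,
# and the corresponding .txt files must already exist under DIR_FULLTEXT
# for this to run.
def _example_load_generator():
    """Batch-load a couple of documents with `load_generator`."""
    paths = [id_to_pathname('hep-ph/0001001'), id_to_pathname('1501.13851')]
    for batch in load_generator(paths, batchsize=2):
        # each batch is a numpy object array of raw document strings
        print(batch.shape, type(batch[0]))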


def batch_fulltext(batchsize=32, maxnum=None):
    """
    Read metadata and find corresponding files in the fulltext

    Parameters
    ----------
    (optional)
    batchsize : int
        number of fulltext files to load into a batch
    maxnum : int
        the maximum number of paths to feed the generator, for
        testing purposes

    Returns
    -------
    md_index, all_ids, load_gen : tuple of (list, list, generator)
        md_index lists, for each existing fulltext file in order of
        appearance, the index of its corresponding metadata entry.
        all_ids is a list of all arXiv IDs in the metadata. load_gen
        is a generator which allows batched loading of the full text,
        as defined by `load_generator`
    """
    all_ids = [m['id'] for m in load_metadata()]
    all_paths = [id_to_pathname(aid) for aid in all_ids]
    exists = [os.path.exists(p) for p in all_paths]
    existing_paths = [p for p, e in zip(all_paths, exists) if e][:maxnum]
    # keep md_index aligned with existing_paths, including the maxnum cutoff
    md_index = [i for i, e in enumerate(exists) if e][:maxnum]
    return md_index, all_ids, load_generator(existing_paths, batchsize)
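

# Illustrative sketch only: it assumes the OAI metadata and at least some
# fulltext files have already been downloaded into DIR_FULLTEXT.
def _example_batch_fulltext():
    """Report fulltext coverage and iterate over a few small batches."""
    md_index, all_ids, load_gen = batch_fulltext(batchsize=8, maxnum=32)
    print('metadata records: %d, fulltext files indexed: %d'
          % (len(all_ids), len(md_index)))
    for batch in load_gen:
        print('loaded a batch of %d documents' % len(batch))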


def load_embeddings(filename, headers=0):
    """
    Loads vector embeddings

    Parameters
    ----------
    filename : str
        path to vector embeddings saved by `create_save_embeddings`
    (optional)
    headers : int
        number of pickle calls containing metadata separate from the
        embeddings

    Returns
    -------
    embeddings : dict
        dict with key 'embeddings' containing the vector embeddings and
        key 'headers' containing the metadata
    """
    out = {'embeddings': [], 'headers': []}
    N = 0
    with open(filename, 'rb') as fin:
        while True:
            try:
                # the first `headers` pickle calls hold metadata;
                # the rest hold batches of embedding vectors
                if N < headers:
                    out['headers'].append(pickle.load(fin))
                else:
                    out['embeddings'].extend(pickle.load(fin))
            except EOFError:
                break
            N += 1
    out['embeddings'] = np.array(out['embeddings'])
    return out


def fill_zeros(loaded_embedding):
    """
    Fill out zeros in the full-text embedding where full text is missing

    Parameters
    ----------
    loaded_embedding : dict
        dict as returned by `load_embeddings` with 2 headers: the first
        header is the list of metadata indices to which each embedding
        vector corresponds, the second is the list of all article ids

    Returns
    -------
    embeddings : array_like
        vector embeddings of shape (number of articles, embedding dimension),
        with zero rows for articles that have no full text
    """
    md_index = loaded_embedding['headers'][0]
    all_ids = loaded_embedding['headers'][1]
    vectors = loaded_embedding['embeddings']

    output = np.zeros((len(all_ids), vectors.shape[1]))
    # place each fulltext embedding at the row of its metadata index;
    # rows for articles without fulltext remain zero
    for idx, v in zip(md_index, vectors):
        output[idx, :] = v
    return output
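

# Illustrative sketch only: 'fulltext-embeddings.pkl' is a hypothetical
# filename standing in for a file written by `create_save_embeddings` with
# two header pickles (the metadata index list, then the list of all ids)
# stored ahead of the embedding vectors.
def _example_fill_zeros():
    """Load saved embeddings and expand them to one row per article."""
    emb_file = os.path.join(DIR_OUTPUT, 'fulltext-embeddings.pkl')
    loaded = load_embeddings(emb_file, headers=2)
    full = fill_zeros(loaded)
    print(full.shape)  # (number of articles, embedding dimension)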