arXiv2Latex / backend.py
yiyixin's picture
upload
6be34e2
import tarfile
import os
import requests
import datetime
import pandas as pd
import shutil
from bs4 import BeautifulSoup
from tqdm import tqdm
import base64
def ToBase64(file):
    """Return the Base64 encoding (as ``bytes``) of the file at path *file*.

    The file is opened in binary mode and read in full before encoding.
    """
    with open(file, 'rb') as handle:
        return base64.b64encode(handle.read())
def archive_dir(dir_name,output_filename,format="zip"):
    """Pack the contents of a directory into a single archive file.

    :param dir_name: directory whose contents become the root of the archive
    :param output_filename: path of the archive to create, *without* extension
    :param format: archive format name understood by ``shutil.make_archive``
                   (``"zip"`` by default)
    :return: ``output_filename`` followed by the extension that was actually
             written (``.zip`` for the default format)
    """
    created = shutil.make_archive(output_filename, format, dir_name)
    # make_archive appends the format-specific extension itself. Recover that
    # extension from the name it reports instead of assuming ".zip", so the
    # returned path is also right for "tar", "gztar", etc.
    stem_length = len(os.path.basename(output_filename))
    extension = os.path.basename(created)[stem_length:]
    return output_filename + extension
def make_dir_if_not_exist(folder):
    """Create *folder* (and any missing parent directories) if it is absent.

    An already existing directory is left untouched.

    :param folder: path of the directory to ensure
    :raises FileExistsError: if *folder* exists but is not a directory
    """
    # exist_ok avoids the check-then-create race of a separate exists() test:
    # another process creating the directory in between no longer causes a
    # failure.
    os.makedirs(folder, exist_ok=True)
def untar(fname, dirs):
    """
    Extract a tar archive (optionally compressed, e.g. ``.tar.gz``).

    :param fname: path of the archive file
    :param dirs: directory the archive contents are extracted into
    :return: bool -- True on success, False if opening or extracting failed
             (the error is printed)
    """
    try:
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(fname) as archive:
            if hasattr(tarfile, "data_filter"):
                # The archive may be untrusted: the "data" filter rejects
                # members that would be written outside ``dirs`` (absolute
                # paths, ".." components, escaping links).
                archive.extractall(path=dirs, filter="data")
            else:
                # Interpreters that predate extraction filters.
                archive.extractall(path=dirs)
        return True
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with the False result.
        print(e)
        return False
def get_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    # Formatting the datetime directly gives the same result as the former
    # detour through str() and pandas.to_datetime, without the re-parsing.
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')
def get_name_from_arvix(url, timeout=30):
    """
    Download a page and derive a hyphenated paper title from its heading.

    The title is read from the first ``<h1 class="title mathjax">`` element.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up
    :return: the heading text with spaces replaced by hyphens, or ``''`` when
             the page contains no such heading
    """
    # Without a timeout, requests can block indefinitely on a stalled server.
    page = requests.get(url, timeout=timeout)
    heading = BeautifulSoup(page.content, 'lxml').find("h1", attrs={"class": "title mathjax"})
    if heading is None:
        return ''
    title = heading.text.strip()
    # The previous code unconditionally dropped the first six characters --
    # presumably the "Title:" label of the heading. Remove the label only when
    # it is really present so other headings are not truncated.
    label = "Title:"
    if title.startswith(label):
        title = title[len(label):].strip()
    return title.replace(" ", "-")
def download_source(pdf_lists=None,output_base=None,project_name=None,fetch_title=True, return_source=False):
    """
    Download the e-print sources for a list of arXiv links and archive them.

    A project folder ``<output_base>/<project_name><timestamp>`` is created
    with an ``input`` sub-folder (the raw downloads) and an ``output``
    sub-folder (one extracted directory per paper). The ``output`` folder is
    finally archived inside the project folder.

    :param pdf_lists: iterable of links; the last path segment of each link is
                      used as the paper id. ``None`` is treated as empty.
    :param output_base: directory under which the project folder is created
    :param project_name: prefix of the project folder and archive name
    :param fetch_title: when True, name each extracted directory after the
                        paper title and skip links whose title cannot be
                        found; when False, use the paper id instead
    :param return_source: when True, only print each source URL and download
                          nothing
    :return: None
    :raises TypeError: if ``output_base`` or ``project_name`` is None
    """
    if output_base is None or project_name is None:
        raise TypeError("output_base and project_name must both be provided")
    project_name = project_name + get_timestamp()
    base = os.path.join(output_base, project_name)
    inp = os.path.join(base, 'input')
    out = os.path.join(base, 'output')
    # Create the folders up front: the archive step below needs ``out`` to
    # exist even when the list is empty or every link gets skipped.
    for folder in (base, inp, out):
        make_dir_if_not_exist(folder)
    links = [] if pdf_lists is None else pdf_lists
    for pdf_link in tqdm(links):
        file_stamp = pdf_link.rstrip("/").split("/")[-1]
        # A trailing ".pdf" (as in links copied from the PDF view) is not part
        # of the paper id used by the e-print endpoint.
        if file_stamp.endswith(".pdf"):
            file_stamp = file_stamp[:-len(".pdf")]
        if fetch_title:
            title = get_name_from_arvix(pdf_link)
            if not title:
                continue
            # Path separators in a title would otherwise change where the
            # paper is extracted (or escape ``out`` for a leading separator).
            title = title.replace("/", "-").replace(os.sep, "-")
        else:
            title = file_stamp
        source_link = "https://arxiv.org/e-print/" + file_stamp
        if return_source:
            print(source_link)
            continue
        response = requests.get(source_link, timeout=60)
        if not response.ok:
            # Do not store an error page as if it were an archive.
            print("skipping {}: HTTP {}".format(source_link, response.status_code))
            continue
        filepath = os.path.join(inp, file_stamp + ".tar.gz")
        with open(filepath, "wb") as archive_file:
            archive_file.write(response.content)
        untar(filepath, os.path.join(out, title))
    archive_dir(out, os.path.join(base, project_name))
if __name__ == '__main__':
    # Manual smoke check: show the timestamp string used for project names.
    print(get_timestamp())