pdf-toweb

Sleeping

App Files Files Community

pdf-toweb / utils.py

Corran

Create utils.py

a917763 verified 5 months ago

raw

history blame contribute delete

3.28 kB

	from packaging.version import Version, InvalidVersion
	from bs4 import NavigableString,Tag
	import requests
	import xml.etree.ElementTree as ET

	def Normalize_Section(section_number):
	    """Return the canonical string form of a section number, or "" if invalid.

	    The value is parsed with ``packaging.version.Version`` and converted back
	    to a string, so the result is whatever canonical spelling ``Version``
	    produces for it.

	    Args:
	        section_number: Section label to normalize (expected to be a string
	            such as ``"2.1"``).

	    Returns:
	        ``str(Version(section_number))`` when the value parses, otherwise the
	        empty string.
	    """
	    try:
	        return str(Version(section_number))
	    except (InvalidVersion, TypeError):
	        # InvalidVersion: the string is not a parseable version.
	        # TypeError: the input is not a string at all (e.g. None); treat it
	        # like any other unparseable section number instead of crashing.
	        return ""


	def Get_Bibliography(article):
	    """Collect the bibliographic entries of a parsed TEI document.

	    Args:
	        article: Parsed document exposing the BeautifulSoup ``find_all`` API
	            and containing ``biblStruct`` elements.

	    Returns:
	        A dict keyed by each entry's ``xml:id`` attribute. Every value is a
	        dict with the keys ``title``, ``authors``, ``journal``, ``volume``,
	        ``issue``, ``pages``, ``year`` and ``doi``. ``authors`` is a list of
	        strings; every other field is a string, or ``None`` when the
	        corresponding element is absent.
	    """
	    bibliography = {}

	    # The first biblStruct is skipped, as in the original code -- presumably
	    # it describes the document itself rather than a cited work (TODO confirm).
	    for entry in article.find_all('biblStruct')[1:]:
	        # Look each container up once and guard against it being absent, so
	        # an incomplete entry yields None fields instead of AttributeError.
	        monogr = entry.find('monogr')
	        imprint = entry.find('imprint')
	        date = imprint.find('date') if imprint is not None else None

	        bibliography[entry.get('xml:id')] = {
	            'title': _bibl_text(entry.find('title')),
	            'authors': [_bibl_author_name(a) for a in entry.find_all('author')],
	            'journal': _bibl_text(monogr.find('title')) if monogr is not None else None,
	            'volume': _bibl_text(entry.find('biblScope', {'unit': 'volume'})),
	            'issue': _bibl_text(entry.find('biblScope', {'unit': 'issue'})),
	            'pages': _bibl_text(entry.find('biblScope', {'unit': 'page'})),
	            'year': date.get('when') if date is not None else None,
	            'doi': _bibl_text(entry.find('idno', {'type': 'DOI'})),
	        }

	    return bibliography


	def _bibl_text(node):
	    """Return ``node.text``, or ``None`` when the element was not found."""
	    return node.text if node is not None else None


	def _bibl_author_name(author):
	    """Join the text of an author's name parts with single spaces.

	    The parts are the tags inside ``<persName>``. When an author has no
	    ``<persName>``, the tags directly inside ``<author>`` are used instead so
	    the entry is still processed rather than raising on ``None``.
	    """
	    name = author.find('persName')
	    if name is None:
	        name = author
	    return " ".join(part.text for part in name.find_all())

	def GParse_Header(pdf):
	    """Upload a PDF to the GROBID header endpoint and return the response body.

	    Args:
	        pdf: The PDF payload, passed to ``requests`` as the content of the
	            multipart ``input`` file part.

	    Returns:
	        The response body as text. The HTTP status is not checked, so an
	        error response is returned as-is.

	    Raises:
	        requests.exceptions.RequestException: If the request cannot be
	            completed, including exceeding the 60-second timeout.
	    """
	    upload = {"input": ("", pdf, "application/pdf", {"Expires": "0"})}
	    form = {
	        'generateIDs': 1,
	        'consolidateHeader': 0,
	        'segmentSentences': 1,
	        'teiCoordinates': ["head", "s", "p"],
	    }

	    # The low-priority catch-all must be the wildcard media range "*/*";
	    # a bare "/" is not a valid media range in an Accept header.
	    headers = {"Accept": "application/xml, text/xml, */*; q=0.01"}

	    r = requests.post(
	        "https://kaiserml-grobid.hf.space/api/processHeaderDocument/",
	        headers=headers,
	        files=upload,
	        data=form,
	        timeout=60,
	    )
	    return r.text

	def GParse_Paper(pdf):
	    """Upload a PDF to the GROBID full-text endpoint and return the response body.

	    Args:
	        pdf: The PDF payload, passed to ``requests`` as the content of the
	            multipart ``input`` file part.

	    Returns:
	        The response body as text; the HTTP status is not inspected.
	    """
	    endpoint = "https://Kaiserml-grobid.hf.space/api/processFulltextDocument/"

	    # Only "head" coordinates are requested. Sentence segmentation and the
	    # wider coordinate set (s, p, figure, formula, note, title) are not sent.
	    form_fields = {
	        'generateIDs': 1,
	        "teiCoordinates": ["head"],
	    }
	    upload = {"input": ("", pdf, "application/pdf", {"Expires": "0"})}

	    response = requests.post(
	        endpoint,
	        headers={"Accept": "application/xml, text/xml"},
	        files=upload,
	        data=form_fields,
	        timeout=60,
	    )
	    return response.text


	def Resolve_GHeader(xml):
	    """Look up OpenAlex work suggestions for the title found in a TEI document.

	    Args:
	        xml: TEI XML as a string; the first ``title`` element in the
	            ``http://www.tei-c.org/ns/1.0`` namespace supplies the query.

	    Returns:
	        The ``results`` list from the OpenAlex works autocomplete endpoint,
	        or an empty list when the document has no title or the title is blank.

	    Raises:
	        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
	        requests.exceptions.RequestException: If the lookup cannot be
	            completed, including exceeding the 60-second timeout.
	    """
	    root = ET.fromstring(xml)
	    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

	    title_el = root.find('.//tei:title', namespaces=ns)
	    # itertext() also picks up text nested in child markup, and tolerates an
	    # empty element (where .text would be None).
	    title = "".join(title_el.itertext()).strip() if title_el is not None else ""
	    if not title:
	        # Nothing to search for; skip the network call entirely.
	        return []

	    # Pass the title via params so requests URL-encodes characters such as
	    # '&', '#' and '?', which would otherwise corrupt the query string.
	    response = requests.get(
	        "https://api.openalex.org/autocomplete/works",
	        params={"q": title},
	        timeout=60,
	    )
	    return response.json()['results']