Spaces:

oceansweep
/

tldw

Running

App Files Files Community

tldw / App_Function_Libraries /Gradio_UI /Arxiv_tab.py

oceansweep

Upload 155 files

43cd37c verified 2 months ago

raw

history blame

9.69 kB

	# Arxiv_tab.py
	# Description: This file contains the Gradio UI for searching, browsing, and ingesting arXiv papers.
	#
	# Imports
	import tempfile
	from datetime import datetime
	import requests

	from App_Function_Libraries.PDF.PDF_Ingestion_Lib import extract_text_and_format_from_pdf
	#
	# Local Imports
	from App_Function_Libraries.Third_Party.Arxiv import convert_xml_to_markdown, fetch_arxiv_xml, parse_arxiv_feed, \
	build_query_url, ARXIV_PAGE_SIZE, fetch_arxiv_pdf_url
	from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
	#
	import gradio as gr
	#
	#####################################################################################################
	#
	# Functions:

	# Seconds to wait on any single arXiv HTTP request before giving up, so a
	# stalled connection cannot block a Gradio worker indefinitely.
	_ARXIV_REQUEST_TIMEOUT = 30


	def _fetch_arxiv_entries(query, author, year, start):
	    """Fetch one page of arXiv search results and return the parsed entries.

	    Args:
	        query: Free-text search query, passed through to ``build_query_url``.
	        author: Author filter, passed through to ``build_query_url``.
	        year: Year filter, passed through to ``build_query_url``.
	        start: Zero-based offset of the first result to request.

	    Returns:
	        Whatever ``parse_arxiv_feed`` returns for the response body; the
	        callers in this file treat it as a list of dicts that have at least
	        ``'title'`` and ``'id'`` keys.

	    Raises:
	        requests.exceptions.RequestException: on network failure, timeout,
	            or a non-2xx HTTP status.
	    """
	    url = build_query_url(query, author, year, start)
	    response = requests.get(url, timeout=_ARXIV_REQUEST_TIMEOUT)
	    response.raise_for_status()
	    return parse_arxiv_feed(response.text)


	def _fetch_arxiv_paper_markdown(paper_id):
	    """Download the PDF for ``paper_id`` and return its extracted text.

	    The PDF is written to a temporary file because
	    ``extract_text_and_format_from_pdf`` is called with a filesystem path.
	    The temporary file is removed once extraction has finished, whether it
	    succeeded or not.

	    Args:
	        paper_id: arXiv identifier accepted by ``fetch_arxiv_pdf_url``.

	    Returns:
	        The value returned by ``extract_text_and_format_from_pdf``.

	    Raises:
	        requests.exceptions.RequestException: if the download fails or
	            times out.
	        Exception: anything raised by ``fetch_arxiv_pdf_url`` or
	            ``extract_text_and_format_from_pdf`` is propagated unchanged.
	    """
	    # Function-scope import so this block does not depend on the file's
	    # top-level import list.
	    import os

	    pdf_url = fetch_arxiv_pdf_url(paper_id)
	    response = requests.get(pdf_url, timeout=_ARXIV_REQUEST_TIMEOUT)
	    response.raise_for_status()

	    # delete=False: the file is reopened by name after this handle is closed,
	    # which an auto-deleting NamedTemporaryFile does not allow on Windows.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
	        temp_pdf.write(response.content)
	        temp_pdf_path = temp_pdf.name

	    try:
	        # NOTE(review): assumes the extractor reads the file eagerly and does
	        # not need the path after it returns - confirm in PDF_Ingestion_Lib.
	        return extract_text_and_format_from_pdf(temp_pdf_path)
	    finally:
	        try:
	            os.remove(temp_pdf_path)
	        except OSError:
	            # Best-effort cleanup; a leftover temp file must not mask the
	            # extraction result or the original exception.
	            pass


	def create_arxiv_tab():
	    """Build the "Arxiv Search & Ingest" Gradio tab and wire its events.

	    Must be called inside an active ``gr.Blocks`` context. The tab lets the
	    user search arXiv, page through results, view a paper's extracted
	    full text, and ingest the selected paper into the media database.

	    Per-session data is carried in two ``gr.State`` components that are
	    passed to the handlers as inputs and returned as outputs:

	    * ``state``: dict with keys ``start``, ``current_page``, ``last_query``
	      (``None`` or a ``(query, author, year)`` tuple) and ``entries``.
	    * ``selected_paper_id``: id of the paper currently shown in the details
	      view, or ``None``.

	    Handlers never touch ``<component>.value`` directly: that attribute is
	    the component's initial value shared by every session, not the live
	    value of the current session.
	    """
	    with gr.TabItem("Arxiv Search & Ingest", visible=True):
	        gr.Markdown("# arXiv Search, Browse, Download, and Ingest")
	        gr.Markdown("#### Thank you to arXiv for use of its open access interoperability.")
	        with gr.Row():
	            with gr.Column(scale=1):
	                # Search Inputs
	                with gr.Row():
	                    with gr.Column():
	                        search_query = gr.Textbox(label="Search Query", placeholder="e.g., machine learning")
	                        author_filter = gr.Textbox(label="Author", placeholder="e.g., John Doe")
	                        year_filter = gr.Number(label="Year", precision=0)
	                        search_button = gr.Button("Search")

	            with gr.Column(scale=2):
	                # Result list and pagination controls
	                paper_selector = gr.Radio(label="Select a Paper", choices=[], interactive=True)
	                prev_button = gr.Button("Previous Page")
	                next_button = gr.Button("Next Page")
	                page_info = gr.Textbox(label="Page", value="1", interactive=False)

	        # Ingestion Section
	        with gr.Row():
	            with gr.Column():
	                # Paper Details View
	                paper_view = gr.Markdown(label="Paper Details")
	                arxiv_keywords = gr.Textbox(label="Additional Keywords (comma-separated)",
	                                            placeholder="e.g., AI, Deep Learning")
	                ingest_button = gr.Button("Ingest Selected Paper")
	                ingest_result = gr.Textbox(label="Ingestion Result", interactive=False)

	        # Define States for Pagination and Selection
	        state = gr.State(value={"start": 0, "current_page": 1, "last_query": None, "entries": []})
	        selected_paper_id = gr.State(value=None)

	        def search_arxiv(query, author, year, current_state):
	            """Run a new search and show its first page.

	            Returns a 3-tuple for ``(paper_selector, page_info, state)``.
	            On a request error the state is returned unchanged and the
	            error text is shown in the page box.
	            """
	            start = 0
	            try:
	                entries = _fetch_arxiv_entries(query, author, year, start)
	            except requests.exceptions.RequestException as e:
	                return gr.update(value=None), gr.update(value=f"Error: {str(e)}"), current_state

	            new_state = {
	                "start": start,
	                "current_page": 1,
	                "last_query": (query, author, year),
	                "entries": entries,
	            }
	            if not entries:
	                # Clear the list too, so stale titles from a previous search
	                # cannot be selected against the now-empty entries.
	                return gr.update(choices=[], value=None), "No results found.", new_state

	            # Update the radio list with paper titles for selection
	            titles = [entry['title'] for entry in entries]
	            return gr.update(choices=titles, value=None), "1", new_state

	        def handle_pagination(direction, current_state):
	            """Move ``direction`` pages (+1 / -1) within the last search.

	            Returns a 3-tuple for ``(paper_selector, page_info, state)``.
	            Nothing changes when no search has been run yet, when already
	            on page 1 and going back, when the request fails, or when the
	            requested page has no entries.
	            """
	            last_query = current_state["last_query"]
	            if last_query is None:
	                # No search yet: there is nothing to paginate.
	                return gr.update(), gr.update(), current_state

	            query, author, year = last_query
	            new_page = max(1, current_state["current_page"] + direction)
	            if new_page == current_state["current_page"]:
	                # Already on the first page; avoid a pointless re-fetch.
	                return gr.update(), gr.update(), current_state

	            start = (new_page - 1) * ARXIV_PAGE_SIZE
	            try:
	                entries = _fetch_arxiv_entries(query, author, year, start)
	            except requests.exceptions.RequestException:
	                # Keep the current page visible rather than replacing the
	                # page number with an error message.
	                return gr.update(), gr.update(), current_state

	            if not entries:
	                # If no entries, do not change the page
	                return gr.update(), gr.update(), current_state

	            new_state = {
	                **current_state,
	                "start": start,
	                "current_page": new_page,
	                "entries": entries,
	            }
	            # Update the radio list with paper titles for the new page
	            titles = [entry['title'] for entry in entries]
	            return gr.update(choices=titles, value=None), str(new_page), new_state

	        def go_to_next_page(current_state):
	            """Click handler for the "Next Page" button."""
	            return handle_pagination(1, current_state)

	        def go_to_previous_page(current_state):
	            """Click handler for the "Previous Page" button."""
	            return handle_pagination(-1, current_state)

	        def load_selected_paper(selected_title, current_state):
	            """Show the full text of the paper whose title was selected.

	            Returns a 2-tuple for ``(paper_view, selected_paper_id)``. The
	            id is ``None`` whenever no paper text is being displayed, so the
	            ingest button can never act on a paper the user is not seeing.
	            """
	            if not selected_title:
	                return "Please select a paper to view.", None

	            # Find the selected paper among the current page's entries
	            # (first match wins if two entries share a title).
	            entry = next(
	                (item for item in current_state["entries"] if item['title'] == selected_title),
	                None,
	            )
	            if entry is None:
	                return "Paper not found.", None

	            paper_id = entry['id']
	            try:
	                return _fetch_arxiv_paper_markdown(paper_id), paper_id
	            except Exception as e:
	                # UI boundary: report the failure in the details view.
	                return f"Error loading full paper: {str(e)}", None

	        def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
	            """Download, convert and store the paper identified by ``paper_id``.

	            Returns a status message for the "Ingestion Result" box; errors
	            are reported in that message instead of being raised.
	            """
	            if not paper_id:
	                return "Please select a paper to ingest."

	            try:
	                # Full paper text extracted from the PDF
	                markdown_text = _fetch_arxiv_paper_markdown(paper_id)

	                # Fetch metadata from arXiv to get title, authors, and categories
	                xml_content = fetch_arxiv_xml(paper_id)
	                _, title, authors, categories = convert_xml_to_markdown(xml_content)

	                # Prepare the arXiv paper URL for access/download
	                paper_url = f"https://arxiv.org/abs/{paper_id}"

	                # Prepare the keywords for ingestion; joining a list avoids
	                # an empty keyword when there are no categories.
	                keyword_parts = ["arxiv", *categories]
	                if additional_keywords and additional_keywords.strip():
	                    keyword_parts.append(additional_keywords.strip())
	                keywords = ",".join(keyword_parts)

	                author_names = ', '.join(authors)

	                # Ingest full paper markdown content
	                add_media_with_keywords(
	                    url=paper_url,
	                    title=title,
	                    media_type='document',
	                    content=markdown_text,  # Full paper content in markdown
	                    keywords=keywords,
	                    prompt='No prompt for arXiv papers',
	                    summary='Full arXiv paper ingested from PDF',
	                    transcription_model='None',
	                    author=author_names,
	                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
	                )

	                # Return success message with paper title and authors
	                return f"arXiv paper '{title}' by {author_names} ingested successfully."
	            except Exception as e:
	                # Return error message if anything goes wrong
	                return f"Error processing arXiv paper: {str(e)}"

	        # Event Handlers
	        # Connect Search Button
	        search_button.click(
	            fn=search_arxiv,
	            inputs=[search_query, author_filter, year_filter, state],
	            outputs=[paper_selector, page_info, state],
	            queue=True
	        )

	        # Connect Next Button
	        next_button.click(
	            fn=go_to_next_page,
	            inputs=[state],
	            outputs=[paper_selector, page_info, state],
	            queue=True
	        )

	        # Connect Previous Button
	        prev_button.click(
	            fn=go_to_previous_page,
	            inputs=[state],
	            outputs=[paper_selector, page_info, state],
	            queue=True
	        )

	        # When the user selects a paper in the Radio list
	        paper_selector.change(
	            fn=load_selected_paper,
	            inputs=[paper_selector, state],
	            outputs=[paper_view, selected_paper_id],
	            queue=True
	        )

	        # Connect Ingest Button
	        ingest_button.click(
	            fn=process_and_ingest_arxiv_paper,
	            inputs=[selected_paper_id, arxiv_keywords],
	            outputs=ingest_result,
	            queue=True
	        )

	#
	# End of File
	#####################################################################################################