# Arxiv_tab.py
# Description: This file contains the Gradio UI for searching, browsing, and ingesting arXiv papers.
#
# Imports
import os
import tempfile
from datetime import datetime

import requests

from App_Function_Libraries.PDF.PDF_Ingestion_Lib import extract_text_and_format_from_pdf
#
# Local Imports
from App_Function_Libraries.Third_Party.Arxiv import (
    convert_xml_to_markdown,
    fetch_arxiv_xml,
    parse_arxiv_feed,
    build_query_url,
    ARXIV_PAGE_SIZE,
    fetch_arxiv_pdf_url,
)
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
import gradio as gr
#
#####################################################################################################
#
# Functions:

# Upper bound (seconds) for every outbound HTTP request made from this tab, so
# a stalled connection cannot block a Gradio worker forever.
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_paper_markdown(paper_id):
    """Download the PDF for *paper_id* and return its extracted text.

    The PDF URL is resolved with ``fetch_arxiv_pdf_url``, the file is
    downloaded to a temporary ``.pdf`` file, and that path is handed to
    ``extract_text_and_format_from_pdf``. The temporary file is removed
    afterwards, whether or not extraction succeeded.

    Raises:
        requests.exceptions.RequestException: if the download fails or the
            server answers with an error status.
        Exception: whatever ``fetch_arxiv_pdf_url`` or
            ``extract_text_and_format_from_pdf`` raise is propagated.
    """
    pdf_url = fetch_arxiv_pdf_url(paper_id)
    response = requests.get(pdf_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # delete=False: the file is closed before the extractor re-opens it by
    # path, so it must outlive the ``with`` block; we delete it ourselves below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(response.content)
        temp_pdf_path = temp_pdf.name

    try:
        return extract_text_and_format_from_pdf(temp_pdf_path)
    finally:
        # Best-effort cleanup; a failure to delete must not mask the result
        # (or the original exception) of the extraction above.
        try:
            os.remove(temp_pdf_path)
        except OSError:
            pass


def create_arxiv_tab():
    """Build the "Arxiv Search & Ingest" tab and wire up its event handlers.

    Must be called inside an active ``gr.Blocks``/``gr.Tabs`` context. The tab
    offers a search form, a paged list of results, a full-text preview of the
    selected paper, and a button that ingests the selected paper through
    ``add_media_with_keywords``.

    Pagination data and the selected paper id live in ``gr.State`` components.
    Handlers receive them as inputs and return them as outputs instead of
    touching ``<component>.value``: that attribute is the component's initial
    value, not the value of the current session.
    """
    with gr.TabItem("Arxiv Search & Ingest", visible=True):
        gr.Markdown("# arXiv Search, Browse, Download, and Ingest")
        gr.Markdown("#### Thank you to arXiv for use of its open access interoperability.")
        with gr.Row():
            with gr.Column(scale=1):
                # Search Inputs
                with gr.Row():
                    with gr.Column():
                        search_query = gr.Textbox(label="Search Query", placeholder="e.g., machine learning")
                        author_filter = gr.Textbox(label="Author", placeholder="e.g., John Doe")
                        year_filter = gr.Number(label="Year", precision=0)
                        search_button = gr.Button("Search")

            with gr.Column(scale=2):
                # Pagination Controls
                paper_selector = gr.Radio(label="Select a Paper", choices=[], interactive=True)
                prev_button = gr.Button("Previous Page")
                next_button = gr.Button("Next Page")
                page_info = gr.Textbox(label="Page", value="1", interactive=False)

        # Ingestion Section
        with gr.Row():
            with gr.Column():
                # Paper Details View
                paper_view = gr.Markdown(label="Paper Details")
                arxiv_keywords = gr.Textbox(label="Additional Keywords (comma-separated)",
                                            placeholder="e.g., AI, Deep Learning")
                ingest_button = gr.Button("Ingest Selected Paper")
                ingest_result = gr.Textbox(label="Ingestion Result", interactive=False)

        # Define States for Pagination and Selection
        state = gr.State(value={"start": 0, "current_page": 1, "last_query": None, "entries": []})
        selected_paper_id = gr.State(value=None)

        def search_arxiv(query, author, year, current_state):
            """Run a fresh search (first page) and populate the paper list.

            Returns a 3-tuple for ``(paper_selector, page_info, state)``.
            On a request failure the error text is shown in ``page_info``, the
            current selection is cleared, and the state is left unchanged.
            """
            start = 0
            url = build_query_url(query, author, year, start)
            try:
                response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                return gr.update(value=None), f"**Error:** {str(e)}", current_state

            entries = parse_arxiv_feed(response.text)
            new_state = {"start": start, "current_page": 1, "last_query": (query, author, year),
                         "entries": entries}
            if not entries:
                # Clear old choices too, otherwise stale titles could still be
                # selected while the state no longer holds their entries.
                return gr.update(choices=[], value=None), "No results found.", new_state

            # Update the selector with paper titles; reset the selection since
            # the previously selected title may not be among the new choices.
            titles = [entry['title'] for entry in entries]
            return gr.update(choices=titles, value=None), "1", new_state

        # Wired to both the "Previous Page" (-1) and "Next Page" (+1) buttons.
        def handle_pagination(direction, current_state):
            """Move ``direction`` pages (+1/-1) within the last search.

            Returns a 3-tuple for ``(paper_selector, page_info, state)``. Nothing
            changes when no search has been run yet, when the target page equals
            the current one (already on page 1), when the request fails, or when
            the target page has no entries.
            """
            unchanged = (gr.update(), gr.update(), current_state)

            last_query = current_state["last_query"]
            if not last_query:
                # No search has been performed yet; nothing to paginate.
                return unchanged
            query, author, year = last_query

            new_page = max(1, current_state["current_page"] + direction)
            if new_page == current_state["current_page"]:
                return unchanged

            start = (new_page - 1) * ARXIV_PAGE_SIZE
            url = build_query_url(query, author, year, start)
            try:
                response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                return unchanged

            entries = parse_arxiv_feed(response.text)
            if not entries:
                # If no entries, do not change the page
                return unchanged

            # Build a new dict rather than mutating the one we were handed.
            new_state = dict(current_state, start=start, current_page=new_page, entries=entries)
            titles = [entry['title'] for entry in entries]
            return gr.update(choices=titles, value=None), str(new_page), new_state

        def load_selected_paper(selected_title, current_state):
            """Load the full text of the paper whose title was selected.

            Returns a 2-tuple for ``(paper_view, selected_paper_id)``. The id is
            ``None`` whenever the paper could not be resolved or loaded, so the
            ingest button never acts on a stale selection.
            """
            if not selected_title:
                return "Please select a paper to view.", None

            # Find the selected paper in the current page of results
            # (first entry with a matching title wins).
            paper_id = next(
                (entry['id'] for entry in current_state["entries"] if entry['title'] == selected_title),
                None,
            )
            if paper_id is None:
                return "Paper not found.", None

            try:
                full_text_markdown = _fetch_paper_markdown(paper_id)
            except Exception as e:
                # UI boundary: report the failure in the view instead of raising.
                return f"Error loading full paper: {str(e)}", None
            return full_text_markdown, paper_id

        def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
            """Download, convert and store the paper identified by *paper_id*.

            Returns a human-readable status string for ``ingest_result``; any
            exception is converted into an error message.
            """
            if not paper_id:
                return "Please select a paper to ingest."

            try:
                # Full paper text extracted from the PDF
                markdown_text = _fetch_paper_markdown(paper_id)

                # Fetch metadata from arXiv to get title, authors, and categories
                xml_content = fetch_arxiv_xml(paper_id)
                _, title, authors, categories = convert_xml_to_markdown(xml_content)

                # Prepare the arXiv paper URL for access/download
                paper_url = f"https://arxiv.org/abs/{paper_id}"

                # Prepare the keywords for ingestion; joining a list avoids a
                # dangling comma when the paper has no categories.
                keyword_parts = ["arxiv", *categories]
                if additional_keywords:
                    keyword_parts.append(additional_keywords)
                keywords = ",".join(keyword_parts)

                # Ingest full paper markdown content
                add_media_with_keywords(
                    url=paper_url,
                    title=title,
                    media_type='document',
                    content=markdown_text,  # Full paper content in markdown
                    keywords=keywords,
                    prompt='No prompt for arXiv papers',
                    summary='Full arXiv paper ingested from PDF',
                    transcription_model='None',
                    author=', '.join(authors),
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )

                # Return success message with paper title and authors
                return f"arXiv paper '{title}' by {', '.join(authors)} ingested successfully."
            except Exception as e:
                # Return error message if anything goes wrong
                return f"Error processing arXiv paper: {str(e)}"

        # Event Handlers
        # Connect Search Button
        search_button.click(
            fn=search_arxiv,
            inputs=[search_query, author_filter, year_filter, state],
            outputs=[paper_selector, page_info, state],
            queue=True
        )

        # Connect Next Button
        next_button.click(
            fn=lambda current_state: handle_pagination(1, current_state),
            inputs=[state],
            outputs=[paper_selector, page_info, state],
            queue=True
        )

        # Connect Previous Button
        prev_button.click(
            fn=lambda current_state: handle_pagination(-1, current_state),
            inputs=[state],
            outputs=[paper_selector, page_info, state],
            queue=True
        )

        # When the user selects a paper in the Radio list
        paper_selector.change(
            fn=load_selected_paper,
            inputs=[paper_selector, state],
            outputs=[paper_view, selected_paper_id],
            queue=True
        )

        # Connect Ingest Button
        ingest_button.click(
            fn=process_and_ingest_arxiv_paper,
            inputs=[selected_paper_id, arxiv_keywords],
            outputs=ingest_result,
            queue=True
        )

#
# End of File
#####################################################################################################