Spaces:

rbiswasfc
/

arxiv-extract-from-pdf

Sleeping

App Files Files Community

rbiswasfc committed on Sep 3, 2024

Commit

2a910d7

1 Parent(s): 14e2e93

zero gpu

Browse files

Files changed (2) hide show

app.py +61 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import os
+import gradio as gr
+import requests
+import spaces
+from marker.convert import convert_single_pdf
+from marker.logger import configure_logging
+from marker.models import load_all_models
+# Call marker's logging configuration helper before the models are loaded.
+configure_logging()
+# Load the marker models once, at import time; the result is passed to
+# convert_single_pdf() on every call to extract_from_pdf().
+MARKER_MODEL_LST = load_all_models()
+@spaces.GPU
+def extract_from_pdf(arxiv_id):
+    """extract text from a PDF file"""
+    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+    tmp_pdf = ".tmp_pdf"
+    response = requests.get(pdf_url)
+    if response.status_code == 200:
+        with open(tmp_pdf, "wb") as file:
+            file.write(response.content)
+        print("PDF downloaded and saved as ", tmp_pdf)
+    else:
+        print(f"Failed to download PDF. Status code: {response.status_code}")
+    full_text, doc_images, out_meta = convert_single_pdf(
+        tmp_pdf, MARKER_MODEL_LST, max_pages=20
+    )
+    os.remove(tmp_pdf)
+    print("Temporary PDF file removed.")
+    return full_text
+def extract(arxiv_id):
+    if not arxiv_id:
+        return {"error": "ArXiv ID is required"}
+    try:
+        full_text = extract_from_pdf(arxiv_id)
+        results = {"arxiv_id": arxiv_id, "text": full_text}
+        return results
+    except Exception as e:
+        return {"error": str(e)}
+with gr.Blocks() as app:
+    # Text box where the user enters the arXiv ID.
+    text_input = gr.Textbox(label="Enter arxiv id")
+    # JSON component that displays the dict returned by extract().
+    output = gr.JSON(label="Extracted text")
+    # When the text box is submitted, call extract() with its value and show the result in `output`.
+    text_input.submit(extract, inputs=text_input, outputs=output)
+if __name__ == "__main__":
+    # When run as a script: enable the queue and serve on all interfaces, port 7860.
+    app.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+fastapi
+srsly
+python-dotenv
+transformers
+torch
+beautifulsoup4
+marker-pdf
+retry