skylord committed on
Commit
190c508
·
verified ·
1 Parent(s): da9cc2f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pandas as pd
4
+ from azure.ai.formrecognizer import DocumentAnalysisClient
5
+ from azure.core.credentials import AzureKeyCredential
6
+ from PyPDF2 import PdfReader, PdfWriter
7
+ from io import BytesIO
8
+
9
# Service endpoint and API key are taken from the process environment.
# Indexing os.environ directly means a missing variable raises KeyError
# as soon as the module is loaded.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]

# Page-level Streamlit settings; issued before any other Streamlit call
# in this module.
st.set_page_config(
    page_title="PDF Table Extractor",
    layout="centered",
    initial_sidebar_state="auto",
)

# Single module-level client shared by every analysis request made in main().
document_analysis_client = DocumentAnalysisClient(
    endpoint=YOUR_ENDPOINT,
    credential=AzureKeyCredential(YOUR_KEY),
)
22
+
23
+ # Function to convert table cells to pandas DataFrame
24
+
25
+
26
def table2pandas(table):
    """Convert a table's cells into a rectangular pandas DataFrame.

    Args:
        table: Object exposing a ``cells`` iterable.  Each cell must provide
            ``row_index``, ``column_index`` and ``content`` attributes.

    Returns:
        pd.DataFrame: One row/column per cell position, with default integer
        labels.  Every position that no cell occupies holds an empty string,
        so short rows are padded the same way as gaps inside a row (instead
        of being left for pandas to fill with missing values).  A table
        without cells yields an empty DataFrame.
    """
    cells = list(table.cells)
    if not cells:
        return pd.DataFrame([])

    # Size the grid up front from the largest indices seen so every row has
    # the same width regardless of the order cells arrive in.
    n_rows = max(cell.row_index for cell in cells) + 1
    n_cols = max(cell.column_index for cell in cells) + 1
    grid = [[""] * n_cols for _ in range(n_rows)]

    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content

    return pd.DataFrame(grid)
35
+
36
+ # Function to split PDF into pages
37
+
38
+
39
def split_pdf_to_pages(filepath):
    """Split a PDF into standalone single-page PDF documents.

    Args:
        filepath: Source PDF, passed unchanged to ``PdfReader``.

    Returns:
        list: One ``bytes`` object per page, in document order, each holding
        a serialized PDF that contains only that page.
    """
    reader = PdfReader(filepath)
    pages = []
    for page in reader.pages:
        # A fresh writer per page so each output contains exactly one page.
        writer = PdfWriter()
        writer.add_page(page)
        with BytesIO() as buffer:
            writer.write(buffer)
            # getvalue() returns the whole buffer irrespective of the current
            # stream position, so no seek(0)/read() round trip is needed.
            pages.append(buffer.getvalue())
    return pages
50
+
51
+ # Streamlit app
52
+
53
+
54
def main():
    """Render the Streamlit page: upload a PDF, extract its tables, offer CSVs.

    The uploaded PDF is split into single-page documents; each page is sent
    to ``document_analysis_client`` with the ``"prebuilt-layout"`` model, and
    every table in the result is displayed and offered as a CSV download.

    NOTE(review): Streamlit typically re-executes the script on each widget
    interaction, so pressing a download button presumably repeats the
    analysis calls -- consider caching the results; confirm before changing.
    """
    # Local import: only this function needs tempfile.
    import tempfile

    st.title("PDF Table Extractor")

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file is None:
        return

    # splitext removes only the final extension (whatever its letter case),
    # unlike str.replace('.pdf', ''), which would also strip a ".pdf" found
    # in the middle of the name and would leave ".PDF" untouched.
    base_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]

    # Write the upload to a uniquely named temporary file rather than to a
    # path built from the client-supplied file name: that avoids collisions
    # between sessions and keeps the user-controlled name out of the path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_filepath = tmp.name

    st.text("Uploaded successfully. Extracting tables...")

    try:
        pages = split_pdf_to_pages(temp_filepath)
    finally:
        # The pages are already held in memory as bytes, so the temporary
        # file is no longer needed whether or not splitting succeeded.
        os.remove(temp_filepath)

    tables_found = 0
    for page_num, page_bytes in enumerate(pages, start=1):
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=page_bytes)
        result = poller.result()

        # Tolerate a result with no ``tables`` attribute or an empty one.
        page_tables = getattr(result, "tables", None) or []
        for table_num, table in enumerate(page_tables, start=1):
            table_df = table2pandas(table)
            st.write(table_df)  # Display table in Streamlit (optional)

            # Page and table numbers are both 1-based in the file name.
            csv_name = f"{base_name}_page{page_num}_table{table_num}.csv"
            st.download_button(
                label="Download CSV",
                data=table_df.to_csv(index=False).encode("utf-8"),
                file_name=csv_name,
                mime="text/csv",
                # Explicit key so buttons created in this loop stay distinct.
                key=csv_name,
            )
            tables_found += 1

    # Only report success when at least one table was actually produced.
    if tables_found:
        st.success("Tables extracted and saved successfully!")
    else:
        st.warning("No tables were found in the uploaded PDF.")
91
+
92
+
93
# Build the page only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()