# invoice-extractor / pdf_reader.py
# Hugging Face file-viewer header (not part of the program):
# uploaded by KonstantinosKakkavas, "Upload 29 files", commit 1108a3a (verified), 2.23 kB
# import pandas as pd
# import json
#
#
# # Function to parse the LLM output and create a DataFrame
# def process_llm_data(llm_data):
# # Assuming llm_data is a string that exactly matches the dictionary format
# # Convert the string representation of a dictionary to a dictionary.
# # NOTE: eval() is NOT safe on LLM output; ast.literal_eval() is the safe equivalent.
# data_dict = eval(llm_data.strip())
#
# # Convert the dictionary to a DataFrame
# # Since our example has comma-separated values in strings for some fields, let's handle that
# for key, value in data_dict.items():
# if ',' in value:
# data_dict[key] = [v.strip() for v in value.split(',')]
# else:
# data_dict[key] = [value]
#
# df = pd.DataFrame(data_dict)
#
# # Saving the DataFrame to a CSV file
# csv_file_path = "Invoice_Data.csv"
# df.to_csv(csv_file_path, index=False)
#
# return csv_file_path
#
#
# # Example usage
# llm_extracted_data = """
# {
# 'Invoice no.': '001',
# 'Description': 'Widget A, Widget B',
# 'Quantity': '2, 5',
# 'Date': '2024-04-13',
# 'Unit price': '$30.00, $20.00',
# 'Amount': '$60.00, $100.00',
# 'Total': '$160.00',
# 'Email': 'info@gptwidgets.com',
# 'Phone number': '(123) 456-7890',
# 'Address': '123 AI Lane, Model Town, OpenAI'
# }
# """
#
# # Call the function to process the data and create a CSV
# csv_path = process_llm_data(llm_extracted_data)
# print(f"Data saved to CSV at: {csv_path}")
import pandas as pd
from utils import get_pdf_text, extracted_data
from dotenv import load_dotenv
load_dotenv()
def save_data_to_csv(data, filename):
    """Write one record of extracted fields to ``<filename>.csv``.

    Args:
        data: The record to store. It is wrapped in a one-element list
            before being handed to pandas, so a dictionary becomes a single
            row whose columns are the dictionary's keys.
        filename: Output path without the extension; ``.csv`` is appended.

    Returns:
        The path of the CSV file that was written, so callers do not have
        to rebuild it from ``filename``.
    """
    # Wrap in a list so a dict of scalar values is read as one row.
    df = pd.DataFrame([data])
    print(df)

    csv_path = f"{filename}.csv"
    # index=False keeps the auto-generated row index out of the file.
    df.to_csv(csv_path, index=False)
    print(f"Data saved to {csv_path}")
    return csv_path
# Example usage: run the extraction pipeline once on a sample invoice.
pdf_name = "Invoice_001.pdf"
print(f'filename is {pdf_name}')

# Read the PDF's text, then pass it to extracted_data to get the fields.
raw_data = get_pdf_text(pdf_name)
print(f'raw_data is {raw_data}')

llm_extracted_data = extracted_data(raw_data)
print(f'llm_extracted_data is {llm_extracted_data}')

# Assumes llm_extracted_data is a dictionary of the extracted fields
# (not verified here; confirm against utils.extracted_data).
save_data_to_csv(llm_extracted_data, "Extracted_Invoice1")