Adr740 committed
Commit 411ca77
1 parent: c7a0bde

Upload 8 files

ai_transcriber.py ADDED
@@ -0,0 +1,73 @@
+ # input : data_path
+ import os
+ from pdfparser_hq import encode_image
+ from config import openai_api
+ from openai import OpenAI
+
+ def transcribe_all(data_path,
+                    name_of_raw_transcripts="transcript_raw.txt"):
+     imgs = []
+     client = OpenAI(api_key=openai_api)
+     transcripts = []
+
+     system_prompt = """
+ You will be given a receipt that could be handwritten or properly formatted. Your goal is to transcribe what is written in JSON following this format:
+
+ {
+     "name_of_supplier" : X,
+     "amount" : X,
+     "currency": X,
+     "date" : DD/MM/YYYY
+ }
+
+ Make sure you provide the total amount and the correct dates, handwritten ones might be tricky. This will be used to reconcile with banking transactions.
+ """
+
+     # Collect every PNG receipt found under data_path.
+     for root, dirs, files in os.walk(data_path):
+         for file in files:
+             if file.endswith('.png'):
+                 print(os.path.join(root, file))
+                 imgs.append({"path": os.path.join(root, file)})
+
+     # Transcribe each receipt with GPT-4o, forcing a JSON object response.
+     for i, img in enumerate(imgs):
+         filename = img["path"]
+         base64_image = encode_image(img["path"])
+
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": system_prompt
+                         }
+                     ]
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/jpeg;base64,{base64_image}",
+                             }
+                         }
+                     ]
+                 }
+             ],
+             temperature=1,
+             max_tokens=1877,
+             top_p=1,
+             response_format={"type": "json_object"},
+             frequency_penalty=0,
+             presence_penalty=0
+         ).choices[0].message.content
+         transcripts.append({"path": filename, "filename": f"P{i+1}.png", "content": response})
+         print(f"done transcribing receipt: {i+1}/{len(imgs)}")
+
+     # Persist the raw transcripts as a Python-literal list of dicts.
+     with open(f"{data_path}/{name_of_raw_transcripts}", 'w') as file:
+         file.write(str(transcripts))
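
For reference, a minimal usage sketch (assuming a trial2/ folder of PNG receipts and a valid openai_api key in config.py; ast.literal_eval is used here as a safer way to read back the Python-literal list the function writes):

from ai_transcriber import transcribe_all
import ast

transcribe_all("trial2")  # writes trial2/transcript_raw.txt
with open("trial2/transcript_raw.txt") as f:
    transcripts = ast.literal_eval(f.read())
# each entry: {"path": ..., "filename": "P1.png", "content": "<JSON string from GPT-4o>"}
print(transcripts[0]["content"])
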
main.py ADDED
@@ -0,0 +1,40 @@
+ # USER INPUT
+ from preprocessing import run_preprocessing
+ from ai_transcriber import transcribe_all
+ from reconciliate_and_upload import reconciliate_and_upload
+ import os
+
+ def run_main(
+         source_folder_with_reciepts="https://drive.google.com/drive/folders/1skbgiXMnAe3z2r8E9oLAxXxkDBnrk8l4?usp=sharing",
+         link_to_csv="https://drive.google.com/file/d/1cYoj8U5mttwQu5hNoupifHtjESCIHpsp/view?usp=sharing",
+         folder_to_save_processed_reciepts="https://drive.google.com/drive/folders/1zADJlZ8pvXHNdAhbrxScPynSq1m5Jo1C?usp=sharing",
+         folder_to_save_reconciled_data="https://drive.google.com/drive/folders/1bmrHExKt0x5AJwJsMtwW1Yk6Hof4WbCF?usp=drive_link",
+         name_output_file="[AI generated] June 2024.xlsx",
+         transaction_csv_path='downloaded_file.csv',
+         data_path="trial2"
+ ):
+     # pdf2image needs poppler-utils at runtime.
+     os.system("apt-get update && apt-get install -y poppler-utils")
+
+     # Reduce the Google Drive share URLs to bare folder/file IDs.
+     source_folder_with_reciepts = source_folder_with_reciepts.split("?")[0].split("/")[-1]
+     folder_to_save_processed_reciepts = folder_to_save_processed_reciepts.split("?")[0].split("/")[-1]
+     folder_to_save_reconciled_data = folder_to_save_reconciled_data.split("?")[0].split("/")[-1]
+     link_to_csv = link_to_csv.split("/view?")[0].split("/")[-1]
+     print("Extracted link csv id: ", link_to_csv)
+
+     # Ensure the output name carries exactly one .xlsx extension.
+     name_output_file = name_output_file + ".xlsx"
+     name_output_file = name_output_file.replace(".xlsx.xlsx", ".xlsx")
+
+     run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv)
+     print("Done pre-processing!")
+     transcribe_all(data_path)
+     print("Done transcription!")
+     id_output = reconciliate_and_upload(
+         data_path,
+         name_of_csv=transaction_csv_path,
+         folder_to_save_processed_reciepts=folder_to_save_processed_reciepts,
+         folder_to_save_reconciled_data=folder_to_save_reconciled_data,
+         name_of_output=name_output_file)
+
+     url_output_file = "https://drive.google.com/file/d/" + str(id_output)
+     print("Done all!")
+     return url_output_file
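
The ID extraction above relies on the fixed shape of Google Drive share URLs; illustrated with placeholder IDs:

folder_url = "https://drive.google.com/drive/folders/FOLDER_ID?usp=sharing"
file_url = "https://drive.google.com/file/d/FILE_ID/view?usp=sharing"

print(folder_url.split("?")[0].split("/")[-1])     # -> "FOLDER_ID"
print(file_url.split("/view?")[0].split("/")[-1])  # -> "FILE_ID"
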
pdfparser_hq.py ADDED
@@ -0,0 +1,108 @@
+ from openai import OpenAI
+ import pdf2image
+ import base64
+ import os
+ import time
+ from config import openai_api
+
+ client = OpenAI(api_key=openai_api)
+
+
+ def encode_image(image_path):
+     """Return the base64-encoded contents of an image file."""
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def pdf_to_image(path_to_pdf, get_output_in_code=False):
+     """Render each page of a PDF to a PNG under <pdf_name>/png/."""
+     paths_to_img = []
+     print("Converting pdf to img")
+     start_time = time.time()
+     images = pdf2image.convert_from_path(path_to_pdf, dpi=100)
+     execution_time = time.time() - start_time
+     print("Conversion complete")
+     print("Execution time: {:.2f} seconds".format(execution_time))
+     save_path = path_to_pdf.replace(".pdf", "") + "/png/"
+     print("Creating directory to store images")
+     os.makedirs(save_path, exist_ok=True)
+     print("Directory created : ", save_path)
+     for i, image in enumerate(images):
+         print(f"saving page {i} in {save_path}{i}_page.png")
+         image.save(f"{save_path}{i}_page.png", "PNG")
+         paths_to_img.append(f"{save_path}{i}_page.png")
+     if get_output_in_code:
+         return images, paths_to_img
+
+
+ def pdfs_folder_to_images(input_path, get_output_in_code=False):
+     """Convert every PDF found under input_path; optionally return the results."""
+     pdf_files = []
+     images = {}
+     for root, dirs, files in os.walk(input_path):
+         for file in files:
+             if file.endswith('.pdf'):
+                 print("FILE IS ", os.path.join(root, file))
+                 pdf_files.append(os.path.join(root, file))
+                 if get_output_in_code:
+                     images[os.path.join(root, file)] = pdf_to_image(os.path.join(root, file), get_output_in_code=True)
+                 else:
+                     pdf_to_image(os.path.join(root, file))
+
+     if get_output_in_code:
+         return images
+
+
+ def img_to_txt(img):
+     """Transcribe a base64-encoded image with GPT-4o and return the text."""
+     response = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[
+             {
+                 "role": "system",
+                 "content": "Your task is to transcribe and explain in English every single thing from screenshots sent by users"
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/jpeg;base64,{img}",
+                         }
+                     }
+                 ]
+             }
+         ],
+         temperature=1,
+         max_tokens=1999,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     ).choices[0].message.content
+     return response
+
+
+ def img_to_txt_gemini(img):
+     # Placeholder: Gemini transcription is not implemented yet.
+     return ""
+
+
+ def process_pdf_hq(path, get_output_in_code=True):
+     """Convert all PDFs under path to images, transcribe each page, and save the text."""
+     converted_pdf_router = pdfs_folder_to_images(path, get_output_in_code=True)
+     path_extracted_pdf = path + "/extracted_pdf/"
+     os.makedirs(path_extracted_pdf, exist_ok=True)
+     content_extracted = {}
+     for link in list(converted_pdf_router.keys()):
+         print("Working on ", link)
+         content_extracted[link] = []
+
+         # Index 1 of the pdf_to_image result holds the saved image paths.
+         for img_path in converted_pdf_router[link][1]:
+             print("Processing subimage")
+             base64_image = encode_image(img_path)
+             content = img_to_txt(base64_image)
+             print(img_path)
+             content_extracted[link].append(content)
+             with open(f"{path_extracted_pdf}PDF_FILE_{img_path.replace('/', '_').replace('.', '_')}.txt", "w") as fil:
+                 fil.write(content)
+     if get_output_in_code:
+         return content_extracted
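
A minimal usage sketch (assuming a data/ folder containing PDFs, with poppler-utils installed for pdf2image):

from pdfparser_hq import pdfs_folder_to_images, process_pdf_hq

# Just render every PDF under data/ to per-page PNGs on disk:
pdfs_folder_to_images("data")

# Or render and transcribe each page with GPT-4o, keeping results in memory:
extracted = process_pdf_hq("data")
for pdf_path, pages in extracted.items():
    print(pdf_path, "->", len(pages), "pages transcribed")
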
preprocessing.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ from pdfparser_hq import pdfs_folder_to_images
+
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ import io
+ from googleapiclient.http import MediaIoBaseDownload
+ import gdown
+
+
+ def list_files_in_folder(service, folder_id):
+     """List the PDF and PNG files in a Google Drive folder."""
+     results = service.files().list(
+         q=f"'{folder_id}' in parents and (mimeType='application/pdf' or mimeType='image/png')",
+         pageSize=1000,
+         fields="nextPageToken, files(id, name)"
+     ).execute()
+     items = results.get('files', [])
+     return items
+
+
+ def download_file(service, file_id, file_name, save_path):
+     """Download a single Drive file to save_path, reporting progress."""
+     request = service.files().get_media(fileId=file_id)
+     fh = io.FileIO(os.path.join(save_path, file_name), 'wb')
+     downloader = MediaIoBaseDownload(fh, request)
+     done = False
+     while not done:
+         status, done = downloader.next_chunk()
+         print(f"Download {file_name} {int(status.progress() * 100)}%.")
+
+
+ def download_files_from_folder(service, folder_id, save_path):
+     files = list_files_in_folder(service, folder_id)
+     for file in files:
+         download_file(service, file['id'], file['name'], save_path)
+
+
+ def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv):
+     """Download the receipts and the transactions CSV, then convert PDFs to images.
+
+     Args:
+         data_path: local path where the data is saved
+         source_folder_with_reciepts: Drive folder ID where the receipts are stored
+         link_to_csv: Drive file ID of the transactions CSV
+     """
+     os.makedirs(data_path, exist_ok=True)
+     full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
+     print(full_link_to_csv)
+     transaction_csv_path = f'{data_path}/downloaded_file.csv'
+     gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)
+
+     SCOPES = ['https://www.googleapis.com/auth/drive']
+     SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'
+
+     credentials = service_account.Credentials.from_service_account_file(
+         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
+     service = build('drive', 'v3', credentials=credentials)
+
+     download_files_from_folder(service, source_folder_with_reciepts, data_path)
+
+     pdfs_folder_to_images(data_path)
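
run_preprocessing assumes secret_google_service_account.json exists locally and that the service account has been granted read access to the source folder; a standalone call sketch with placeholder IDs:

from preprocessing import run_preprocessing

run_preprocessing(
    data_path="trial2",
    source_folder_with_reciepts="FOLDER_ID",  # bare Drive folder ID, not the full URL
    link_to_csv="FILE_ID",                    # bare Drive file ID of the CSV
)
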
reconciliate_and_upload.py ADDED
@@ -0,0 +1,106 @@
+ from uploader import save_logs
+ import os
+ import ast
+ import pandas as pd
+ from rapidfuzz import process, fuzz
+ from random import randint
+ from time import sleep
+
+
+ def fuzzy_match(row, choices, scorer, cutoff):
+     """Return the closest supplier name for a bank label, or a placeholder."""
+     match = process.extractOne(row["Libellé d'opération"], choices, scorer=scorer, score_cutoff=cutoff)
+     if match:
+         return match[0]
+     return "missing receipt"
+
+
+ def reconciliate_and_upload(data_path,
+                             name_of_csv,
+                             folder_to_save_processed_reciepts,
+                             folder_to_save_reconciled_data,
+                             name_of_raw_transcripts="transcript_raw.txt",
+                             name_of_output="[AI Generated] Output.xlsx"):
+
+     # The raw transcripts file is a Python-literal list of dicts.
+     with open(f"{data_path}/{name_of_raw_transcripts}") as file:
+         transcripts = ast.literal_eval(file.read())
+
+     imgs = []
+     path_to_pdfs = data_path
+
+     for root, dirs, files in os.walk(path_to_pdfs):
+         for file in files:
+             if file.endswith('.png'):
+                 print(os.path.join(root, file))
+                 imgs.append({"path": os.path.join(root, file)})
+
+     # Upload each receipt image to Drive and parse its transcript.
+     list_transcripts_evaled = []
+     objects = []
+     for i, t in enumerate(transcripts):
+         content = ast.literal_eval(t["content"].replace('null', '-1'))
+         try:
+             obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
+         except Exception:
+             # Back off on transient Drive API errors, then retry once.
+             print("sleeping a bit before retrying")
+             sleep(randint(30, 40))
+             obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
+
+         objects.append(obk)
+         print("uploaded image!")
+         list_transcripts_evaled.append({
+             "path": imgs[i]["path"],
+             "name_of_supplier": content["name_of_supplier"],
+             "amount": content["amount"],
+             "currency": content["currency"],
+             "date": content["date"]})
+
+     urls = []
+     for ob in objects:
+         url = "https://drive.google.com/file/d/" + ob["id"]
+         urls.append(url)
+
+     # Normalise the transcribed receipts.
+     df_app = pd.DataFrame(list_transcripts_evaled)
+     df_app["amount"] = df_app["amount"].astype(str).str.replace(" ", "").str.replace(",", ".").str.replace("N/A", "-1").astype("float")
+     df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
+     df_app["url"] = urls
+     df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)
+
+     # Normalise the bank transactions CSV (French number and date formats).
+     df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
+     df_opp_app["Débit"] = df_opp_app["Débit"].str.replace(" ", "").str.replace(",", ".").astype("float")
+     df_opp_app["Crédit"] = df_opp_app["Crédit"].str.replace(" ", "").str.replace(",", ".").astype("float")
+     df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')
+
+     # Merge on amount (Débit and amount).
+     merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount', how='left', suffixes=('_ops', '_df'))
+
+     # Apply fuzzy matching between bank labels and upper-cased supplier names.
+     raw_choices = df_app['name_of_supplier'].tolist()
+     choices = [r.upper() for r in raw_choices]
+     merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
+     merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])
+
+     # Identify receipts that were not matched to any transaction.
+     df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()
+
+     # Replace the supplier column with the fuzzy match, then drop working columns.
+     merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
+     merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"], inplace=True)
+     df_residuals_app.drop(columns=["path"], inplace=True)
+     merged_df_app['url'] = merged_df_app['url'].apply(lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
+     df_residuals_app['url'] = df_residuals_app['url'].apply(lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
+
+     with pd.ExcelWriter(name_of_output) as writer:
+         merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
+         df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)
+
+     # Upload the workbook and return its Drive file ID.
+     id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)
+
+     return id_output["id"]
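
The fuzzy step matches each bank-statement label against the upper-cased supplier names; a standalone illustration of the rapidfuzz call with made-up strings:

from rapidfuzz import process, fuzz

choices = ["ACME SUPPLIES", "BOULANGERIE DUPONT"]
match = process.extractOne("CB ACME SUPPLIES PARIS 12", choices,
                           scorer=fuzz.WRatio, score_cutoff=80)
print(match)  # ("ACME SUPPLIES", <score>, 0), or None below the cutoff
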
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ pandas
+ numpy
+ tabula-py
+ openai
+ gradio
+ pyPDF2
+ marker-pdf
+ groq
+ bs4
+ nltk
+ tiktoken
+ pdf2image
+ gdown
+ google-generativeai
+ google-auth
+ google-auth-oauthlib
+ google-auth-httplib2
+ google-api-python-client
+ chromadb
+ rapidfuzz
script_pdf_to_img.py ADDED
@@ -0,0 +1,4 @@
+ import os
+ # pdf2image needs poppler-utils at runtime.
+ os.system("apt-get update && apt-get install -y poppler-utils")
+ from pdfparser_hq import pdfs_folder_to_images
+ pdfs_folder_to_images("data")
uploader.py ADDED
@@ -0,0 +1,38 @@
+ import os
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ from googleapiclient.http import MediaFileUpload
+
+
+ def save_logs(path_to_data_to_save, name_to_save, folder_id="16Vv728HPW2J0BYzgTaBV00nUEc5pRKT-"):
+     """Upload a local file to a Google Drive folder and return the created file resource."""
+     filename = path_to_data_to_save
+     SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'
+     SCOPES = ['https://www.googleapis.com/auth/drive.file']
+
+     credentials = service_account.Credentials.from_service_account_file(
+         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
+     service = build('drive', 'v3', credentials=credentials)
+
+     file_metadata = {
+         'name': name_to_save,   # Name of the file to be uploaded
+         'parents': [folder_id]  # Destination folder ID
+     }
+
+     # Create a MediaFileUpload object to upload the file.
+     media = MediaFileUpload(filename)
+
+     file = service.files().create(
+         body=file_metadata,
+         media_body=media,
+         fields='id'
+     ).execute()
+
+     # Print the file ID of the uploaded file.
+     print('Saved in Google Drive - File ID: %s' % file.get('id'))
+     return file
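
save_logs returns the created Drive file resource (requested with fields='id'); a minimal call sketch with a placeholder folder ID:

from uploader import save_logs

uploaded = save_logs("report.xlsx", "report.xlsx", folder_id="FOLDER_ID")
print("https://drive.google.com/file/d/" + uploaded["id"])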