Adr740 committed
Commit 411ca77
1 parent: c7a0bde

Upload 8 files

ai_transcriber.py ADDED
@@ -0,0 +1,73 @@
+ # input : data_path
+ import os
+ from pdfparser_hq import encode_image
+ from config import openai_api
+ from openai import OpenAI
+
+ def transcribe_all(data_path,
+                    name_of_raw_transcripts="transcript_raw.txt"):
+     imgs = []
+     client = OpenAI(api_key=openai_api)
+     transcripts = []
+
+     system_prompt = """
+ You will be given a receipt that could be handwritten or properly formatted. Your goal is to transcribe what is written in JSON following this format:
+
+ {
+     "name_of_supplier" : X,
+     "amount" : X,
+     "currency": X,
+     "date" : DD/MM/YYYY
+ }
+
+ Make sure you provide the total amount and the correct dates, handwritten ones might be tricky. This will be used to reconcile with banking transactions.
+ """
+
+     # Collect every PNG receipt found under data_path.
+     for root, dirs, files in os.walk(data_path):
+         for file in files:
+             if file.endswith('.png'):
+                 print(os.path.join(root, file))
+                 imgs.append({"path": os.path.join(root, file)})
+
+     # Transcribe each receipt with GPT-4o, forcing a JSON object response.
+     for i, img in enumerate(imgs):
+         filename = img["path"]
+         base64_image = encode_image(img["path"])
+
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": system_prompt
+                         }
+                     ]
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/jpeg;base64,{base64_image}",
+                             }
+                         }
+                     ]
+                 }
+             ],
+             temperature=1,
+             max_tokens=1877,
+             top_p=1,
+             response_format={"type": "json_object"},
+             frequency_penalty=0,
+             presence_penalty=0
+         ).choices[0].message.content
+         transcripts.append({"path": filename, "filename": f"P{i+1}.png", "content": response})
+         print(f"done transcribing receipt: {i+1}/{len(imgs)}")
+
+     # Persist the raw transcripts as a Python-literal list of dicts.
+     with open(f"{data_path}/{name_of_raw_transcripts}", 'w') as file:
+         file.write(str(transcripts))
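
For reference, a minimal usage sketch (assuming a trial2/ folder of PNG receipts and a valid openai_api key in config.py; ast.literal_eval is used here as a safer way to read back the Python-literal list the function writes):

from ai_transcriber import transcribe_all
import ast

transcribe_all("trial2")  # writes trial2/transcript_raw.txt
with open("trial2/transcript_raw.txt") as f:
    transcripts = ast.literal_eval(f.read())
# each entry: {"path": ..., "filename": "P1.png", "content": "<JSON string from GPT-4o>"}
print(transcripts[0]["content"])
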
main.py ADDED
@@ -0,0 +1,40 @@
+ # USER INPUT
+ from preprocessing import run_preprocessing
+ from ai_transcriber import transcribe_all
+ from reconciliate_and_upload import reconciliate_and_upload
+ import os
+
+ def run_main(
+         source_folder_with_reciepts="https://drive.google.com/drive/folders/1skbgiXMnAe3z2r8E9oLAxXxkDBnrk8l4?usp=sharing",
+         link_to_csv="https://drive.google.com/file/d/1cYoj8U5mttwQu5hNoupifHtjESCIHpsp/view?usp=sharing",
+         folder_to_save_processed_reciepts="https://drive.google.com/drive/folders/1zADJlZ8pvXHNdAhbrxScPynSq1m5Jo1C?usp=sharing",
+         folder_to_save_reconciled_data="https://drive.google.com/drive/folders/1bmrHExKt0x5AJwJsMtwW1Yk6Hof4WbCF?usp=drive_link",
+         name_output_file="[AI generated] June 2024.xlsx",
+         transaction_csv_path='downloaded_file.csv',
+         data_path="trial2"
+ ):
+     # pdf2image needs poppler-utils at runtime.
+     os.system("apt-get update && apt-get install -y poppler-utils")
+
+     # Reduce the Google Drive share URLs to bare folder/file IDs.
+     source_folder_with_reciepts = source_folder_with_reciepts.split("?")[0].split("/")[-1]
+     folder_to_save_processed_reciepts = folder_to_save_processed_reciepts.split("?")[0].split("/")[-1]
+     folder_to_save_reconciled_data = folder_to_save_reconciled_data.split("?")[0].split("/")[-1]
+     link_to_csv = link_to_csv.split("/view?")[0].split("/")[-1]
+     print("Extracted link csv id: ", link_to_csv)
+
+     # Ensure the output name carries exactly one .xlsx extension.
+     name_output_file = name_output_file + ".xlsx"
+     name_output_file = name_output_file.replace(".xlsx.xlsx", ".xlsx")
+
+     run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv)
+     print("Done pre-processing!")
+     transcribe_all(data_path)
+     print("Done transcription!")
+     id_output = reconciliate_and_upload(
+         data_path,
+         name_of_csv=transaction_csv_path,
+         folder_to_save_processed_reciepts=folder_to_save_processed_reciepts,
+         folder_to_save_reconciled_data=folder_to_save_reconciled_data,
+         name_of_output=name_output_file)
+
+     url_output_file = "https://drive.google.com/file/d/" + str(id_output)
+     print("Done all!")
+     return url_output_file
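
The ID extraction above relies on the fixed shape of Google Drive share URLs; illustrated with placeholder IDs:

folder_url = "https://drive.google.com/drive/folders/FOLDER_ID?usp=sharing"
file_url = "https://drive.google.com/file/d/FILE_ID/view?usp=sharing"

print(folder_url.split("?")[0].split("/")[-1])     # -> "FOLDER_ID"
print(file_url.split("/view?")[0].split("/")[-1])  # -> "FILE_ID"
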
pdfparser_hq.py ADDED
@@ -0,0 +1,108 @@
+ from openai import OpenAI
+ import pdf2image
+ import base64
+ import os
+ import time
+ from config import openai_api
+
+ client = OpenAI(api_key=openai_api)
+
+
+ def encode_image(image_path):
+     """Return the base64-encoded contents of an image file."""
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def pdf_to_image(path_to_pdf, get_output_in_code=False):
+     """Render each page of a PDF to a PNG under <pdf_name>/png/."""
+     paths_to_img = []
+     print("Converting pdf to img")
+     start_time = time.time()
+     images = pdf2image.convert_from_path(path_to_pdf, dpi=100)
+     execution_time = time.time() - start_time
+     print("Conversion complete")
+     print("Execution time: {:.2f} seconds".format(execution_time))
+     save_path = path_to_pdf.replace(".pdf", "") + "/png/"
+     print("Creating directory to store images")
+     os.makedirs(save_path, exist_ok=True)
+     print("Directory created : ", save_path)
+     for i, image in enumerate(images):
+         print(f"saving page {i} in {save_path}{i}_page.png")
+         image.save(f"{save_path}{i}_page.png", "PNG")
+         paths_to_img.append(f"{save_path}{i}_page.png")
+     if get_output_in_code:
+         return images, paths_to_img
+
+
+ def pdfs_folder_to_images(input_path, get_output_in_code=False):
+     """Convert every PDF found under input_path; optionally return the results."""
+     pdf_files = []
+     images = {}
+     for root, dirs, files in os.walk(input_path):
+         for file in files:
+             if file.endswith('.pdf'):
+                 print("FILE IS ", os.path.join(root, file))
+                 pdf_files.append(os.path.join(root, file))
+                 if get_output_in_code:
+                     images[os.path.join(root, file)] = pdf_to_image(os.path.join(root, file), get_output_in_code=True)
+                 else:
+                     pdf_to_image(os.path.join(root, file))
+
+     if get_output_in_code:
+         return images
+
+
+ def img_to_txt(img):
+     """Transcribe a base64-encoded image with GPT-4o and return the text."""
+     response = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[
+             {
+                 "role": "system",
+                 "content": "Your task is to transcribe and explain in English every single thing from screenshots sent by users"
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/jpeg;base64,{img}",
+                         }
+                     }
+                 ]
+             }
+         ],
+         temperature=1,
+         max_tokens=1999,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     ).choices[0].message.content
+     return response
+
+
+ def img_to_txt_gemini(img):
+     # Placeholder: Gemini transcription is not implemented yet.
+     return ""
+
+
+ def process_pdf_hq(path, get_output_in_code=True):
+     """Convert all PDFs under path to images, transcribe each page, and save the text."""
+     converted_pdf_router = pdfs_folder_to_images(path, get_output_in_code=True)
+     path_extracted_pdf = path + "/extracted_pdf/"
+     os.makedirs(path_extracted_pdf, exist_ok=True)
+     content_extracted = {}
+     for link in list(converted_pdf_router.keys()):
+         print("Working on ", link)
+         content_extracted[link] = []
+
+         # Index 1 of the pdf_to_image result holds the saved image paths.
+         for img_path in converted_pdf_router[link][1]:
+             print("Processing subimage")
+             base64_image = encode_image(img_path)
+             content = img_to_txt(base64_image)
+             print(img_path)
+             content_extracted[link].append(content)
+             with open(f"{path_extracted_pdf}PDF_FILE_{img_path.replace('/', '_').replace('.', '_')}.txt", "w") as fil:
+                 fil.write(content)
+     if get_output_in_code:
+         return content_extracted
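
A minimal usage sketch (assuming a data/ folder containing PDFs, with poppler-utils installed for pdf2image):

from pdfparser_hq import pdfs_folder_to_images, process_pdf_hq

# Just render every PDF under data/ to per-page PNGs on disk:
pdfs_folder_to_images("data")

# Or render and transcribe each page with GPT-4o, keeping results in memory:
extracted = process_pdf_hq("data")
for pdf_path, pages in extracted.items():
    print(pdf_path, "->", len(pages), "pages transcribed")
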
preprocessing.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ from pdfparser_hq import pdfs_folder_to_images
+
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ import io
+ from googleapiclient.http import MediaIoBaseDownload
+ import gdown
+
+
+ def list_files_in_folder(service, folder_id):
+     """List the PDF and PNG files in a Google Drive folder."""
+     results = service.files().list(
+         q=f"'{folder_id}' in parents and (mimeType='application/pdf' or mimeType='image/png')",
+         pageSize=1000,
+         fields="nextPageToken, files(id, name)"
+     ).execute()
+     items = results.get('files', [])
+     return items
+
+
+ def download_file(service, file_id, file_name, save_path):
+     """Download a single Drive file to save_path, reporting progress."""
+     request = service.files().get_media(fileId=file_id)
+     fh = io.FileIO(os.path.join(save_path, file_name), 'wb')
+     downloader = MediaIoBaseDownload(fh, request)
+     done = False
+     while not done:
+         status, done = downloader.next_chunk()
+         print(f"Download {file_name} {int(status.progress() * 100)}%.")
+
+
+ def download_files_from_folder(service, folder_id, save_path):
+     files = list_files_in_folder(service, folder_id)
+     for file in files:
+         download_file(service, file['id'], file['name'], save_path)
+
+
+ def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv):
+     """Download the receipts and the transactions CSV, then convert PDFs to images.
+
+     Args:
+         data_path: local path where the data is saved
+         source_folder_with_reciepts: Drive folder ID where the receipts are stored
+         link_to_csv: Drive file ID of the transactions CSV
+     """
+     os.makedirs(data_path, exist_ok=True)
+     full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
+     print(full_link_to_csv)
+     transaction_csv_path = f'{data_path}/downloaded_file.csv'
+     gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)
+
+     SCOPES = ['https://www.googleapis.com/auth/drive']
+     SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'
+
+     credentials = service_account.Credentials.from_service_account_file(
+         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
+     service = build('drive', 'v3', credentials=credentials)
+
+     download_files_from_folder(service, source_folder_with_reciepts, data_path)
+
+     pdfs_folder_to_images(data_path)
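
run_preprocessing assumes secret_google_service_account.json exists locally and that the service account has been granted read access to the source folder; a standalone call sketch with placeholder IDs:

from preprocessing import run_preprocessing

run_preprocessing(
    data_path="trial2",
    source_folder_with_reciepts="FOLDER_ID",  # bare Drive folder ID, not the full URL
    link_to_csv="FILE_ID",                    # bare Drive file ID of the CSV
)
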
reconciliate_and_upload.py ADDED
@@ -0,0 +1,106 @@
+ from uploader import save_logs
+ import os
+ import ast
+ import pandas as pd
+ from rapidfuzz import process, fuzz
+ from random import randint
+ from time import sleep
+
+
+ def fuzzy_match(row, choices, scorer, cutoff):
+     """Return the closest supplier name for a bank label, or a placeholder."""
+     match = process.extractOne(row["Libellé d'opération"], choices, scorer=scorer, score_cutoff=cutoff)
+     if match:
+         return match[0]
+     return "missing receipt"
+
+
+ def reconciliate_and_upload(data_path,
+                             name_of_csv,
+                             folder_to_save_processed_reciepts,
+                             folder_to_save_reconciled_data,
+                             name_of_raw_transcripts="transcript_raw.txt",
+                             name_of_output="[AI Generated] Output.xlsx"):
+
+     # The raw transcripts file is a Python-literal list of dicts.
+     with open(f"{data_path}/{name_of_raw_transcripts}") as file:
+         transcripts = ast.literal_eval(file.read())
+
+     imgs = []
+     path_to_pdfs = data_path
+
+     for root, dirs, files in os.walk(path_to_pdfs):
+         for file in files:
+             if file.endswith('.png'):
+                 print(os.path.join(root, file))
+                 imgs.append({"path": os.path.join(root, file)})
+
+     # Upload each receipt image to Drive and parse its transcript.
+     list_transcripts_evaled = []
+     objects = []
+     for i, t in enumerate(transcripts):
+         content = ast.literal_eval(t["content"].replace('null', '-1'))
+         try:
+             obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
+         except Exception:
+             # Back off on transient Drive API errors, then retry once.
+             print("sleeping a bit before retrying")
+             sleep(randint(30, 40))
+             obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
+
+         objects.append(obk)
+         print("uploaded image!")
+         list_transcripts_evaled.append({
+             "path": imgs[i]["path"],
+             "name_of_supplier": content["name_of_supplier"],
+             "amount": content["amount"],
+             "currency": content["currency"],
+             "date": content["date"]})
+
+     urls = []
+     for ob in objects:
+         url = "https://drive.google.com/file/d/" + ob["id"]
+         urls.append(url)
+
+     # Normalise the transcribed receipts.
+     df_app = pd.DataFrame(list_transcripts_evaled)
+     df_app["amount"] = df_app["amount"].astype(str).str.replace(" ", "").str.replace(",", ".").str.replace("N/A", "-1").astype("float")
+     df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
+     df_app["url"] = urls
+     df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)
+
+     # Normalise the bank transactions CSV (French number and date formats).
+     df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
+     df_opp_app["Débit"] = df_opp_app["Débit"].str.replace(" ", "").str.replace(",", ".").astype("float")
+     df_opp_app["Crédit"] = df_opp_app["Crédit"].str.replace(" ", "").str.replace(",", ".").astype("float")
+     df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')
+
+     # Merge on amount (Débit and amount).
+     merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount', how='left', suffixes=('_ops', '_df'))
+
+     # Apply fuzzy matching between bank labels and upper-cased supplier names.
+     raw_choices = df_app['name_of_supplier'].tolist()
+     choices = [r.upper() for r in raw_choices]
+     merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
+     merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])
+
+     # Identify receipts that were not matched to any transaction.
+     df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()
+
+     # Replace the supplier column with the fuzzy match, then drop working columns.
+     merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
+     merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"], inplace=True)
+     df_residuals_app.drop(columns=["path"], inplace=True)
+     merged_df_app['url'] = merged_df_app['url'].apply(lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
+     df_residuals_app['url'] = df_residuals_app['url'].apply(lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
+
+     with pd.ExcelWriter(name_of_output) as writer:
+         merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
+         df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)
+
+     # Upload the workbook and return its Drive file ID.
+     id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)
+
+     return id_output["id"]
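
The fuzzy step matches each bank-statement label against the upper-cased supplier names; a standalone illustration of the rapidfuzz call with made-up strings:

from rapidfuzz import process, fuzz

choices = ["ACME SUPPLIES", "BOULANGERIE DUPONT"]
match = process.extractOne("CB ACME SUPPLIES PARIS 12", choices,
                           scorer=fuzz.WRatio, score_cutoff=80)
print(match)  # ("ACME SUPPLIES", <score>, 0), or None below the cutoff
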
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ pandas
+ numpy
+ tabula-py
+ openai
+ gradio
+ pyPDF2
+ marker-pdf
+ groq
+ bs4
+ nltk
+ tiktoken
+ pdf2image
+ gdown
+ google-generativeai
+ google-auth
+ google-auth-oauthlib
+ google-auth-httplib2
+ google-api-python-client
+ chromadb
+ rapidfuzz
script_pdf_to_img.py ADDED
@@ -0,0 +1,4 @@
+ import os
+ # pdf2image needs poppler-utils at runtime.
+ os.system("apt-get update && apt-get install -y poppler-utils")
+ from pdfparser_hq import pdfs_folder_to_images
+ pdfs_folder_to_images("data")
uploader.py ADDED
@@ -0,0 +1,38 @@
+ import os
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ from googleapiclient.http import MediaFileUpload
+
+
+ def save_logs(path_to_data_to_save, name_to_save, folder_id="16Vv728HPW2J0BYzgTaBV00nUEc5pRKT-"):
+     """Upload a local file to a Google Drive folder and return the created file resource."""
+     filename = path_to_data_to_save
+     SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'
+     SCOPES = ['https://www.googleapis.com/auth/drive.file']
+
+     credentials = service_account.Credentials.from_service_account_file(
+         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
+     service = build('drive', 'v3', credentials=credentials)
+
+     file_metadata = {
+         'name': name_to_save,   # Name of the file to be uploaded
+         'parents': [folder_id]  # Destination folder ID
+     }
+
+     # Create a MediaFileUpload object to upload the file.
+     media = MediaFileUpload(filename)
+
+     file = service.files().create(
+         body=file_metadata,
+         media_body=media,
+         fields='id'
+     ).execute()
+
+     # Print the file ID of the uploaded file.
+     print('Saved in Google Drive - File ID: %s' % file.get('id'))
+     return file
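
save_logs returns the created Drive file resource (requested with fields='id'); a minimal call sketch with a placeholder folder ID:

from uploader import save_logs

uploaded = save_logs("report.xlsx", "report.xlsx", folder_id="FOLDER_ID")
print("https://drive.google.com/file/d/" + uploaded["id"])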