Upload 3 files

- app.py +41 -0
- main.py +41 -0
- reconciliate_and_upload.py +140 -0
app.py
ADDED
```python
from config import json_url_id
from main import run_main
import gdown
import gradio as gr


# Fetch the Google service-account credentials from Drive at startup.
download_url = f'https://drive.google.com/uc?id={json_url_id}'
output = 'secret_google_service_account.json'
gdown.download(download_url, output, quiet=False)


with gr.Blocks(title="Accountant automation", theme='nota-ai/theme') as demo:
    with gr.Row():
        # Left column: the six pipeline inputs.
        with gr.Column(scale=6):
            with gr.Row():
                with gr.Column(scale=3):
                    source_folder_with_reciepts = gr.Textbox(placeholder="Folder containing the invoices", lines=1)
                    link_to_csv = gr.Textbox(placeholder="Link to the bank statement CSV", lines=1)
                    folder_to_save_processed_reciepts = gr.Textbox(placeholder="Folder where processed invoices are saved", lines=1)
                    folder_to_save_reconciled_data = gr.Textbox(placeholder="Folder where the final spreadsheet is saved", lines=1)
                    name_output_file = gr.Textbox(placeholder="Name of the final spreadsheet file", lines=1)
                    transaction_csv_path = gr.Textbox(placeholder="Path to the transaction CSV", lines=1)
                    chat_submit_button = gr.Button(value="Submit ▶")

        # Right column: the Markdown link to the finished spreadsheet.
        with gr.Column(scale=6):
            chat_output = gr.Markdown("Press Submit to start processing")

    fn_chat = run_main

    chat_submit_button.click(
        fn=fn_chat,
        inputs=[source_folder_with_reciepts, link_to_csv, folder_to_save_processed_reciepts,
                folder_to_save_reconciled_data, name_output_file, transaction_csv_path],
        outputs=[chat_output],
    )

demo.launch(max_threads=40)
```
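Since `fn_chat` is just `run_main`, the pipeline can be smoke-tested without the UI. A minimal sketch, assuming the placeholder Drive URLs below are swapped for real ones:

```python
# Hypothetical smoke test: call the click handler directly, bypassing Gradio.
# All Drive URLs/IDs here are placeholders, not values from this repo.
from main import run_main

markdown_link = run_main(
    "https://drive.google.com/drive/folders/<receipts-folder-id>",
    "https://drive.google.com/file/d/<statement-csv-id>/view?usp=sharing",
    "https://drive.google.com/drive/folders/<processed-folder-id>",
    "https://drive.google.com/drive/folders/<output-folder-id>",
    "June 2024",
    "downloaded_file.csv",
)
print(markdown_link)  # "[View final spreadsheet](https://drive.google.com/file/d/...)"
```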
main.py
ADDED
```python
# USER INPUT
from preprocessing import run_preprocessing
from ai_transcriber import transcribe_all
from reconciliate_and_upload import reconciliate_and_upload
import os


def run_main(
        source_folder_with_reciepts,
        link_to_csv,
        folder_to_save_processed_reciepts,
        folder_to_save_reconciled_data,
        name_output_file="[AI generated] June 2024.xlsx",
        transaction_csv_path='downloaded_file.csv',
        data_path="trial2",
):
    # poppler-utils provides the pdftoppm/pdftocairo binaries used to
    # rasterise the PDF invoices during preprocessing.
    os.system("apt update; yes | apt-get install poppler-utils")

    # Reduce the pasted Drive URLs to bare folder/file IDs.
    source_folder_with_reciepts = source_folder_with_reciepts.split("?")[0].split("/")[-1]
    folder_to_save_processed_reciepts = folder_to_save_processed_reciepts.split("?")[0].split("/")[-1]
    folder_to_save_reconciled_data = folder_to_save_reconciled_data.split("?")[0].split("/")[-1]
    link_to_csv = link_to_csv.split("/view?")[0].split("/")[-1]
    print("Extracted link csv id: ", link_to_csv)

    # Guarantee exactly one ".xlsx" suffix on the output file name.
    name_output_file = name_output_file + ".xlsx"
    name_output_file = name_output_file.replace(".xlsx.xlsx", ".xlsx")

    run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv)
    print("Done pre-processing!")
    transcribe_all(data_path)
    print("Done transcription!")
    id_output = reconciliate_and_upload(
        data_path,
        name_of_csv=transaction_csv_path,
        folder_to_save_processed_reciepts=folder_to_save_processed_reciepts,
        folder_to_save_reconciled_data=folder_to_save_reconciled_data,
        name_of_output=name_output_file)

    url_output_file = "https://drive.google.com/file/d/" + str(id_output)
    display = f"[View final spreadsheet]({url_output_file})"
    print("Done all!")
    return display
```
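The string handling at the top of `run_main` accepts either a bare ID or a full Drive URL and reduces it to the ID. A quick illustration with made-up links:

```python
# Folder links: strip the query string, keep the last path segment.
folder_url = "https://drive.google.com/drive/folders/1AbCdEfG?usp=sharing"
assert folder_url.split("?")[0].split("/")[-1] == "1AbCdEfG"

# File links: cut at "/view?", keep the last path segment.
file_url = "https://drive.google.com/file/d/9ZyXwVu/view?usp=sharing"
assert file_url.split("/view?")[0].split("/")[-1] == "9ZyXwVu"

# A bare ID passes through unchanged.
assert "1AbCdEfG".split("?")[0].split("/")[-1] == "1AbCdEfG"
```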
reconciliate_and_upload.py
ADDED
```python
from uploader import save_logs
import os
import pandas as pd
from rapidfuzz import process, fuzz
from random import randint
from time import sleep
# other imports (Google API) are pulled in via uploader


def fuzzy_match(row, choices, scorer, cutoff):
    """Best fuzzy match between a statement label and the supplier names."""
    match = process.extractOne(row["Libellé d'opération"], choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    return "missing receipt"


def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts="transcript_raw.txt",
                            name_of_output="[AI Generated] Output.xlsx"):

    # The transcript file holds a Python-literal list of model outputs;
    # eval() trusts its contents entirely.
    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        transcripts = eval(file.read())

    # Collect every rasterised invoice image produced by preprocessing.
    imgs = []
    path_to_pdfs = data_path
    for root, dirs, files in os.walk(path_to_pdfs):
        for file in files:
            if file.endswith('.png'):
                print(os.path.join(root, file))
                imgs.append({"path": os.path.join(root, file)})

    # Parse each transcript and upload the matching image to Drive.
    list_transcripts_evaled = []
    objects = []
    for i, t in enumerate(transcripts):
        content = eval(t["content"].replace('null', '-1'))
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            # Most likely a Drive rate limit: back off, then retry once.
            print("Upload failed, sleeping before retry")
            sleep(randint(30, 40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)

        objects.append(obk)
        print("uploaded image!")
        list_transcripts_evaled.append({
            "path": imgs[i]["path"],
            "name_of_supplier": content["name_of_supplier"],
            "amount": content["amount"],
            "currency": content["currency"],
            "date": content["date"]})

    urls = ["https://drive.google.com/file/d/" + ob["id"] for ob in objects]

    df_app = pd.DataFrame(list_transcripts_evaled)

    # Extract the numeric part of the amount, normalise separators, cast to float.
    float_regex = r'[-+]?\d*\.\d+|\d+'
    df_app['amount'] = df_app['amount'].astype(str).str.extract(f'({float_regex})', expand=False)
    df_app['amount'] = (
        df_app['amount']
        .str.replace(" ", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.replace("N/A", "-1", regex=False)
        .astype(float)
    )

    df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
    df_app["url"] = urls
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    # Bank statement CSV: the first three rows are banner/header lines.
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)

    # Same numeric normalisation for the debit and credit columns.
    for col in ["Débit", "Crédit"]:
        df_opp_app[col] = df_opp_app[col].astype(str).str.extract(f'({float_regex})', expand=False)
        df_opp_app[col] = (
            df_opp_app[col]
            .str.replace(" ", "", regex=False)
            .str.replace(",", ".", regex=False)
            .str.replace("N/A", "-1", regex=False)
            .astype(float)
        )

    df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')

    # Merge statement lines with transcribed invoices on the amount (Débit vs amount).
    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount',
                             how='left', suffixes=('_ops', '_df'))

    # Fuzzy-match each statement label against the (uppercased) supplier names.
    choices = [r.upper() for r in df_app['name_of_supplier'].tolist()]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(
        lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])

    # Invoices that never matched a statement line go to a residuals sheet.
    df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()

    # Keep the fuzzy-matched supplier under its output column name, then drop
    # the working columns.
    merged_df_app["Invoice supplier name"] = merged_df_app["fuzzy_matched_supplier"]
    merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"], inplace=True)
    df_residuals_app.drop(columns=["path"], inplace=True)

    # Turn the Drive URLs into clickable Excel hyperlinks.
    merged_df_app['url'] = merged_df_app['url'].apply(
        lambda x: f'=HYPERLINK("{x}", "View invoice")' if pd.notna(x) else '')
    df_residuals_app['url'] = df_residuals_app['url'].apply(
        lambda x: f'=HYPERLINK("{x}", "View invoice")' if pd.notna(x) else '')

    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Reconciled data', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Residuals and unmatched transactions', index=False)

    # Upload the workbook and return its Drive file id.
    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)

    return id_output
```
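The reconciliation hinges on `process.extractOne` with `fuzz.WRatio` and a cutoff of 80. A standalone sketch of its return contract, using invented supplier names:

```python
from rapidfuzz import process, fuzz

# Invented data: a statement label and transcribed supplier names.
choices = ["AMAZON EU SARL", "TOTALENERGIES", "ORANGE SA"]
label = "PAIEMENT CB AMAZON EU SARL"

# extractOne returns a (choice, score, index) tuple, or None when no choice
# scores at or above score_cutoff -- which is why fuzzy_match falls back to
# the "missing receipt" sentinel.
match = process.extractOne(label, choices, scorer=fuzz.WRatio, score_cutoff=80)
print(match[0] if match else "missing receipt")
```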