Delete reconciliate_and_upload.py
reconciliate_and_upload.py
DELETED
@@ -1,106 +0,0 @@
from uploader import save_logs

import ast
import os
from random import randint
from time import sleep

import pandas as pd
from rapidfuzz import process, fuzz

# Other imports (Google API)


def fuzzy_match(row, choices, scorer, cutoff):
    """Return the best fuzzy match for a bank-statement label, or a placeholder."""
    match = process.extractOne(row["Libellé d'opération"], choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    return "missing receipt"


def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts="transcript_raw.txt",
                            name_of_output="[AI Generated] Output.xlsx"):

    # Load the raw transcripts; ast.literal_eval is a safer drop-in for eval here.
    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        transcripts = ast.literal_eval(file.read())

    # Collect every receipt image found under data_path.
    imgs = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.png'):
                print(os.path.join(root, file))
                imgs.append({"path": os.path.join(root, file)})

    list_transcripts_evaled = []
    objects = []
    for i, t in enumerate(transcripts):
        # Transcripts are JSON-like dict literals; map JSON null to -1 before parsing.
        content = ast.literal_eval(t["content"].replace('null', '-1'))
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            # Most likely a rate limit: back off 30-40 s, then retry once.
            print("Upload failed, backing off before retrying...")
            sleep(randint(30, 40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)

        objects.append(obk)
        print("uploaded image!")
        list_transcripts_evaled.append({
            "path": imgs[i]["path"],
            "name_of_supplier": content["name_of_supplier"],
            "amount": content["amount"],
            "currency": content["currency"],
            "date": content["date"]})

    urls = ["https://drive.google.com/file/d/" + ob["id"] for ob in objects]

    # Normalise the extracted receipt data.
    df_app = pd.DataFrame(list_transcripts_evaled)
    df_app["amount"] = (df_app["amount"].astype(str)
                        .str.replace(" ", "")
                        .str.replace(",", ".")
                        .str.replace("N/A", "-1")
                        .astype("float"))
    df_app["date"] = pd.to_datetime(df_app["date"], format="%d/%m/%Y", errors="coerce")
    df_app["url"] = urls
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    # Normalise the bank-statement CSV (the first three rows are header noise).
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
    df_opp_app["Débit"] = df_opp_app["Débit"].str.replace(" ", "").str.replace(",", ".").astype("float")
    df_opp_app["Crédit"] = df_opp_app["Crédit"].str.replace(" ", "").str.replace(",", ".").astype("float")
    df_opp_app["Date"] = pd.to_datetime(df_opp_app["Date"], format="%d/%m/%Y", errors="coerce")

    # Merge on amount (Débit and amount)
    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount',
                             how='left', suffixes=('_ops', '_df'))

    # Apply fuzzy matching between statement labels and (uppercased) supplier names.
    choices = [r.upper() for r in df_app['name_of_supplier'].tolist()]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(
        lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])

    # Identify residuals in df_app that were not matched to any transaction.
    df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()

    # Replace the original supplier column with fuzzy_matched_supplier, then drop helper columns.
    merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
    merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"],
                       inplace=True)
    df_residuals_app.drop(columns=["path"], inplace=True)

    # Turn Drive URLs into clickable Excel hyperlinks.
    merged_df_app['url'] = merged_df_app['url'].apply(
        lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
    df_residuals_app['url'] = df_residuals_app['url'].apply(
        lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')

    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    # Upload the workbook itself and return its Drive file id.
    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)

    return id_output