Adr740 committed on
Commit
9b91941
·
verified ·
1 Parent(s): 1d14876

Delete reconciliate_and_upload.py

Browse files
Files changed (1) hide show
  1. reconciliate_and_upload.py +0 -106
reconciliate_and_upload.py DELETED
@@ -1,106 +0,0 @@
1
- from uploader import save_logs
2
- import os
3
- import pandas as pd
4
- from rapidfuzz import process, fuzz
5
- from random import randint
6
- from time import sleep
7
- #other imports (google API)
8
def fuzzy_match(row, choices, scorer, cutoff):
    """Best fuzzy match for a statement row's operation label.

    Looks up the row's "Libellé d'opération" field among *choices* with
    rapidfuzz's ``process.extractOne`` using *scorer*; candidates scoring
    below *cutoff* are discarded.

    Returns the matched choice string, or the sentinel "missing receipt"
    when nothing reaches the cutoff.
    """
    best = process.extractOne(row['Libellé d\'opération'], choices,
                              scorer=scorer, score_cutoff=cutoff)
    if best is None:
        return "missing receipt"
    return best[0]
def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts = "transcript_raw.txt",
                            name_of_output = "[AI Generated] Output.xlsx" ):
    """Reconcile bank-statement rows with receipt transcripts and upload the result.

    Reads AI-generated receipt transcripts and the matching receipt PNGs from
    *data_path*, uploads each receipt image to Drive via ``save_logs``, merges
    the receipt data with the bank-statement CSV on the debit amount, fuzzy
    matches supplier names, and writes a two-sheet Excel workbook (reconciled
    rows + unmatched residuals) which is itself uploaded.

    Parameters:
        data_path: directory holding the transcripts file, the receipt PNGs
            and the bank CSV.
        name_of_csv: bank-statement CSV file name (first 3 rows are a banner).
        folder_to_save_processed_reciepts: Drive folder id for receipt images.
        folder_to_save_reconciled_data: Drive folder id for the final workbook.
        name_of_raw_transcripts: transcripts file name (a Python-literal list
            of dicts with a "content" field).
        name_of_output: name of the Excel workbook written to the CWD.

    Returns:
        The id returned by ``save_logs`` for the uploaded workbook.
    """
    import ast  # stdlib; safe replacement for the original eval() calls

    # NOTE(review): the transcript file is assumed to contain Python
    # literals only (list/dict/str/num); ast.literal_eval parses those
    # without executing arbitrary code, unlike the eval() it replaces.
    # Confirm the generator never emits non-literal expressions.
    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        transcripts = ast.literal_eval(file.read())

    # Collect every PNG receipt image found under data_path.
    imgs = []
    for root, _dirs, files in os.walk(data_path):
        for fname in files:
            if fname.endswith('.png'):
                img_path = os.path.join(root, fname)
                print(img_path)
                imgs.append({"path": img_path})

    list_transcripts_evaled = []
    for i, t in enumerate(transcripts):
        # 'null' comes from the model output; -1 is the project's missing marker.
        content = ast.literal_eval(t["content"].replace('null', '-1'))

        # Upload the receipt image; retry once after a pause if the Drive
        # API rate-limits us (breakpoint() debug stop removed).
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            print("sleeping a bit innit")
            sleep(randint(30,40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        print("uploaded image!")

        # Attach the Drive URL to the record itself so rows and URLs can
        # never get out of step (the original built a separate `urls` list
        # over *all* uploads, which mismatched df_app when a transcript
        # was skipped below).
        try:
            list_transcripts_evaled.append({
                "path": imgs[i]["path"],
                "name_of_supplier": content["name_of_supplier"],
                "amount": content["amount"],
                "currency": content["currency"],
                "date": content["date"],
                "url": "https://drive.google.com/file/d/" + obk["id"]})
        except KeyError as err:
            # Transcript missing an expected field: skip it instead of
            # dropping into the debugger (breakpoint() removed).
            print(f"transcript {i} missing field: {err}")

    # Normalise receipt data: French number formatting -> float, dd/mm/yyyy dates.
    df_app = pd.DataFrame(list_transcripts_evaled)
    df_app["amount"] = df_app["amount"].astype(str).str.replace(" ", "").str.replace(",", ".").str.replace("N/A", "-1").astype("float")
    df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    # Bank statement: skip the 3-row banner; same French number formatting.
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
    df_opp_app["Débit"] = df_opp_app["Débit"].str.replace(" ", "").str.replace(",", ".").astype("float")
    df_opp_app["Crédit"] = df_opp_app["Crédit"].str.replace(" ", "").str.replace(",", ".").astype("float")
    df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')

    # Merge statement rows with receipts on the debit amount.  (The first
    # merge in the original was dead code -- its result, including the
    # fillna, was immediately overwritten -- and has been removed.)
    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount', how='left', suffixes=('_ops', '_df'))

    # Fuzzy-match the statement label against upper-cased supplier names.
    choices = [r.upper() for r in df_app['name_of_supplier'].tolist()]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])

    # Receipts that never matched any statement row.  .copy() so the
    # inplace drop below mutates a real frame, not a view of df_app.
    df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()

    # Replace the raw supplier column with the fuzzy-matched one, then drop
    # the working columns before export.
    merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
    merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"], inplace=True)
    df_residuals_app.drop(columns=["path"], inplace=True)

    # Excel HYPERLINK formulas so reviewers can open each receipt from the sheet.
    def _linkify(x):
        return f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else ''
    merged_df_app['url'] = merged_df_app['url'].apply(_linkify)
    df_residuals_app['url'] = df_residuals_app['url'].apply(_linkify)

    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)
    return id_output