Upload 8 files
- ai_transcriber.py +73 -0
- main.py +40 -0
- pdfparser_hq.py +108 -0
- preprocessing.py +64 -0
- reconciliate_and_upload.py +106 -0
- requirements.txt +20 -0
- script_pdf_to_img.py +4 -0
- uploader.py +38 -0
ai_transcriber.py
ADDED
@@ -0,0 +1,73 @@
# input: data_path
import os
from pdfparser_hq import encode_image
from config import openai_api
from openai import OpenAI

def transcribe_all(data_path,
                   name_of_raw_transcripts="transcript_raw.txt"):
    imgs = []
    client = OpenAI(api_key=openai_api)
    transcripts = []

    system_prompt = """
    You will be given a receipt that could be handwritten or properly formatted. Your goal is to transcribe what is written in JSON following this format:

    {
    "name_of_supplier" : X,
    "amount" : X,
    "currency": X,
    "date" : DD/MM/YYYY
    }

    Make sure you provide the total amount and the correct dates, handwritten ones might be tricky. This will be used to reconcile with banking transactions.
    """

    # Collect every rasterized receipt page under data_path
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.png'):
                print(os.path.join(root, file))
                imgs.append({"path": os.path.join(root, file)})

    for i, img in enumerate(imgs):
        filename = img["path"]
        base64_image = encode_image(img["path"])

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            }
                        }
                    ]
                }
            ],
            temperature=1,
            max_tokens=1877,
            top_p=1,
            response_format={"type": "json_object"},
            frequency_penalty=0,
            presence_penalty=0
        ).choices[0].message.content
        transcripts.append({"path": filename, "filename": f"P{i+1}.png", "content": response})
        print(f"done transcribing transcript: {i+1}/{len(imgs)}")
    with open(f"{data_path}/{name_of_raw_transcripts}", 'w') as file:
        file.write(str(transcripts))
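Note: both ai_transcriber.py and pdfparser_hq.py import `openai_api` from a `config` module that is not among the 8 uploaded files. A minimal sketch of what it might contain, assuming the key is injected through an environment variable (the variable name is an assumption, not from this commit):

# config.py (hypothetical; not part of this upload)
import os

# Assumption: the OpenAI key is provided as a Space secret / environment variable
openai_api = os.environ["OPENAI_API_KEY"]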
main.py
ADDED
@@ -0,0 +1,40 @@
# USER INPUT
from preprocessing import run_preprocessing
from ai_transcriber import transcribe_all
from reconciliate_and_upload import reconciliate_and_upload
import os

def run_main(
        source_folder_with_reciepts="https://drive.google.com/drive/folders/1skbgiXMnAe3z2r8E9oLAxXxkDBnrk8l4?usp=sharing",
        link_to_csv="https://drive.google.com/file/d/1cYoj8U5mttwQu5hNoupifHtjESCIHpsp/view?usp=sharing",
        folder_to_save_processed_reciepts="https://drive.google.com/drive/folders/1zADJlZ8pvXHNdAhbrxScPynSq1m5Jo1C?usp=sharing",
        folder_to_save_reconciled_data="https://drive.google.com/drive/folders/1bmrHExKt0x5AJwJsMtwW1Yk6Hof4WbCF?usp=drive_link",
        name_output_file="[AI generated] June 2024.xlsx",
        transaction_csv_path='downloaded_file.csv',
        data_path="trial2"
):
    # poppler-utils is required by pdf2image
    os.system("apt update; yes | apt-get install poppler-utils; yes | ls")

    # Reduce the Google Drive share links to their bare IDs
    source_folder_with_reciepts = source_folder_with_reciepts.split("?")[0].split("/")[-1]
    folder_to_save_processed_reciepts = folder_to_save_processed_reciepts.split("?")[0].split("/")[-1]
    folder_to_save_reconciled_data = folder_to_save_reconciled_data.split("?")[0].split("/")[-1]
    link_to_csv = link_to_csv.split("/view?")[0].split("/")[-1]
    print("Extracted link csv id: ", link_to_csv)
    # Ensure exactly one .xlsx suffix on the output name
    name_output_file = name_output_file + ".xlsx"
    name_output_file = name_output_file.replace(".xlsx.xlsx", ".xlsx")

    run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv)
    print("Done pre-processing!")
    transcribe_all(data_path)
    print("Done transcription!")
    id_output = reconciliate_and_upload(
        data_path,
        name_of_csv=transaction_csv_path,
        folder_to_save_processed_reciepts=folder_to_save_processed_reciepts,
        folder_to_save_reconciled_data=folder_to_save_reconciled_data,
        name_of_output=name_output_file)

    url_output_file = "https://drive.google.com/file/d/" + str(id_output)
    print("Done all!")
    return url_output_file
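The commit does not include the Space's entry point, so how run_main is exposed is not shown. A minimal sketch of an app.py, assuming a Gradio interface (gradio is in requirements.txt; every label below is hypothetical, and the last two run_main parameters keep their defaults):

# app.py (hypothetical; not part of this upload)
import gradio as gr
from main import run_main

# Gradio passes one positional argument per input component to run_main
demo = gr.Interface(
    fn=run_main,
    inputs=[
        gr.Textbox(label="Drive folder with receipts (share link)"),
        gr.Textbox(label="Drive link to transactions CSV"),
        gr.Textbox(label="Drive folder for processed receipts"),
        gr.Textbox(label="Drive folder for reconciled output"),
        gr.Textbox(label="Output file name"),
    ],
    outputs="text",  # the Drive URL of the reconciled workbook
)

if __name__ == "__main__":
    demo.launch()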
pdfparser_hq.py
ADDED
@@ -0,0 +1,108 @@
from openai import OpenAI
import pdf2image
import base64
import os
import time
from config import openai_api

client = OpenAI(api_key=openai_api)


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def pdf_to_image(path_to_pdf, get_output_in_code=False):
    paths_to_img = []
    print("Converting pdf to img")
    start_time = time.time()
    images = pdf2image.convert_from_path(path_to_pdf, dpi=100)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Conversion complete")
    print("Execution time: {:.2f} seconds".format(execution_time))
    os.makedirs(path_to_pdf.replace(".pdf", ""), exist_ok=True)
    save_path = path_to_pdf.replace(".pdf", "") + "/png/"
    print("Creating directory to store images")
    os.makedirs(save_path, exist_ok=True)
    print("Directory created : ", save_path)
    for i, image in enumerate(images):
        print(f"saving page {i} in {save_path}{i}_page.png")
        image.save(f"{save_path}{i}_page.png", "PNG")
        paths_to_img.append(f"{save_path}{i}_page.png")
    if get_output_in_code:
        return images, paths_to_img


def pdfs_folder_to_images(input_path, get_output_in_code=False):
    pdf_files = []
    images = {}
    for root, dirs, files in os.walk(input_path):
        for file in files:
            if file.endswith('.pdf'):
                print("FILE IS ", os.path.join(root, file))
                pdf_files.append(os.path.join(root, file))
                if get_output_in_code:
                    images[os.path.join(root, file)] = pdf_to_image(os.path.join(root, file), get_output_in_code=True)
                else:
                    pdf_to_image(os.path.join(root, file))

    if get_output_in_code:
        return images


def img_to_txt(img):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "Your task is to transcribe and explain in English every single thing from screenshots sent by users"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{img}",
                        }
                    }
                ]
            }
        ],
        temperature=1,
        max_tokens=1999,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    ).choices[0].message.content
    return response


def img_to_txt_gemini(img):
    # Placeholder: Gemini-based transcription is not implemented in this commit
    return ""


def process_pdf_hq(path, get_output_in_code=True):
    converted_pdf_router = pdfs_folder_to_images(path, get_output_in_code=True)
    path_extracted_pdf = path + "/extracted_pdf/"
    os.makedirs(path_extracted_pdf, exist_ok=True)
    content_extracted = {}
    for link in list(converted_pdf_router.keys()):
        print("Working on ", link)
        content_extracted[link] = []

        for img_path in converted_pdf_router[link][1]:
            print("Processing subimage")
            base64_image = encode_image(img_path)
            content = img_to_txt(base64_image)
            print(img_path)
            content_extracted[link].append(content)
            with open(f"{path_extracted_pdf}/PDF_FILE_{img_path.replace('/','_').replace('.','_')}.txt", "w") as fil:
                fil.write(content)
    if get_output_in_code:
        return content_extracted
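For reference, a quick standalone check of the converter on a single file (the PDF path is a placeholder; with get_output_in_code=True the function returns the PIL images alongside the saved PNG paths):

# hypothetical quick check, not part of the commit
from pdfparser_hq import pdf_to_image

images, paths = pdf_to_image("data/receipt.pdf", get_output_in_code=True)  # placeholder path
print(paths)  # e.g. ['data/receipt/png/0_page.png', ...]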
preprocessing.py
ADDED
@@ -0,0 +1,64 @@
# all in python script obv

import os
from pdfparser_hq import pdfs_folder_to_images

from google.oauth2 import service_account
from googleapiclient.discovery import build
import io
from googleapiclient.http import MediaIoBaseDownload
import gdown

def list_files_in_folder(service, folder_id):
    results = service.files().list(
        q=f"'{folder_id}' in parents and (mimeType='application/pdf' or mimeType='image/png')",
        pageSize=1000,
        fields="nextPageToken, files(id, name)"
    ).execute()
    items = results.get('files', [])
    return items

def download_file(service, file_id, file_name, save_path):
    request = service.files().get_media(fileId=file_id)
    fh = io.FileIO(os.path.join(save_path, file_name), 'wb')
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print(f"Download {file_name} {int(status.progress() * 100)}%.")


def download_files_from_folder(service, folder_id, save_path):
    files = list_files_in_folder(service, folder_id)
    for file in files:
        download_file(service, file['id'], file['name'], save_path)


def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv):
    """Download the receipts and the transactions CSV, then rasterize all PDFs.

    Args:
        data_path (str): local path where the data is saved
        source_folder_with_reciepts (str): Drive folder ID where the receipts are stored
        link_to_csv (str): Drive file ID of the transactions CSV
    """
    os.makedirs(data_path, exist_ok=True)
    full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
    print(full_link_to_csv)
    transaction_csv_path = f'{data_path}/downloaded_file.csv'
    gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)
    SCOPES = ['https://www.googleapis.com/auth/drive']
    SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'

    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    download_files_from_folder(service, source_folder_with_reciepts, data_path)

    pdfs_folder_to_images(data_path)
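One operational detail: a service account only sees Drive folders that have been shared with its client_email. A small sketch to verify access before running the full pipeline (the folder ID is a placeholder; an empty list usually means the folder was not shared with the service account):

# hypothetical access check, not part of the commit
from google.oauth2 import service_account
from googleapiclient.discovery import build
from preprocessing import list_files_in_folder

creds = service_account.Credentials.from_service_account_file(
    'secret_google_service_account.json', scopes=['https://www.googleapis.com/auth/drive'])
service = build('drive', 'v3', credentials=creds)
print(list_files_in_folder(service, "YOUR_FOLDER_ID"))  # placeholder folder ID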
reconciliate_and_upload.py
ADDED
@@ -0,0 +1,106 @@
from uploader import save_logs
import os
import pandas as pd
from rapidfuzz import process, fuzz
from random import randint
from time import sleep
# other imports (google API)

def fuzzy_match(row, choices, scorer, cutoff):
    match = process.extractOne(row['Libellé d\'opération'], choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    return "missing receipt"

def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts="transcript_raw.txt",
                            name_of_output="[AI Generated] Output.xlsx"):

    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        transcripts = eval(file.read())  # the transcript file is a repr of a list of dicts

    imgs = []
    path_to_pdfs = data_path

    for root, dirs, files in os.walk(path_to_pdfs):
        for file in files:
            if file.endswith('.png'):
                print(os.path.join(root, file))
                imgs.append({"path": os.path.join(root, file)})

    list_transcripts_evaled = []
    objects = []
    for i, t in enumerate(transcripts):
        content = eval(t["content"].replace('null', '-1'))
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            # Back off and retry once, e.g. on a Drive rate limit
            print("upload failed, sleeping before retry")
            sleep(randint(30, 40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)

        objects.append(obk)
        print("uploaded image!")
        # A transcript missing one of these keys will raise a KeyError here
        list_transcripts_evaled.append({
            "path": imgs[i]["path"],
            "name_of_supplier": content["name_of_supplier"],
            "amount": content["amount"],
            "currency": content["currency"],
            "date": content["date"]})
    urls = []
    for ob in objects:
        url = "https://drive.google.com/file/d/" + ob["id"]
        urls.append(url)
    df_app = pd.DataFrame(list_transcripts_evaled)
    df_app["amount"] = df_app["amount"].astype(str).str.replace(" ", "").str.replace(",", ".").str.replace("N/A", "-1").astype("float")
    df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
    df_app["url"] = urls
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    # The bank export carries three header lines before the column names
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
    df_opp_app["Débit"] = df_opp_app["Débit"].str.replace(" ", "").str.replace(",", ".").astype("float")
    df_opp_app["Crédit"] = df_opp_app["Crédit"].str.replace(" ", "").str.replace(",", ".").astype("float")
    df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')

    # Merge on amount (Débit and amount)
    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount', how='left', suffixes=('_ops', '_df'))

    # Apply fuzzy matching between the bank statement label and the transcribed supplier names
    raw_choices = df_app['name_of_supplier'].tolist()
    choices = [r.upper() for r in raw_choices]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])
    # Identify residuals in df that were not matched
    df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()

    # Replace the original supplier column with fuzzy_matched_supplier, then drop the
    # transcript-side columns; only the url column survives into the reconciled sheet
    merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
    merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"], inplace=True)
    df_residuals_app.drop(columns=["path"], inplace=True)
    merged_df_app['url'] = merged_df_app['url'].apply(lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
    df_residuals_app['url'] = df_residuals_app['url'].apply(lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')

    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    # save_logs returns the Drive file resource; keep only its id for the share URL
    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)["id"]

    return id_output
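For context on the matching step, a small standalone sketch of what process.extractOne returns with fuzz.WRatio at the cutoff of 80 used above (the strings are made up for illustration):

# hypothetical demo, not part of the commit
from rapidfuzz import process, fuzz

choices = ["AMAZON EU SARL", "UBER BV", "TOTAL ENERGIES"]
# Returns (best_match, score, index) when the score clears the cutoff, else None
print(process.extractOne("CB AMAZON EU S.A.R.L 12/06", choices, scorer=fuzz.WRatio, score_cutoff=80))
print(process.extractOne("VIREMENT SALAIRE", choices, scorer=fuzz.WRatio, score_cutoff=80))  # typically None at this cutoff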
requirements.txt
ADDED
@@ -0,0 +1,20 @@
pandas
numpy
tabula-py
openai
gradio
pyPDF2
marker-pdf
groq
bs4
nltk
tiktoken
pdf2image
gdown
google-generativeai
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client
chromadb
rapidfuzz
script_pdf_to_img.py
ADDED
@@ -0,0 +1,4 @@
import os
# poppler-utils is required by pdf2image
os.system("apt update; yes | apt-get install poppler-utils; yes | ls")
from pdfparser_hq import pdfs_folder_to_images
pdfs_folder_to_images("data")
uploader.py
ADDED
@@ -0,0 +1,38 @@
import os
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

def save_logs(path_to_data_to_save, name_to_save, folder_id="16Vv728HPW2J0BYzgTaBV00nUEc5pRKT-"):

    filename = path_to_data_to_save
    SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'

    SCOPES = ['https://www.googleapis.com/auth/drive.file']

    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)

    service = build('drive', 'v3', credentials=credentials)

    file_metadata = {
        'name': name_to_save,  # Name of the file to be uploaded
        'parents': [folder_id]  # Folder ID
    }

    file_path = filename

    # Create a MediaFileUpload object to upload the file
    media = MediaFileUpload(file_path)

    file = service.files().create(
        body=file_metadata,
        media_body=media,
        fields='id'
    ).execute()

    # Print the file ID of the uploaded file
    print('Saved in Google Drive - File ID: %s' % file.get('id'))
    return file
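A hypothetical call, for reference (the local path and folder ID below are placeholders; save_logs returns the Drive file resource, so the id must be read out of it):

# hypothetical usage, not part of the commit
from uploader import save_logs

uploaded = save_logs("out/report.xlsx", "report.xlsx", folder_id="YOUR_FOLDER_ID")  # placeholders
print("https://drive.google.com/file/d/" + uploaded["id"])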