# Hugging Face Spaces status when this file was captured: Runtime error
# Everything below is a plain Python script.
import os | |
from pdfparser_hq import pdfs_folder_to_images | |
from google.oauth2 import service_account | |
from googleapiclient.discovery import build | |
import io | |
from googleapiclient.http import MediaIoBaseDownload | |
import gdown | |
def list_files_in_folder(service, folder_id):
    """Return every PDF and PNG file located directly in a Drive folder.

    Args:
        service: Drive API client object; only
            ``service.files().list(...).execute()`` is called on it.
        folder_id: ID of the parent folder, interpolated into the query.

    Returns:
        A list of the ``files`` entries from all result pages. Each entry
        carries the ``id`` and ``name`` fields requested via ``fields``.
    """
    query = (
        f"'{folder_id}' in parents and "
        "(mimeType='application/pdf' or mimeType='image/png')"
    )
    items = []
    # None on the first request; afterwards the token returned by the
    # previous page. Without following it, only the first page is returned.
    page_token = None
    while True:
        results = service.files().list(
            q=query,
            pageSize=1000,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return items
def download_file(service, file_id, file_name, save_path):
    """Download one Drive file to ``save_path/file_name``, printing progress.

    Args:
        service: Drive API client object; ``service.files().get_media`` is
            used to build the download request.
        file_id: ID of the file to fetch.
        file_name: Name of the local file created inside ``save_path``.
        save_path: Existing directory the file is written into.
    """
    request = service.files().get_media(fileId=file_id)
    target_path = os.path.join(save_path, file_name)
    # The context manager closes the handle even when a chunk request
    # raises; the original left the file open.
    with io.FileIO(target_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f"Download {file_name} {int(status.progress() * 100)}%.")
def download_files_from_folder(service, folder_id, save_path):
    """Download each file listed for ``folder_id`` into ``save_path``.

    The listing comes from ``list_files_in_folder``; every returned entry is
    passed to ``download_file`` using its ``id`` and ``name`` fields.
    """
    for entry in list_files_in_folder(service, folder_id):
        file_id, file_name = entry['id'], entry['name']
        download_file(service, file_id, file_name, save_path)
def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv):
    """Fetch the transactions CSV and the receipt files, then process the PDFs.

    Steps, in order: create ``data_path``, download the CSV with ``gdown``,
    download the receipt folder through the Drive API, and finally hand
    ``data_path`` to ``pdfs_folder_to_images``.

    Args:
        data_path: Directory where all downloaded data is saved; created if
            it does not exist.
        source_folder_with_reciepts: Drive folder ID where the receipts are
            stored.
        link_to_csv: Drive file ID of the CSV; it is inserted as the ``id``
            parameter of a ``https://drive.google.com/uc`` URL.
    """
    # Leftover breakpoint() calls were removed: they drop into the debugger
    # and abort any non-interactive run.
    os.makedirs(data_path, exist_ok=True)

    full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
    print(full_link_to_csv)
    transaction_csv_path = f'{data_path}/downloaded_file.csv'
    gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)

    # Service-account credentials are read from a JSON key file resolved
    # relative to the current working directory.
    SCOPES = ['https://www.googleapis.com/auth/drive']
    SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    download_files_from_folder(service, source_folder_with_reciepts, data_path)
    # NOTE(review): presumably converts the downloaded PDFs to images, judging
    # by the helper's name - confirm against pdfparser_hq.
    pdfs_folder_to_images(data_path)