"""Project setup: stage input images in the working directory and load record CSVs."""
import os
import sys
import inspect
import shutil
import warnings
from dataclasses import dataclass, field

import pandas as pd

# Put the directory above this file's directory, and that directory's parent,
# on sys.path BEFORE the project-local imports below are resolved.
currentdir = os.path.dirname(os.path.dirname(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)
sys.path.append(currentdir)

from vouchervision.general_utils import import_csv, import_tsv, bcolors
from vouchervision.general_utils import Print_Verbose, print_main_warn, print_main_success, make_file_names_valid, make_images_in_dir_vertical
from vouchervision.utils_GBIF import generate_image_filename
from vouchervision.download_from_GBIF_all_images_in_file import download_all_images_from_GBIF_LM2
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import fitz

# File extensions treated as images (matched case-sensitively).
_VALID_EXTENSIONS = ('.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG',
                     '.bmp', '.BMP', '.tif', '.TIF', '.tiff', '.TIFF')


def convert_pdf_to_jpg(source_pdf, destination_dir, dpi=100):
    """Render every page of a PDF to its own JPG file.

    Output files are written to ``destination_dir`` and named
    ``<pdf stem>__<10000 + page number>.jpg`` with 1-based page numbers,
    so the first page becomes ``<stem>__10001.jpg``.

    Args:
        source_pdf: Path to the PDF file.
        destination_dir: Directory that receives the rendered pages.
        dpi: Resolution passed to the page renderer.

    Returns:
        The number of pages in the document.
    """
    stem = os.path.splitext(os.path.basename(source_pdf))[0]
    doc = fitz.open(source_pdf)
    try:
        n_pages = len(doc)
        for page_num in range(n_pages):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            output_filename = f"{stem}__{10000 + page_num + 1}.jpg"
            pix.save(os.path.join(destination_dir, output_filename))
    finally:
        # Release the document even when rendering or saving a page fails.
        doc.close()
    return n_pages


@dataclass
class Project_Info():
    """Collects a project's images and, optionally, its record CSV files.

    Construction copies the input images into ``Dirs.save_original``
    (converting PDFs to JPGs), optionally renames them, loads the configured
    CSV files, converts every image to ``.jpg`` and builds ``project_data``,
    a dict keyed by image file stem.
    """
    batch_size: int = 50

    image_location: str = ''

    dir_images: str = ''

    project_data: object = field(init=False)
    project_data_list: object = field(init=False)

    path_csv_combined: str = ''
    path_csv_occ: str = ''
    path_csv_img: str = ''
    csv_combined: str = ''
    csv_occ: str = ''
    csv_img: str = ''

    Dirs: object = field(init=False)

    # Shadowed by the ``has_valid_images`` property defined below.
    has_valid_images: bool = True

    def __init__(self, cfg, logger, dir_home, Dirs) -> None:
        """Stage the images and metadata for a project.

        Args:
            cfg: Configuration mapping; settings are read from
                ``cfg['leafmachine']['project']``.
            logger: Logger; its ``name`` is set to ``'Project Info'``.
            dir_home: Not used by the currently enabled code paths.
            Dirs: Object whose ``save_original`` attribute is the working
                image directory.
        """
        self.Dirs = Dirs
        logger.name = 'Project Info'
        logger.info("Gathering Images and Image Metadata")
        self.logger = logger

        project_cfg = cfg['leafmachine']['project']
        self.batch_size = project_cfg['batch_size']
        self.image_location = project_cfg['image_location']

        self.valid_extensions = list(_VALID_EXTENSIONS)

        self.copy_images_to_project_dir(project_cfg['dir_images_local'], Dirs)
        self.make_file_names_custom(Dirs.save_original, cfg, Dirs)

        # A local project expects a directory of images plus either
        # images.csv and occ.csv, or a combined.csv.
        # NOTE: the `image_location` check ('local' vs 'GBIF') and the GBIF
        # download/import branch are disabled; local import always runs.
        self.__import_local_files(cfg, logger, Dirs)

        self.__make_project_dict(Dirs)

        # Make sure image file names are legal
        make_file_names_valid(Dirs.save_original, cfg)

        # Make all images vertical
        make_images_in_dir_vertical(Dirs.save_original, cfg)

    @property
    def has_valid_images(self):
        """True when ``dir_images`` holds at least one file with a valid extension."""
        return self.check_for_images()

    @property
    def file_ext(self):
        """String representation of the list of accepted image extensions."""
        return f"{list(_VALID_EXTENSIONS)}"

    def check_for_images(self):
        """Return True if any file in ``self.dir_images`` has a valid extension.

        Returns False when ``dir_images`` is unset, empty, or not an existing
        directory instead of raising.
        """
        if not self.dir_images or not os.path.isdir(self.dir_images):
            return False
        extensions = tuple(self.valid_extensions)
        return any(name.endswith(extensions) for name in os.listdir(self.dir_images))

    def remove_non_numbers(self, s):
        """Return ``s`` with every non-digit character removed."""
        return ''.join(char for char in s if char.isdigit())

    def copy_images_to_project_dir(self, dir_images, Dirs):
        """Copy the input files into ``Dirs.save_original``.

        PDFs are rendered page by page to JPG files; every other file is
        copied with its extension lower-cased. Entries that are not regular
        files (e.g. sub-directories) are skipped.

        Args:
            dir_images: Source directory.
            Dirs: Object whose ``save_original`` is the destination directory.
        """
        files = os.listdir(dir_images)
        progress = tqdm(
            files,
            desc=f'{bcolors.HEADER} Copying images to working directory{bcolors.ENDC}',
            colour="white",
            position=0,
            total=len(files),
        )
        for file in progress:
            source = os.path.join(dir_images, file)
            if not os.path.isfile(source):
                # shutil.copy cannot copy a directory; ignore it.
                continue

            filename, ext = os.path.splitext(file)
            ext_lower = ext.lower()

            if ext_lower == '.pdf':
                n_pages = convert_pdf_to_jpg(source, Dirs.save_original)
                self.logger.info(f"Converted {n_pages} pages to JPG from PDF: {file}")
            else:
                destination = os.path.join(Dirs.save_original, f"{filename}{ext_lower}")
                shutil.copy(source, destination)

    def make_file_names_custom(self, dir_images, cfg, Dirs):
        """Rename the files in ``dir_images`` according to the project settings.

        The steps are applied in order, each on the result of the previous one:
        remove ``prefix_removal``, remove ``suffix_removal`` (both via
        ``str.replace``, so every occurrence is removed), then keep only the
        digits when ``catalog_numerical_only`` is set. A file is left
        untouched, with a warning, when the new name already exists.

        Args:
            dir_images: Directory whose files are renamed in place.
            cfg: Configuration mapping (``cfg['leafmachine']['project']``).
            Dirs: Unused; kept for interface compatibility.
        """
        project_cfg = cfg['leafmachine']['project']
        prefix = project_cfg['prefix_removal']
        suffix = project_cfg['suffix_removal']
        numerical_only = project_cfg['catalog_numerical_only']

        def warn_unchanged():
            warnings.warn("WARNING: duplicate file names will result given the current selections for 'prefix_removal', 'suffix_removal', or 'catalog_numerical_only'. Change them before continuing.")
            warnings.warn("The affected file name has not been changed.")

        files = os.listdir(dir_images)
        progress = tqdm(
            files,
            desc=f'{bcolors.HEADER} Creating Catalog Number from file name{bcolors.ENDC}',
            colour="green",
            position=0,
            total=len(files),
        )
        for file in progress:
            if not (numerical_only or prefix or suffix):
                continue

            name_cleaned = Path(file).stem
            ext = Path(file).suffix
            if prefix:
                name_cleaned = name_cleaned.replace(prefix, "")
            if suffix:
                name_cleaned = name_cleaned.replace(suffix, "")
            if numerical_only:
                name_cleaned = self.remove_non_numbers(name_cleaned)

            name_new = ''.join([name_cleaned, ext])
            if name_new == file:
                continue

            target = os.path.join(dir_images, name_new)
            # os.rename silently overwrites an existing file on POSIX, so the
            # collision has to be detected explicitly to avoid losing images.
            if os.path.exists(target):
                warn_unchanged()
                continue
            try:
                os.rename(os.path.join(dir_images, file), target)
            except OSError:
                warn_unchanged()

    def __create_combined_csv(self):
        """Merge the images and occurrence tables and write the result to disk.

        ``gbifID`` and ``identifier`` in the images table are renamed to
        ``gbifID_images`` and ``url``; the tables are merged on the GBIF id;
        ``generate_image_filename`` is applied to each merged row and columns
        7, 0 and 1 of its output are prepended as ``fullname``,
        ``filename_image`` and ``filename_image_jpg``. The CSV is written
        next to the occurrence file.

        Returns:
            Path of the combined CSV that was written.
        """
        self.csv_img = self.csv_img.rename(columns={"gbifID": "gbifID_images"})
        self.csv_img = self.csv_img.rename(columns={"identifier": "url"})

        combined = pd.merge(self.csv_img, self.csv_occ, left_on='gbifID_images', right_on='gbifID')

        names_list = combined.apply(generate_image_filename, axis=1, result_type='expand')

        selected_columns = names_list.iloc[:, [7, 0, 1]]
        selected_columns.columns = ['fullname', 'filename_image', 'filename_image_jpg']

        self.csv_combined = pd.concat([selected_columns, combined], axis=1)

        occ_stem = os.path.basename(self.path_csv_occ).split('.')[0]
        img_stem = os.path.basename(self.path_csv_img).split('.')[0]
        new_name = ''.join(['combined_', occ_stem, '_', img_stem, '.csv'])
        self.path_csv_combined = os.path.join(os.path.dirname(self.path_csv_occ), new_name)
        self.csv_combined.to_csv(self.path_csv_combined, mode='w', header=True, index=False)
        return self.path_csv_combined

    def __import_local_files(self, cfg, logger, Dirs):
        """Set ``dir_images`` and load the combined / occurrence / images CSVs.

        Each CSV is optional: a ``None`` path leaves both the path and the
        table set to ``None``. CSV import is best-effort; a failure is logged
        and does not abort project setup.
        """
        project_cfg = cfg['leafmachine']['project']

        # Images
        if project_cfg['dir_images_local'] is None:
            self.dir_images = None
        else:
            self.dir_images = Dirs.save_original

        # CSV import
        try:
            # Combined
            self.path_csv_combined = project_cfg['path_combined_csv_local']
            self.csv_combined = (
                None if self.path_csv_combined is None else import_csv(self.path_csv_combined)
            )

            # Occurrence
            self.path_csv_occ = project_cfg['path_occurrence_csv_local']
            self.csv_occ = None if self.path_csv_occ is None else import_csv(self.path_csv_occ)

            # Images/metadata
            self.path_csv_img = project_cfg['path_images_csv_local']
            self.csv_img = None if self.path_csv_img is None else import_csv(self.path_csv_img)

            # Create combined if it's missing
            if self.csv_combined is None:
                # NOTE(review): this branch only runs when a combined path was
                # configured but import_csv returned None; it looks like the
                # intent was to test the occurrence/images CSVs - confirm.
                if project_cfg['path_combined_csv_local'] is not None:
                    logger.info('Combined CSV file not provided, creating it now...')
                    location = self.__create_combined_csv()
                    logger.info(''.join(['Combined CSV --> ', location]))
                else:
                    logger.info('Combined CSV file not available or provided. Skipped record import.')
            else:
                logger.info(''.join(['Combined CSV --> ', self.path_csv_combined]))
        except Exception as err:
            # Records are optional, so keep going - but leave a trace.
            logger.warning(f'Could not import project CSV files: {err}')

        logger.info(''.join(['Image Directory --> ', Dirs.save_original]))

    # NOTE: importing files after a GBIF download (via
    # download_all_images_from_GBIF_LM2) is currently disabled.

    def process_in_batches(self, cfg):
        """Split ``project_data`` into consecutive batches.

        The batches are stored in ``self.project_data_list`` as dicts that
        preserve the key order of ``project_data``. No trailing empty batch is
        produced when the number of images is an exact multiple of the batch
        size; an empty project still yields a single empty batch.

        Args:
            cfg: Configuration mapping; the batch size is read from
                ``cfg['leafmachine']['project']['batch_size']``.

        Returns:
            Tuple ``(number of batches, number of images)``.
        """
        batch_size = cfg['leafmachine']['project']['batch_size']
        keys = list(self.project_data.keys())
        # Ceiling division, but never fewer than one batch.
        num_batches = max(1, (len(keys) + batch_size - 1) // batch_size)
        self.project_data_list = [
            {key: self.project_data[key] for key in keys[i * batch_size:(i + 1) * batch_size]}
            for i in range(num_batches)
        ]
        return num_batches, len(self.project_data)

    def _move_to_invalid_dir(self, Dirs, file_name):
        """Move ``file_name`` from ``Dirs.save_original`` to the sibling ``INVALID_FILES`` directory.

        The move is skipped when a file of that name is already there.
        """
        invalid_dir = os.path.join(os.path.dirname(Dirs.save_original), 'INVALID_FILES')
        os.makedirs(invalid_dir, exist_ok=True)
        destination = os.path.join(invalid_dir, file_name)
        if not os.path.exists(destination):
            shutil.move(os.path.join(Dirs.save_original, file_name), destination)

    def __make_project_dict(self, Dirs):
        """Build ``project_data`` and normalise every image to ``.jpg``.

        Every entry of ``Dirs.save_original`` with a valid extension gets an
        empty dict in ``project_data`` keyed by its stem. Images that are not
        already ``.jpg`` are converted to RGB and saved as ``<stem>.jpg``; the
        original is then moved to ``INVALID_FILES`` rather than deleted.
        Entries without a valid extension are moved there as well.
        """
        self.project_data = {}
        for img in os.listdir(Dirs.save_original):
            img_split, ext = os.path.splitext(img)
            if ext not in self.valid_extensions:
                self._move_to_invalid_dir(Dirs, img)
                continue

            needs_conversion = ext != '.jpg'
            with Image.open(os.path.join(Dirs.save_original, img)) as im:
                if needs_conversion:
                    new_img_name = ''.join([img_split, '.jpg'])
                    im.convert('RGB').save(os.path.join(Dirs.save_original, new_img_name), quality=100)

            # Move only after the handle is closed; moving an open file fails on Windows.
            if needs_conversion:
                self._move_to_invalid_dir(Dirs, img)

            self.project_data[img_split] = {}

    def add_records_to_project_dict(self):
        """Attach a ``'GBIF_Record'`` entry to every JPG in the working directory.

        The record is looked up in the combined CSV by file stem, using the
        same stem rule as ``project_data``; it is ``None`` when the lookup
        fails for any reason. An image without a ``project_data`` entry gets
        one created.
        """
        for img in os.listdir(self.Dirs.save_original):
            if not img.endswith((".jpg", ".jpeg")):
                continue
            img_name = os.path.splitext(img)[0]
            try:
                record = self.__get_data_from_combined(img_name)
            except Exception:
                # Best-effort: a missing or malformed combined CSV means no record.
                record = None
            self.project_data.setdefault(img_name, {})['GBIF_Record'] = record

    def __get_data_from_combined(self, img_name):
        """Return the first combined-CSV row whose ``filename_image`` equals ``img_name``, as a dict."""
        df = pd.DataFrame(self.csv_combined)
        matches = df[df['filename_image'] == img_name]
        return matches.head(1).to_dict()


class Project_Stats():
    """Holder for project-level counters."""
    specimens = 0
    rulers = 0

    def __init__(self, cfg, logger, dir_home) -> None:
        """Set the logger name and log the start message; ``cfg`` and ``dir_home`` are unused."""
        logger.name = 'Project Info'
        logger.info("Gathering Images and Image Metadata")