# llava-1-5 / utils_finetuning / imgs_checking_in_dataset.py
# GDinesh's picture
# Duplicate from saurabh-straive/llava-1-5
# b2c8b1d verified
"""
Check the images in a dataset directory for unusual file sizes or dimensions.
"""
import os
from PIL import Image
import pandas as pd
import argparse
from tqdm import tqdm
from typing import Union
def main(args):
    """Collect file size and pixel dimensions for the images in a directory.

    Scans the entries directly inside ``args.dataset_path`` (no recursion),
    keeps those whose name ends in ``.jpg`` or ``.png`` (case-sensitive), and
    records for each one its path, its size on disk in bytes, its PIL
    ``img.size`` and a 0/1 flag telling whether its pixel area is below
    100 * 100. Files that cannot be opened as images are reported and
    skipped instead of aborting the whole scan.

    If ``args.create_dataframe`` is truthy, the collected records are written
    to that path as a CSV file. Summary statistics of the file sizes are
    always printed.

    Args:
        args: Namespace with ``dataset_path`` (directory to scan) and
            ``create_dataframe`` (CSV output path, or a falsy value to skip
            writing the report).
    """
    dataset_path = args.dataset_path
    small_area_limit = 100 * 100  # images with fewer pixels are flagged as small

    fpaths = []  # full path of every image scanned
    sizes = []  # file size on disk, in bytes
    dimensions = []  # PIL `img.size` tuples
    size_less_than_100X100 = []  # 1 if pixel area < 100 * 100 else 0

    for filename in tqdm(os.listdir(dataset_path)):
        # Case-sensitive match: names such as `.JPG` or `.jpeg` are not scanned.
        if not filename.endswith((".jpg", ".png")):
            continue

        image_path = os.path.join(dataset_path, filename)
        try:
            with Image.open(image_path) as img:
                dim = img.size
            file_size = os.path.getsize(image_path)
        except OSError as err:
            # PIL signals missing or undecodable files with OSError (its
            # UnidentifiedImageError is a subclass); one bad file should not
            # abort the scan of the rest of the dataset.
            print(f"Skipping unreadable image {image_path}: {err}")
            continue

        # Append only after every read succeeded so all columns stay aligned.
        fpaths.append(image_path)
        sizes.append(file_size)
        dimensions.append(dim)
        size_less_than_100X100.append(1 if dim[0] * dim[1] < small_area_limit else 0)

    if args.create_dataframe:
        df = pd.DataFrame({
            "fpath": fpaths,
            "img_size": sizes,
            "dimensions": dimensions,
            "small_size": size_less_than_100X100,
        })
        df.to_csv(args.create_dataframe, index=False)
        print(f"Dataframe saved at {args.create_dataframe}.")

    # Count / mean / min / max / quartiles of the file sizes.
    print(pd.Series(sizes).describe())
if __name__ == "__main__":
    # Command-line entry point: parse the two options and hand them to main().
    # --create-dataframe defaults to "report_imgs_size.csv", so a CSV path is
    # always supplied unless the caller overrides it.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dataset-path",
        type=str,
        required=True,
        help="Path of the dataset of images to be checked.",
    )
    cli.add_argument(
        "--create-dataframe",
        type=str,
        default="report_imgs_size.csv",
        help="Name of the dataframe if you want to create.",
    )
    main(cli.parse_args())