Spaces:

bgsys
/

background-removal-arena

Running

App Files Files Community

tdurbor committed on Dec 10, 2024

Commit

fe62bf5

1 Parent(s): 892f774

Consolidate image preparation pipeline

Browse files

Files changed (9) hide show

image_processing_pipeline.py +118 -0
utils/add_green_background.py +27 -17
utils/bria_rmbg20.py +1 -1
utils/photoroom.py +2 -2
utils/remove_backgrounds.py +0 -66
utils/removebg.py +1 -1
utils/resize_images.py +21 -3
utils/resize_processed_images.py +41 -14
utils/upload-to-dataset.py +0 -84

image_processing_pipeline.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import os
+import argparse
+import shutil
+import sys
+from dotenv import load_dotenv, find_dotenv
+# Importing modules from the utils package
+from utils.resize_images import main as resize_images_main
+from utils.removebg import iterate_over_directory as removebg_iterate
+from utils.photoroom import iterate_over_directory as photoroom_iterate
+from utils.bria_rmbg20 import iterate_over_directory as bria_iterate
+from utils.add_green_background import process_directory as add_green_background_process
+from utils.upload_to_dataset import upload_to_dataset
+from utils.resize_processed_images import process_images
+def check_env_variables():
+    """Check if the necessary environment variables are loaded."""
+    if not find_dotenv():
+        sys.exit("Error: .env file not found.")
+    load_dotenv()
+    required_keys = ['REMOVEBG_API_KEY', 'PHOTOROOM_API_KEY', 'BRIA_API_TOKEN']
+    missing_keys = [key for key in required_keys if not os.getenv(key)]
+    if missing_keys:
+        sys.exit(f"Error: Missing environment variables: {', '.join(missing_keys)}")
+def copy_images(source_dir, dest_dir):
+    os.makedirs(dest_dir, exist_ok=True)
+    valid_extensions = ('.png', '.jpg', '.jpeg')
+    # Walk through the source directory
+    for root, _, files in os.walk(source_dir):
+        for filename in files:
+            if filename.lower().endswith(valid_extensions):
+                source_file = os.path.join(root, filename)
+                # Extract the folder name
+                folder_name = os.path.basename(root)
+                # Append folder name to the filename
+                new_filename = f"{folder_name}_{filename}"
+                dest_file = os.path.join(dest_dir, new_filename)
+                # Check if the file is an image and doesn't already exist in the destination
+                if os.path.isfile(source_file) and not os.path.exists(dest_file):
+                    shutil.copy2(source_file, dest_file)
+                    print(f"Copied: {new_filename}")
+                else:
+                    print(f"Skipped: {filename} (already exists or not a file)")
+def main():
+    check_env_variables()
+    parser = argparse.ArgumentParser(description="Image Processing Pipeline")
+    parser.add_argument("--input-dir", type=str, default="original-images", help="Input directory for images")
+    parser.add_argument("--work-dir", type=str, default="workdir", help="Working directory for intermediate images")
+    parser.add_argument("--output-dir", type=str, default="final-images", help="Output directory for final images")
+    parser.add_argument("--dataset-name", type=str, help="Name of the dataset to upload to Hugging Face Hub")
+    parser.add_argument("--push-dataset", action="store_true", help="Push the dataset to the Hugging Face Hub")
+    args = parser.parse_args()
+    # Define intermediate directories within the work directory
+    input_resized_dir = os.path.join(args.work_dir, "resized")
+    bg_removed_dir = os.path.join(args.work_dir, "background-removed")
+    green_bg_dir = os.path.join(args.work_dir, "green-background")
+    # Ensure all directories exist
+    for directory in [input_resized_dir, bg_removed_dir, green_bg_dir]:
+        os.makedirs(directory, exist_ok=True)
+    # Step 4: Move images to final output directory
+    print("Moving images to final output directory...")
+    original_images_dir = os.path.join(args.work_dir, "merged-categories")
+    copy_images(args.input_dir, original_images_dir)
+    # Step 1: Resize images
+    print("Resizing images...")
+    resize_images_main(input_directory=original_images_dir, output_directory=input_resized_dir)
+    # Step 2: Remove background
+    print("Removing backgrounds...")
+    bg_removal_dirs = {
+        "removebg": os.path.join(bg_removed_dir, "removebg"),
+        "photoroom": os.path.join(bg_removed_dir, "photoroom"),
+        "bria": os.path.join(bg_removed_dir, "bria")
+    }
+    for dir_path in bg_removal_dirs.values():
+        os.makedirs(dir_path, exist_ok=True)
+    removebg_iterate(input_resized_dir, bg_removal_dirs["removebg"])
+    photoroom_iterate(input_resized_dir, bg_removal_dirs["photoroom"])
+    bria_iterate(input_resized_dir, bg_removal_dirs["bria"])
+    print("Adding green background...")
+    add_green_background_process(bg_removed_dir, green_bg_dir)
+    print("Resizing processed images...")
+    target_width = 800
+    subdirectories = ["bria", "photoroom", "clipdrop", "removebg"]
+    os.makedirs(args.output_dir, exist_ok=True)
+    for subdir in subdirectories:
+        input_directory = os.path.join(green_bg_dir, subdir)
+        output_directory = os.path.join(args.output_dir, subdir)
+        process_images(input_directory, output_directory, target_width)
+    original_output_directory = os.path.join(args.output_dir, "web-original-images")
+    process_images(original_images_dir, original_output_directory, target_width)
+    if args.dataset_name:
+        upload_to_dataset(original_output_directory, args.output_dir, args.dataset_name, dry_run=not args.push_dataset)
+    else:
+        print("Please provide a dataset name using --dataset-name")
+if __name__ == "__main__":
+    main()

utils/add_green_background.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 from PIL import Image
 def add_green_background_to_image(image_path, output_path, background_color=(0, 255, 0)):
     """Add a green background to an image and save it as PNG."""
@@ -9,28 +10,37 @@ def add_green_background_to_image(image_path, output_path, background_color=(0,
         combined = Image.alpha_composite(background, img)
         combined.save(output_path, "PNG")
 def process_directory(input_dir, output_dir, background_color=(0, 255, 0)):
     """Recursively process a directory to add a green background to all images and convert them to PNG."""
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    for root, _, files in os.walk(input_dir):
-        for file in files:
-            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
-                input_path = os.path.join(root, file)
-                relative_path = os.path.relpath(input_path, input_dir)
-                output_path = os.path.join(output_dir, os.path.splitext(relative_path)[0] + '.png')
-                # Ensure the output directory exists
-                os.makedirs(os.path.dirname(output_path), exist_ok=True)
-                # Check if the output file already exists
-                if not os.path.exists(output_path):
-                    # Add green background to the image and convert to PNG
-                    add_green_background_to_image(input_path, output_path, background_color)
-                    print(f"Processed: {input_path} -> {output_path}")
-                else:
-                    print(f"Skipped: {output_path} already exists")
 # Example usage
 input_directory = "../../background-removal-arena-v0/train/data/resized"

 import os
 from PIL import Image
+from concurrent.futures import ThreadPoolExecutor
 def add_green_background_to_image(image_path, output_path, background_color=(0, 255, 0)):
     """Add a green background to an image and save it as PNG."""
         combined = Image.alpha_composite(background, img)
         combined.save(output_path, "PNG")
+def process_image_file(input_path, output_path, background_color):
+    """Process a single image file to add a green background."""
+    if not os.path.exists(output_path):
+        add_green_background_to_image(input_path, output_path, background_color)
+        print(f"Processed: {input_path} -> {output_path}")
+    else:
+        print(f"Skipped: {output_path} already exists")
 def process_directory(input_dir, output_dir, background_color=(0, 255, 0)):
     """Recursively process a directory to add a green background to all images and convert them to PNG."""
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
+    # Futures are kept so that worker errors can be surfaced after the pool shuts down.
+    tasks = []
+    with ThreadPoolExecutor() as executor:
+        for root, _, files in os.walk(input_dir):
+            for file in files:
+                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
+                    input_path = os.path.join(root, file)
+                    # Mirror the input's sub-folder layout under output_dir, always with a .png suffix.
+                    relative_path = os.path.relpath(input_path, input_dir)
+                    output_path = os.path.join(output_dir, os.path.splitext(relative_path)[0] + '.png')
+                    # Ensure the output directory exists
+                    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+                    # Submit the task to the executor
+                    tasks.append(executor.submit(process_image_file, input_path, output_path, background_color))
+    # Leaving the `with` block has already waited for every task; calling
+    # result() re-raises the first exception a worker hit, if any.
+    for task in tasks:
+        task.result()
 # Example usage
 input_directory = "../../background-removal-arena-v0/train/data/resized"

utils/bria_rmbg20.py CHANGED Viewed

@@ -51,7 +51,7 @@ def iterate_over_directory(directory_path, result_directory):
                     file_path = os.path.join(root, file)
                     result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
-                    result_file_directory = os.path.join(result_directory, os.path.basename(root))
                     if not os.path.exists(result_file_directory):
                         os.makedirs(result_file_directory)

                     file_path = os.path.join(root, file)
                     result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
+                    result_file_directory = os.path.join(result_directory)
                     if not os.path.exists(result_file_directory):
                         os.makedirs(result_file_directory)

utils/photoroom.py CHANGED Viewed

@@ -41,8 +41,8 @@ def iterate_over_directory(directory_path, result_directory):
                     file_path = os.path.join(root, file)
                     result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
-                    result_file_directory = os.path.join(result_directory, os.path.basename(root))
                     if not os.path.exists(result_file_directory):
                         os.makedirs(result_file_directory)

                     file_path = os.path.join(root, file)
                     result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
+                    result_file_directory = os.path.join(result_directory)
                     if not os.path.exists(result_file_directory):
                         os.makedirs(result_file_directory)

utils/remove_backgrounds.py DELETED Viewed

@@ -1,66 +0,0 @@
-import os
-from photoroom import process_image as photoroom_process
-from removebg import process_image as removebg_process
-#from clipdrop import process_image as clipdrop_process
-from bria_rmbg20 import process_image as bria_process
-def create_directory(path):
-    if not os.path.exists(path):
-        os.makedirs(path)
-def process_images(input_directory, output_directory, process_function, limit=None):
-    count = 0
-    for root, _, files in os.walk(input_directory):
-        for file in files:
-            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.webp', '.heic')):
-                file_path = os.path.join(root, file)
-                result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
-                result_file_directory = os.path.join(output_directory)
-                if not os.path.exists(result_file_directory):
-                    os.makedirs(result_file_directory)
-                result_path = os.path.join(result_file_directory, result_file_name)
-                if not os.path.exists(result_path):  # Check if the image has already been processed
-                    print(file_path, result_path)
-                    process_function(file_path, result_path)
-                    count += 1
-                    if limit and count >= limit:
-                        return
-def main(dry_run=False):
-    input_directory = "../data/resized-original-images"
-    output_base_directory = "../data/processed"
-    # Define output directories for each API
-    output_directories = {
-        "photoroom": os.path.join(output_base_directory, "photoroom"),
-        "removebg": os.path.join(output_base_directory, "removebg"),
-        #"clipdrop": os.path.join(output_base_directory, "clipdrop"),
-        "bria": os.path.join(output_base_directory, "bria")
-    }
-    # Create output directories if they don't exist
-    for directory in output_directories.values():
-        create_directory(directory)
-    if dry_run:
-        print("Starting dry run...")
-        k = 5
-        process_images(input_directory, output_directories["photoroom"], photoroom_process, limit=k)
-        process_images(input_directory, output_directories["removebg"], removebg_process, limit=k)
-        #process_images(input_directory, output_directories["clipdrop"], clipdrop_process, limit=k)
-        process_images(input_directory, output_directories["bria"], bria_process, limit=k)
-        print("Dry run completed.")
-    else:
-        print("Starting full processing...")
-        process_images(input_directory, output_directories["photoroom"], photoroom_process)
-        process_images(input_directory, output_directories["removebg"], removebg_process)
-        #process_images(input_directory, output_directories["clipdrop"], clipdrop_process)
-        process_images(input_directory, output_directories["bria"], bria_process)
-        print("Full processing completed.")
-if __name__ == "__main__":
-    # Set dry_run to True for a dry run, or False for full processing
-    main(dry_run=False)

utils/removebg.py CHANGED Viewed

@@ -41,7 +41,7 @@ def iterate_over_directory(directory_path, result_directory):
                 file_path = os.path.join(root, file)
                 result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
-                result_file_directory = os.path.join(result_directory, os.path.basename(root))
                 if not os.path.exists(result_file_directory):
                     os.makedirs(result_file_directory)

                 file_path = os.path.join(root, file)
                 result_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.png'
+                result_file_directory = os.path.join(result_directory)
                 if not os.path.exists(result_file_directory):
                     os.makedirs(result_file_directory)

utils/resize_images.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from PIL import Image
 import concurrent.futures
 # Define the directories
@@ -11,6 +11,24 @@ os.makedirs(output_directory, exist_ok=True)
 def resize_image(input_path, output_path):
     with Image.open(input_path) as img:
         # Calculate the current megapixels
         current_megapixels = (img.width * img.height) / 1_000_000
         max_megapixels = 10
@@ -27,7 +45,7 @@ def resize_image(input_path, output_path):
             # If the image is smaller than 10 megapixels, save it as is
             img.save(output_path)
-def main():
     # Iterate over the input directory
     with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
         for filename in os.listdir(input_directory):
@@ -44,4 +62,4 @@ def main():
     print("All images have been resized and saved to the output directory.")
 if __name__ == "__main__":
-    main()

 import os
+from PIL import Image, ExifTags
 import concurrent.futures
 # Define the directories
 def resize_image(input_path, output_path):
     with Image.open(input_path) as img:
+        # Correct image orientation using EXIF data
+        try:
+            for orientation in ExifTags.TAGS.keys():
+                if ExifTags.TAGS[orientation] == 'Orientation':
+                    break
+            exif = img._getexif()
+            if exif is not None:
+                orientation = exif.get(orientation, None)
+                if orientation == 3:
+                    img = img.rotate(180, expand=True)
+                elif orientation == 6:
+                    img = img.rotate(270, expand=True)
+                elif orientation == 8:
+                    img = img.rotate(90, expand=True)
+        except (AttributeError, KeyError, IndexError):
+            # Cases: image doesn't have getexif
+            pass
         # Calculate the current megapixels
         current_megapixels = (img.width * img.height) / 1_000_000
         max_megapixels = 10
             # If the image is smaller than 10 megapixels, save it as is
             img.save(output_path)
+def main(input_directory, output_directory):
     # Iterate over the input directory
     with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
         for filename in os.listdir(input_directory):
     print("All images have been resized and saved to the output directory.")
 if __name__ == "__main__":
+    main(input_directory, output_directory)

utils/resize_processed_images.py CHANGED Viewed

@@ -1,13 +1,37 @@
-from PIL import Image
 import os
 def create_directory(path):
     """Create a directory if it doesn't exist."""
     os.makedirs(path, exist_ok=True)
 def resize_image(input_path, output_path, target_width):
     """Resize an image to the target width while maintaining aspect ratio."""
     with Image.open(input_path) as img:
         # Calculate the new height to maintain the aspect ratio
         width_percent = target_width / img.width
         target_height = int(img.height * width_percent)
@@ -18,23 +42,26 @@ def resize_image(input_path, output_path, target_width):
         # Save the resized image in the same format as the input
         img.save(output_path, format=img.format)
 def process_images(input_directory, output_directory, target_width):
     """Process and resize images from the input directory to the output directory."""
     create_directory(output_directory)
-    for root, _, files in os.walk(input_directory):
-        for file in files:
-            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.webp', '.heic')):
-                file_path = os.path.join(root, file)
-                result_file_name = os.path.splitext(file)[0] + os.path.splitext(file)[1]
-                result_path = os.path.join(output_directory, result_file_name)
-                # Check if the output file already exists
-                if not os.path.exists(result_path):
-                    print(f"Resizing {file_path} to {result_path}")
-                    resize_image(file_path, result_path, target_width)
-                else:
-                    print(f"Skipped {file_path}, already resized.")
 def main():
     """Main function to resize images in specified subdirectories."""

+from PIL import Image, ExifTags
 import os
+from concurrent.futures import ThreadPoolExecutor
 def create_directory(path):
     """Create a directory if it doesn't exist."""
     os.makedirs(path, exist_ok=True)
+def correct_orientation(img):
+    """Correct image orientation using EXIF data."""
+    try:
+        for orientation in ExifTags.TAGS.keys():
+            if ExifTags.TAGS[orientation] == 'Orientation':
+                break
+        exif = img._getexif()
+        if exif is not None:
+            orientation = exif.get(orientation, None)
+            if orientation == 3:
+                img = img.rotate(180, expand=True)
+            elif orientation == 6:
+                img = img.rotate(270, expand=True)
+            elif orientation == 8:
+                img = img.rotate(90, expand=True)
+    except (AttributeError, KeyError, IndexError):
+        # Cases: image doesn't have getexif
+        pass
+    return img
 def resize_image(input_path, output_path, target_width):
     """Resize an image to the target width while maintaining aspect ratio."""
     with Image.open(input_path) as img:
+        # Correct orientation
+        img = correct_orientation(img)
         # Calculate the new height to maintain the aspect ratio
         width_percent = target_width / img.width
         target_height = int(img.height * width_percent)
         # Save the resized image in the same format as the input
         img.save(output_path, format=img.format)
+def process_image_file(file_path, result_path, target_width):
+    """Process a single image file."""
+    if not os.path.exists(result_path):
+        print(f"Resizing {file_path} to {result_path}")
+        resize_image(file_path, result_path, target_width)
+    else:
+        print(f"Skipped {file_path}, already resized.")
 def process_images(input_directory, output_directory, target_width):
     """Process and resize images from the input directory to the output directory."""
     create_directory(output_directory)
+    with ThreadPoolExecutor() as executor:
+        for root, _, files in os.walk(input_directory):
+            for file in files:
+                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
+                    file_path = os.path.join(root, file)
+                    result_file_name = os.path.splitext(file)[0] + os.path.splitext(file)[1]
+                    result_path = os.path.join(output_directory, result_file_name)
+                    executor.submit(process_image_file, file_path, result_path, target_width)
 def main():
     """Main function to resize images in specified subdirectories."""

utils/upload-to-dataset.py DELETED Viewed

@@ -1,84 +0,0 @@
-from datasets import Dataset, Features, Value, Image
-from huggingface_hub import HfApi
-import os
-from collections import defaultdict
-import pandas as pd
-import argparse
-def upload_to_dataset(image_dir, dataset_name):
-    # Define the dataset features with dedicated columns for each model
-    features = Features({
-        "original_image": Image(),  # Original image feature
-        "clipdrop_image": Image(),  # Clipdrop segmented image
-        "bria_image": Image(),      # Bria segmented image
-        "photoroom_image": Image(), # Photoroom segmented image
-        "removebg_image": Image(),  # RemoveBG segmented image
-        "original_filename": Value("string")  # Original filename
-    })
-    # Load image paths and metadata
-    data = defaultdict(lambda: {
-        "clipdrop_image": None,
-        "bria_image": None,
-        "photoroom_image": None,
-        "removebg_image": None
-    })
-    # Walk into the web-original-images folder
-    web_original_images_dir = os.path.join(image_dir, "web-original-images")
-    for root, _, files in os.walk(web_original_images_dir):
-        for f in files:
-            if f.endswith(('.png', '.jpg', '.jpeg')):
-                original_image_path = os.path.join(root, f)
-                data[f]["original_image"] = original_image_path
-                data[f]["original_filename"] = f
-                # Check for corresponding images in other directories
-                for source in ["clipdrop", "bria", "photoroom", "removebg"]:
-                    # Check for processed images ending in .png or .jpg
-                    for ext in ['.png', '.jpg']:
-                        processed_image_filename = os.path.splitext(f)[0] + ext
-                        source_image_path = os.path.join(image_dir, source, processed_image_filename)
-                        if os.path.exists(source_image_path):
-                            data[f][f"{source}_image"] = source_image_path
-                            break  # Stop checking other extensions if a file is found
-    # Convert the data to a dictionary of lists
-    dataset_dict = {
-        "original_image": [],
-        "clipdrop_image": [],
-        "bria_image": [],
-        "photoroom_image": [],
-        "removebg_image": [],
-        "original_filename": []
-    }
-    for filename, entry in data.items():
-        if "original_image" in entry:
-            dataset_dict["original_image"].append(entry["original_image"])
-            dataset_dict["clipdrop_image"].append(entry["clipdrop_image"])
-            dataset_dict["bria_image"].append(entry["bria_image"])
-            dataset_dict["photoroom_image"].append(entry["photoroom_image"])
-            dataset_dict["removebg_image"].append(entry["removebg_image"])
-            dataset_dict["original_filename"].append(filename)
-    # Save the data dictionary to a CSV file for inspection
-    df = pd.DataFrame.from_dict(dataset_dict)
-    df.to_csv("image_data.csv", index=False)
-    # Create a Dataset
-    dataset = Dataset.from_dict(dataset_dict, features=features)
-    # Push the dataset to Hugging Face Hub
-    api = HfApi()
-    dataset.push_to_hub(dataset_name, token=api.token)
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Upload images to a Hugging Face dataset.")
-    parser.add_argument("image_dir", type=str, help="Directory containing the images.")
-    parser.add_argument("dataset_name", type=str, help="Name of the dataset to upload to Hugging Face Hub.")
-    args = parser.parse_args()
-    upload_to_dataset(args.image_dir, args.dataset_name)