import math
import os
import random
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import openslide
import tqdm
from PIL import Image
from openslide import OpenSlideError
from openslide.deepzoom import DeepZoomGenerator
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as F
# scipy.ndimage.morphology was deprecated in SciPy 1.8 and later removed;
# binary_fill_holes lives directly in scipy.ndimage.
from scipy.ndimage import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk


class SlideProcessor:
    """Extract, filter, and save tissue-containing tiles from whole-slide images.

    Tiles are produced with OpenSlide's DeepZoomGenerator, screened for tissue
    content with two independent masks (edge-based and optical-density-based),
    and written to disk as JPEG files. Filtered tile coordinates are cached as
    ``<slide>_info.npy`` so re-runs can skip the expensive filtering step.
    """

    def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65,
                 max_workers=30):
        """
        Args:
            tile_size: edge length (pixels) of the square tiles to extract.
            overlap: pixel overlap passed to DeepZoomGenerator.
            tissue_threshold: minimum fraction of tile area that must be
                classified as tissue by BOTH checks in keep_tile().
            max_workers: process count used by parallel_process().
        """
        self.tile_size = tile_size
        self.overlap = overlap
        self.tissue_threshold = tissue_threshold
        self.max_workers = max_workers

    def optical_density(self, tile):
        """Convert an RGB tile to optical density: OD = -log((I + 1) / 240).

        The +1 avoids log(0); 240 approximates the transmission level of the
        white slide background.

        Args:
            tile: RGB tile as an integer ndarray.

        Returns:
            Float64 ndarray of the same shape holding per-channel OD values.
        """
        tile = tile.astype(np.float64)
        return -np.log((tile + 1) / 240)

    def keep_tile(self, tile, tissue_threshold=None):
        """Decide whether a tile contains enough tissue to keep.

        Two masks are computed and BOTH must cover at least
        ``tissue_threshold`` of the tile:
          1. edge mask: grayscale -> invert -> Canny edges, then
             close/dilate/fill-holes;
          2. optical-density mask: min OD across channels >= 0.15, then the
             same morphological cleanup with smaller structuring elements.

        Args:
            tile: RGB tile as an ndarray of shape (H, W, 3).
            tissue_threshold: override for self.tissue_threshold.

        Returns:
            True when the tile is exactly tile_size x tile_size and passes
            both tissue checks; False otherwise.
        """
        if tissue_threshold is None:
            tissue_threshold = self.tissue_threshold

        # DeepZoom edge tiles can be smaller than tile_size; reject them.
        if tile.shape[0:2] != (self.tile_size, self.tile_size):
            return False

        tile_orig = tile

        # Check 1: tissue fraction from an edge-detection mask.
        gray = 1 - rgb2gray(tile)
        mask = canny(gray)
        mask = binary_closing(mask, disk(10))
        mask = binary_dilation(mask, disk(10))
        mask = binary_fill_holes(mask)
        check1 = mask.mean() >= tissue_threshold

        # Check 2: tissue fraction from an optical-density mask.
        od = self.optical_density(tile_orig)
        beta = 0.15  # minimum per-pixel OD for "stained tissue"
        mask = np.min(od, axis=2) >= beta
        mask = binary_closing(mask, disk(2))
        mask = binary_dilation(mask, disk(2))
        mask = binary_fill_holes(mask)
        check2 = mask.mean() >= tissue_threshold

        return check1 and check2

    def filter_tiles(self, tile_indices, generator):
        """Return the (col, row) coordinates of tiles that pass keep_tile().

        Args:
            tile_indices: iterable of (tile_size, overlap, zoom_level, col, row).
            generator: DeepZoomGenerator for the slide.
        """
        filtered_tiles = []
        for tile_size, overlap, zoom_level, col, row in tile_indices:
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            if self.keep_tile(tile, self.tissue_threshold):
                filtered_tiles.append((col, row))
        return filtered_tiles

    def get_tiles(self, samples, tile_indices, generator):
        """Fetch tiles by position in tile_indices.

        Args:
            samples: iterable of integer indices into tile_indices.
            tile_indices: list of (tile_size, overlap, zoom_level, col, row).
            generator: DeepZoomGenerator for the slide.

        Returns:
            List of (index, tile ndarray) pairs.
        """
        tiles = []
        for i in samples:
            tile_size, overlap, zoom_level, col, row = tile_indices[i]
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            tiles.append((i, tile))
        return tiles

    def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
        """Save (index, tile) pairs as '<slide_num>_<index>.jpeg' under loc."""
        for i, tile in sample_tiles:
            im = Image.fromarray(tile)
            file_path = os.path.join(loc, f"{slide_num}_{i}.jpeg")
            im.save(file_path)

    def get_save_tiles(self, samples, tile_indices, slide_num, generator,
                       file, loc=None):
        """Fetch tiles at (col, row) coordinates and save them as JPEGs.

        Args:
            samples: iterable of (col, row) pairs (e.g. from filter_tiles()).
            tile_indices: full tile index list; only the zoom level — shared
                by every entry — is read from it.
            slide_num: slide identifier used in output file names.
            generator: DeepZoomGenerator for the slide.
            file: subdirectory name used only when loc is None.
            loc: output directory; defaults to the hard-coded dataset path.
        """
        if loc is None:
            loc = f'/home/gp7/ml_pni/Dataset/tiles_1024/{file}'
        os.makedirs(loc, exist_ok=True)  # robustness: ensure target exists
        if not len(samples):
            return
        # BUGFIX: the original indexed tile_indices[i] where i was a position
        # in `samples`, not in `tile_indices` — only the (constant) zoom level
        # was actually used, and it raised IndexError for long sample lists.
        # Read the shared zoom level once instead.
        zoom_level = tile_indices[0][2]
        for x, y in samples:
            tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
            im = Image.fromarray(tile)
            file_path = os.path.join(loc, f"{slide_num}_{x}_{y}.jpeg")
            im.save(file_path)

    def process_one_slide(self, file, base_dir='HNSC_DS',
                          output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Filter and save all tissue tiles for one slide.

        Caches filtered tile coordinates at '<output_dir>/<file>_info.npy' and
        skips re-saving when '<output_dir>/<file>/' already holds roughly as
        many JPEGs as there are filtered tiles.

        Args:
            file: slide name without the '.svs' extension.
            base_dir: directory containing the '.svs' slide files.
            output_dir: root directory for per-slide tile folders.

        Returns:
            The slide name `file` (handy for progress reporting).
        """
        f2p = os.path.join(base_dir, f'{file}.svs')
        os.makedirs(output_dir, exist_ok=True)

        img1 = openslide.open_slide(f2p)
        generator = DeepZoomGenerator(img1, tile_size=self.tile_size,
                                      overlap=self.overlap, limit_bounds=True)
        highest_zoom_level = generator.level_count - 1

        # Aim for ~20x magnification: drop one DeepZoom level per factor of
        # two above 20x objective power. Fall back to the highest level when
        # the objective-power property is missing or malformed.
        try:
            mag = int(img1.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
            offset = math.floor((mag / 20) / 2)
            level = highest_zoom_level - offset
        except (ValueError, KeyError):
            level = highest_zoom_level
        zoom_level = level

        cols, rows = generator.level_tiles[zoom_level]
        tile_indices = [(self.tile_size, self.overlap, zoom_level, col, row)
                        for col in range(cols) for row in range(rows)]

        filter_sname = os.path.join(output_dir, f'{file}_info.npy')
        if os.path.exists(filter_sname):
            try:
                filtered_tiles = np.load(filter_sname)
                print(f"Found existing filtered tiles for {file}, skipping tile filtering.")
            except Exception:  # narrowed from bare except: keep Ctrl-C working
                print(f"Error reading {filter_sname}, re-filtering tiles.")
                filtered_tiles = self.filter_tiles(tile_indices, generator)
                np.save(filter_sname, filtered_tiles)
        else:
            print(f"Didn't find existing filtered tiles for {file}, filtering tiles.")
            filtered_tiles = self.filter_tiles(tile_indices, generator)
            np.save(filter_sname, filtered_tiles)

        directory = os.path.join(output_dir, file)
        os.makedirs(directory, exist_ok=True)

        existing_files_count = len([f for f in os.listdir(directory)
                                    if f.endswith('.jpeg')])
        filtered_tiles_count = len(filtered_tiles)
        threshold = 5  # tolerate a few missing/extra files before re-saving
        if abs(existing_files_count - filtered_tiles_count) <= threshold:
            print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
        else:
            print('Now going to save tiles')
            # BUGFIX: the original passed `directory` positionally into the
            # `file` parameter, leaving loc=None, so tiles were written under
            # f'/home/gp7/ml_pni/Dataset/tiles_1024/{directory}' — an absolute
            # path glued into another — and the count check above never
            # matched. Pass the target directory as loc explicitly.
            self.get_save_tiles(filtered_tiles, tile_indices, file, generator,
                                file, loc=directory)
        return file

    def parallel_process(self, files, base_dir='HNSC_DS',
                         output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Run process_one_slide over `files` in a process pool.

        Args:
            files: list of slide names (without '.svs').
            base_dir: directory containing the slide files.
            output_dir: root directory for the tile output.

        Returns:
            List of processed slide names, with a tqdm progress bar shown.
        """
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(tqdm.tqdm(
                executor.map(self.process_one_slide, files,
                             [base_dir] * len(files),
                             [output_dir] * len(files)),
                total=len(files)))
        return results