# omics-plip-1/scripts/slide_processor.py

import numpy as np
import os
import openslide
from PIL import Image
from openslide import OpenSlideError
from openslide.deepzoom import DeepZoomGenerator
import math
from scipy.ndimage import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk
from concurrent.futures import ProcessPoolExecutor
import tqdm
class SlideProcessor:
    """Tile whole-slide images (.svs) and keep only tiles that contain tissue."""

    def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
        self.tile_size = tile_size
        self.overlap = overlap
        self.tissue_threshold = tissue_threshold
        self.max_workers = max_workers
    def optical_density(self, tile):
        """Convert an RGB tile to optical density values."""
        tile = tile.astype(np.float64)
        od = -np.log((tile + 1) / 240)
        return od
    def keep_tile(self, tile, tissue_threshold=None):
        """Return True only if the tile passes both tissue checks below."""
        if tissue_threshold is None:
            tissue_threshold = self.tissue_threshold

        if tile.shape[0:2] == (self.tile_size, self.tile_size):
            tile_orig = tile

            # Check 1: fraction of the tile covered by edge-detected tissue regions.
            tile = rgb2gray(tile)
            tile = 1 - tile
            tile = canny(tile)
            tile = binary_closing(tile, disk(10))
            tile = binary_dilation(tile, disk(10))
            tile = binary_fill_holes(tile)
            percentage = tile.mean()
            check1 = percentage >= tissue_threshold

            # Check 2: fraction of pixels whose optical density exceeds beta in every channel.
            tile = self.optical_density(tile_orig)
            beta = 0.15
            tile = np.min(tile, axis=2) >= beta
            tile = binary_closing(tile, disk(2))
            tile = binary_dilation(tile, disk(2))
            tile = binary_fill_holes(tile)
            percentage = tile.mean()
            check2 = percentage >= tissue_threshold

            return check1 and check2
        else:
            return False
    def filter_tiles(self, tile_indices, generator):
        """Return the (col, row) coordinates of tiles that contain enough tissue."""
        filtered_tiles = []
        for tile_size, overlap, zoom_level, col, row in tile_indices:
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            if self.keep_tile(tile, self.tissue_threshold):
                filtered_tiles.append((col, row))
        return filtered_tiles
    def get_tiles(self, samples, tile_indices, generator):
        tiles = []
        for i in samples:
            tile_size, overlap, zoom_level, col, row = tile_indices[i]
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            tiles.append((i, tile))
        return tiles
    def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
        for sample in sample_tiles:
            i, tile = sample
            im = Image.fromarray(tile)
            fname = f"{slide_num}_{i}"
            file_path = os.path.join(loc, f"{fname}.jpeg")
            im.save(file_path)
    def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc=None):
        """Fetch each (col, row) tile in `samples` and save it as <slide_num>_<col>_<row>.jpeg under `loc`."""
        if loc is None:
            loc = f'/home/gp7/ml_pni/Dataset/tiles_1024/{file}'
        for i, cord in enumerate(samples):
            x, y = cord
            # tile_indices is only consulted for the zoom level, which is the same for every entry.
            zoom_level = tile_indices[i][2]
            tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
            im = Image.fromarray(tile)
            fname = f"{slide_num}_{x}_{y}"
            file_path = os.path.join(loc, f"{fname}.jpeg")
            im.save(file_path)
    def process_one_slide(self, file, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Tile one .svs slide, filter the tiles for tissue, and save the kept tiles to disk."""
        f2p = os.path.join(base_dir, f'{file}.svs')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        img1 = openslide.open_slide(f2p)
        generator = DeepZoomGenerator(img1, tile_size=self.tile_size, overlap=self.overlap, limit_bounds=True)
        highest_zoom_level = generator.level_count - 1

        # Step down from the highest zoom level so slides scanned at higher
        # objective power (e.g. 40x) are tiled at roughly 20x magnification.
        try:
            mag = int(img1.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
            offset = math.floor((mag / 20) / 2)
            level = highest_zoom_level - offset
        except (ValueError, KeyError):
            level = highest_zoom_level
        zoom_level = level

        cols, rows = generator.level_tiles[zoom_level]
        tile_indices = [(self.tile_size, self.overlap, zoom_level, col, row)
                        for col in range(cols) for row in range(rows)]
        # Reuse a previously computed tissue filter for this slide if one exists.
        filter_sname = os.path.join(output_dir, f'{file}_info.npy')
        if os.path.exists(filter_sname):
            try:
                filtered_tiles = np.load(filter_sname)
                print(f"Found existing filtered tiles for {file}, skipping tile filtering.")
            except Exception:
                print(f"Error reading {filter_sname}, re-filtering tiles.")
                filtered_tiles = self.filter_tiles(tile_indices, generator)
                np.save(filter_sname, filtered_tiles)
        else:
            print(f"Didn't find existing filtered tiles for {file}, filtering tiles.")
            filtered_tiles = self.filter_tiles(tile_indices, generator)
            np.save(filter_sname, filtered_tiles)
        directory = os.path.join(output_dir, file)
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Skip saving when roughly the right number of tiles is already on disk.
        existing_files_count = len([f for f in os.listdir(directory) if f.endswith('.jpeg')])
        filtered_tiles_count = len(filtered_tiles)
        threshold = 5
        if abs(existing_files_count - filtered_tiles_count) <= threshold:
            print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
        else:
            print('Now going to save tiles')
            self.get_save_tiles(filtered_tiles, tile_indices, file, generator, file, loc=directory)
        return file
    def parallel_process(self, files, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Process several slides in parallel, one worker process per slide."""
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(tqdm.tqdm(
                executor.map(self.process_one_slide, files, [base_dir] * len(files), [output_dir] * len(files)),
                total=len(files)))
        return results
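

# Example usage: a minimal sketch, not part of the original script. The slide IDs,
# base_dir, and output_dir below are placeholders; point base_dir at the folder
# holding your .svs files and output_dir at a writable location for the tiles.
if __name__ == '__main__':
    processor = SlideProcessor(tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=4)
    slide_ids = ['TCGA-EXAMPLE-0001', 'TCGA-EXAMPLE-0002']  # hypothetical slide file names (without the .svs extension)
    results = processor.parallel_process(slide_ids, base_dir='HNSC_DS', output_dir='tiles_1024_out')
    print(f"Finished processing {len(results)} slides.")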