File size: 6,338 Bytes
8381e8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import matplotlib.pyplot as plt
import os
import openslide
from PIL import Image
from openslide import OpenSlideError
from openslide.deepzoom import DeepZoomGenerator
import math
import random
from scipy.ndimage.morphology import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk
from concurrent.futures import ProcessPoolExecutor
import tqdm

class SlideProcessor:
    """Cut whole-slide images into DeepZoom tiles and save the tissue-rich ones.

    Attributes:
        tile_size: Edge length, in pixels, passed to ``DeepZoomGenerator`` and
            required of a tile for it to be considered by ``keep_tile``.
        overlap: Overlap, in pixels, passed to ``DeepZoomGenerator``.
        tissue_threshold: Minimum fraction of mask pixels that must be set for
            a tile to be kept (used by both checks in ``keep_tile``).
        max_workers: Worker count used for every thread pool and for the
            process pool in ``parallel_process``.
    """

    def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
        self.tile_size = tile_size
        self.overlap = overlap
        self.tissue_threshold = tissue_threshold
        self.max_workers = max_workers

    def optical_density(self, tile):
        """Return ``-log((tile + 1) / 240)`` computed element-wise in float64.

        The ``+ 1`` keeps the logarithm finite for zero-valued pixels.
        """
        as_float = tile.astype(np.float64)
        return -np.log((as_float + 1) / 240)

    def keep_tile(self, tile, tissue_threshold=None):
        """Decide whether a tile contains enough tissue to be kept.

        Args:
            tile: Image array; its first two dimensions must equal
                ``(tile_size, tile_size)`` or the tile is rejected.
            tissue_threshold: Minimum mask coverage in ``[0, 1]``. Defaults to
                ``self.tissue_threshold`` when ``None``.

        Returns:
            A truthy value only when BOTH masks cover at least
            ``tissue_threshold`` of the tile: one mask built from Canny edges
            of the inverted grayscale image, one from thresholded optical
            density. ``False`` for tiles of the wrong size.
        """
        if tissue_threshold is None:
            tissue_threshold = self.tissue_threshold

        # Partial tiles on the slide border are never kept.
        if tile.shape[0:2] != (self.tile_size, self.tile_size):
            return False

        # Check 1: edge-based mask.
        edge_mask = canny(1 - rgb2gray(tile))
        edge_mask = binary_closing(edge_mask, disk(10))
        edge_mask = binary_dilation(edge_mask, disk(10))
        edge_mask = binary_fill_holes(edge_mask)
        check1 = edge_mask.mean() >= tissue_threshold

        # Check 2: optical-density mask; a pixel counts when its smallest
        # channel density reaches beta.
        beta = 0.15
        od_mask = np.min(self.optical_density(tile), axis=2) >= beta
        od_mask = binary_closing(od_mask, disk(2))
        od_mask = binary_dilation(od_mask, disk(2))
        od_mask = binary_fill_holes(od_mask)
        check2 = od_mask.mean() >= tissue_threshold

        return check1 and check2

    def filter_tiles(self, tile_indices, generator):
        """Return the ``(col, row)`` addresses of the tiles accepted by ``keep_tile``.

        Args:
            tile_indices: Iterable of
                ``(tile_size, overlap, zoom_level, col, row)`` tuples.
            generator: Object exposing ``get_tile(level, (col, row))``.

        Returns:
            List of ``(col, row)`` tuples in the order of ``tile_indices``.
        """
        def process_tile(tile_index):
            _, _, zoom_level, col, row = tile_index
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            if self.keep_tile(tile, self.tissue_threshold):
                return col, row
            return None

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # executor.map yields results in input order.
            kept = [hit for hit in executor.map(process_tile, tile_indices) if hit is not None]
        return kept

    def get_tiles(self, samples, tile_indices, generator):
        """Load the tiles at positions ``samples`` of ``tile_indices``.

        Returns:
            List of ``(i, tile_array)`` pairs, one per entry of ``samples``.
        """
        tiles = []
        for i in samples:
            _, _, zoom_level, col, row = tile_indices[i]
            tiles.append((i, np.asarray(generator.get_tile(zoom_level, (col, row)))))
        return tiles

    def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
        """Write ``(i, tile)`` pairs to ``loc`` as ``{slide_num}_{i}.jpeg``.

        The target directory is created when it does not exist yet.
        """
        os.makedirs(loc, exist_ok=True)
        for i, tile in sample_tiles:
            file_path = os.path.join(loc, f"{slide_num}_{i}.jpeg")
            Image.fromarray(tile).save(file_path)

    def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc):
        """Fetch the tiles at the given addresses and save them as JPEG files.

        Args:
            samples: Iterable of ``(col, row)`` tile addresses.
            tile_indices: Iterable of
                ``(tile_size, overlap, zoom_level, col, row)`` tuples; an
                address missing from it is skipped.
            slide_num: Prefix of the output names
                (``{slide_num}_{col}_{row}.jpeg``).
            generator: Object exposing ``get_tile(level, (col, row))``.
            file: Unused; kept so existing callers keep working.
            loc: Existing directory that receives the files.
        """
        # Build the address lookup once instead of scanning tile_indices for
        # every tile; setdefault keeps the first match, like the scan did.
        index_by_address = {}
        for tile_index in tile_indices:
            index_by_address.setdefault((tile_index[3], tile_index[4]), tile_index)

        def save_tile(cord):
            x, y = cord
            tile_index = index_by_address.get((x, y))
            if tile_index is None:
                return
            zoom_level = tile_index[2]
            file_path = os.path.join(loc, f"{slide_num}_{x}_{y}.jpeg")
            try:
                tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
                Image.fromarray(tile).save(file_path)
            except (OSError, openslide.OpenSlideError) as err:
                # Report the failure and keep going with the remaining tiles.
                print(f"Could not save tile ({x}, {y}) of {slide_num}: {err}")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Drain the iterator: exceptions raised in workers are only
            # re-raised when their result is retrieved.
            for _ in executor.map(save_tile, samples):
                pass

    @staticmethod
    def _slide_id(file_loc):
        """Return the last 12 characters of the file name without its extension."""
        stem = os.path.splitext(os.path.basename(file_loc))[0]
        return stem[-12:]

    def process_one_slide(self, file_loc, output_dir=None):
        """Tile one slide, filter the tiles, and save the kept ones.

        Tiles are written to ``output_dir/<slide id>/``. Saving is skipped
        when that directory already holds roughly as many ``.jpeg`` files
        (within 5) as there are kept tiles.

        Args:
            file_loc: Path of the slide file.
            output_dir: Root output directory; required.

        Returns:
            The slide identifier used as the sub-directory name.

        Raises:
            TypeError: If ``output_dir`` is ``None``.
        """
        if output_dir is None:
            raise TypeError("output_dir must be a directory path, not None")

        # exist_ok avoids a check-then-create race between worker processes.
        os.makedirs(output_dir, exist_ok=True)

        slide = openslide.open_slide(file_loc)
        try:
            generator = DeepZoomGenerator(slide, tile_size=self.tile_size, overlap=self.overlap, limit_bounds=True)
            highest_zoom_level = generator.level_count - 1

            try:
                mag = int(slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
                offset = math.floor((mag / 20) / 2)
                zoom_level = highest_zoom_level - offset
            except (ValueError, KeyError):
                # Objective power missing or not an integer: use full resolution.
                zoom_level = highest_zoom_level

            cols, rows = generator.level_tiles[zoom_level]
            tile_indices = [
                (self.tile_size, self.overlap, zoom_level, col, row)
                for col in range(cols)
                for row in range(rows)
            ]

            filtered_tiles = self.filter_tiles(tile_indices, generator)

            # Derived from the base name so it is defined for every extension
            # and never contains a path separator.
            file = self._slide_id(file_loc)
            print(file)

            directory = os.path.join(output_dir, file)
            os.makedirs(directory, exist_ok=True)

            existing_files_count = sum(1 for f in os.listdir(directory) if f.endswith('.jpeg'))
            filtered_tiles_count = len(filtered_tiles)
            threshold = 5
            counts_match = abs(existing_files_count - filtered_tiles_count) <= threshold
            # An empty directory must not count as "already done" for a slide
            # that has only a handful of kept tiles.
            nothing_saved_yet = existing_files_count == 0 and filtered_tiles_count > 0

            if counts_match and not nothing_saved_yet:
                print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
            else:
                print('Now going to save tiles')
                self.get_save_tiles(filtered_tiles, tile_indices, file, generator, file, directory)
        finally:
            # Release the slide handle even when tiling or saving fails.
            slide.close()

        return file

    def parallel_process(self, base_dir='HNSC_DS', output_dir=None):
        """Run ``process_one_slide`` on every ``.svs`` file in ``base_dir``.

        Each slide is handled in its own worker process.

        Returns:
            List of slide identifiers, ordered by sorted file name.
        """
        # Sort because os.listdir returns entries in arbitrary order.
        files = sorted(os.path.join(base_dir, f) for f in os.listdir(base_dir) if f.endswith('.svs'))
        if not files:
            return []

        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            # output_dir is the same for every file, so it is repeated once per file.
            results = list(tqdm.tqdm(executor.map(self.process_one_slide, files, [output_dir] * len(files)), total=len(files)))

        return results