import re
import os
import signal
import logging
from time import sleep, time
from multiprocessing import JoinableQueue, Event, Process
from queue import Empty

logger = logging.getLogger(__name__)


def re_findall(pattern, string):
    """Like re.findall, but returns a dict of named groups for each match."""
    return [m.groupdict() for m in re.finditer(pattern, string)]
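
# Illustrative example (not in the original module): with named groups, each match
# becomes a dict keyed by group name.
#   >>> re_findall(r'(?P<key>\w+)=(?P<value>\w+)', 'a=1 b=2')
#   [{'key': 'a', 'value': '1'}, {'key': 'b', 'value': '2'}]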


class Task:
    """A deferred function call: stores a function and its arguments for a worker to run."""

    def __init__(self, function, *args, **kwargs) -> None:
        self.function = function
        self.args = args
        self.kwargs = kwargs

    def run(self):
        return self.function(*self.args, **self.kwargs)


class CallbackGenerator:
    """Wraps an iterable and calls `callback(item)` on each item just before yielding it."""

    def __init__(self, generator, callback):
        self.generator = generator
        self.callback = callback

    def __iter__(self):
        if self.callback is not None and callable(self.callback):
            for t in self.generator:
                self.callback(t)
                yield t
        else:
            yield from self.generator


def start_worker(q: JoinableQueue, stop_event: Event):  # TODO make class?
    logger.info('Starting worker...')
    while True:
        if stop_event.is_set():
            logger.info('Worker exiting because of stop_event')
            break
        # Use a timeout so we re-check 'stop_event' periodically even if the queue is empty
        try:
            task = q.get(timeout=.01)
        except Empty:
            # Run next iteration of loop
            continue

        # Exit if end of queue
        if task is None:
            logger.info('Worker exiting because of None on queue')
            q.task_done()
            break

        try:
            task.run()  # Do the task
        except BaseException:  # Intentionally broad: also catches KeyboardInterrupt
            logger.exception(f'Failed to process task {task}')
            # Some kind of retry handling could be implemented here
        finally:
            q.task_done()


class InterruptibleTaskPool:
    """Runs tasks on a pool of worker processes that shuts down cleanly on SIGINT/SIGTERM.

    Based on https://the-fonz.gitlab.io/posts/python-multiprocessing/
    """

    def __init__(self,
                 tasks=None,          # Iterable of Task objects to process
                 num_workers=None,    # Start this many worker processes (default: os.cpu_count())
                 callback=None,       # Fired for each task as it is taken off the task iterable
                 max_queue_size=1,    # Block when putting tasks on a queue larger than this
                 grace_period=2,      # Send SIGINT to workers still alive this many seconds after shutdown starts
                 kill_period=30,      # Send SIGKILL to workers still alive this many seconds after shutdown starts
                 ):

        self.tasks = CallbackGenerator(
            [] if tasks is None else tasks, callback)
        self.num_workers = os.cpu_count() if num_workers is None else num_workers

        self.max_queue_size = max_queue_size
        self.grace_period = grace_period
        self.kill_period = kill_period

        # The JoinableQueue has an internal counter that is incremented when an item is put on
        # the queue and decremented when q.task_done() is called. This allows us to wait until
        # it is empty using .join()
        self.queue = JoinableQueue(maxsize=self.max_queue_size)
        # Process-safe flag used to tell the workers to stop
        self.stop_event = Event()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # start() blocks until all tasks are processed, so there is nothing left to clean up
        pass

    def start(self) -> None:
        def handler(signalname):
            """
            Python 3.9 has `signal.strsignal(signalnum)` so this closure would not be needed.
            Also, 3.8 includes `signal.valid_signals()` that can be used to create a mapping for the same purpose.
            """
            def f(signal_received, frame):
                raise KeyboardInterrupt(f'{signalname} received')
            return f

        # This will be inherited by the child process if it is forked (not spawned)
        signal.signal(signal.SIGINT, handler('SIGINT'))
        signal.signal(signal.SIGTERM, handler('SIGTERM'))

        procs = []

        for i in range(self.num_workers):
            # Make it a daemon process so it is definitely terminated when this process exits,
            # might be overkill but is a nice feature. See
            # https://docs.python.org/3.8/library/multiprocessing.html#multiprocessing.Process.daemon
            p = Process(name=f'Worker-{i:02d}', daemon=True,
                        target=start_worker, args=(self.queue, self.stop_event))
            procs.append(p)
            p.start()

        try:
            # Put tasks on queue
            for task in self.tasks:
                logger.info(f'Put task {task} on queue')
                self.queue.put(task)

            # Put exit tasks on queue
            for i in range(self.num_workers):
                self.queue.put(None)

            # Wait until all tasks are processed
            self.queue.join()

        except KeyboardInterrupt:
            logger.warning('Caught KeyboardInterrupt! Setting stop event...')
            # raise # TODO add option
        finally:
            self.stop_event.set()
            t = time()
            # Send SIGINT if workers don't exit quickly enough, and SIGKILL as a last resort.
            # The kill_period check must come first: since kill_period > grace_period, testing
            # grace_period first would always win and SIGKILL would never be sent.
            # .is_alive() also implicitly joins the process (good practice in linux)
            while True:
                alive_procs = [p for p in procs if p.is_alive()]
                if not alive_procs:
                    break
                if time() > t + self.kill_period:
                    for p in alive_procs:
                        logger.warning(f'Sending SIGKILL to {p}')
                        # Queues and other inter-process communication primitives can break when
                        # a process is killed, but we don't care here
                        p.kill()
                elif time() > t + self.grace_period:
                    for p in alive_procs:
                        logger.warning(f'Sending SIGINT to {p}')
                        os.kill(p.pid, signal.SIGINT)
                sleep(.01)

            sleep(.1)
            for p in procs:
                logger.info(f'Process status: {p}')


def jaccard(x1, x2, y1, y2):
    """Jaccard-style overlap of the 1-D intervals [x1, x2] and [y1, y2].

    The denominator is the filled span from the lowest to the highest endpoint, so any gap
    between disjoint intervals counts towards the union.
    """
    intersection = max(0, min(x2, y2) - max(x1, y1))
    filled_union = max(x2, y2) - min(x1, y1)
    return intersection / filled_union if filled_union > 0 else 0
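
# Worked example (illustrative): [0, 2] and [1, 3] overlap on [1, 2] (length 1) and span
# [0, 3] (length 3), so jaccard(0, 2, 1, 3) == 1 / 3.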


def regex_search(text, pattern, group=1, default=None):
    """Return the given capture group of the first match of pattern in text, or default."""
    match = re.search(pattern, text)
    return match.group(group) if match else default
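

# Minimal usage sketch (not part of the original module). `_demo_square` is an illustrative
# helper, defined at module level so it remains picklable under the 'spawn' start method.
def _demo_square(n):
    logger.info(f'{n}^2 = {n * n}')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    demo_tasks = (Task(_demo_square, n) for n in range(10))
    # start() blocks until every task is processed, or until SIGINT/SIGTERM triggers
    # the graceful shutdown path above
    InterruptibleTaskPool(tasks=demo_tasks, num_workers=2).start()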