File size: 2,718 Bytes
a3290d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import subprocess

from keras import Model

# from keras.utils import multi_gpu_model
# from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model


def get_available_gpus(num_gpus: int = None):
    """Get gpu ids for gpus that are >95% free.

    Tensorflow does not support checking free memory on gpus.
    This is a crude method that relies on `nvidia-smi` to
    determine which gpus are occupied and which are free.

    Args:
        num_gpus: Number of requested gpus. If not specified,
            ids of all available gpu(s) are returned.

    Returns:
        List[int]: List of gpu ids that are free. Length
            will equal `num_gpus`, if specified.
    """
    # Built-in tensorflow gpu id.
    assert isinstance(num_gpus, (type(None), int))
    if num_gpus == 0:
        return [-1]

    num_requested_gpus = num_gpus
    try:
        num_gpus = (
            len(
                subprocess.check_output("nvidia-smi --list-gpus", shell=True)
                .decode()
                .split("\n")
            )
            - 1
        )

        out_str = subprocess.check_output("nvidia-smi | grep MiB", shell=True).decode()
    except subprocess.CalledProcessError:
        return None
    mem_str = [x for x in out_str.split() if "MiB" in x]
    # First 2 * num_gpu elements correspond to memory for gpus
    # Order: (occupied-0, total-0, occupied-1, total-1, ...)
    mems = [float(x[:-3]) for x in mem_str]
    gpu_percent_occupied_mem = [
        mems[2 * gpu_id] / mems[2 * gpu_id + 1] for gpu_id in range(num_gpus)
    ]

    available_gpus = [
        gpu_id for gpu_id, mem in enumerate(gpu_percent_occupied_mem) if mem < 0.05
    ]
    if num_requested_gpus and num_requested_gpus > len(available_gpus):
        raise ValueError(
            "Requested {} gpus, only {} are free".format(
                num_requested_gpus, len(available_gpus)
            )
        )

    return available_gpus[:num_requested_gpus] if num_requested_gpus else available_gpus


class ModelMGPU(Model):
    """Wrapper for distributing model across multiple gpus"""

    def __init__(self, ser_model, gpus):
        pmodel = multi_gpu_model(ser_model, gpus)  # noqa: F821
        self.__dict__.update(pmodel.__dict__)
        self._smodel = ser_model

    def __getattribute__(self, attrname):
        """Override load and save methods to be used from the serial-model. The
        serial-model holds references to the weights in the multi-gpu model.
        """
        # return Model.__getattribute__(self, attrname)
        if "load" in attrname or "save" in attrname:
            return getattr(self._smodel, attrname)

        return super(ModelMGPU, self).__getattribute__(attrname)