pipeline-parallelism-with-controllable-memory

Sleeping

App Files Files Community

Nyamdavaa Amar committed on Jun 12, 2024

Commit

3d4d40d

1 Parent(s): f8e95f6

Pipeline Parallelism with Controllable Memory

Browse files

Files changed (11) hide show

README.md +4 -10
adaptive_schedule.py +627 -0
app.py +152 -96
auto_schedule.py +0 -564
description1.md +3 -9
description2.md +5 -32
interleaved_variant.py +107 -0
schedule1f1bv.py +271 -0
svg_event.py +1 -1
type2.py +163 -0
v_schedule.py +0 -474

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Zero Bubble Pipeline Parallellism
 emoji: 🏆
 colorFrom: indigo
 colorTo: red
@@ -11,14 +11,8 @@ license: apache-2.0
 ---
-# Zero Bubble Pipeline Parallelism
-Zero Bubble Pipeline Parallelism is a novel pipeline parallelism algorithm able to reduce the bubble of pipeline parallelism to almost zero while preserving synchronous semantics.
-Check out our paper at:
-* [Arxiv Version with ZBV](https://arxiv.org/abs/2401.10241)
-* [ICLR Accepted version with ZB1P and ZB2P](https://openreview.net/pdf?id=tuzTN0eIO5)
-Try out our implementation based on Megatron on [https://github.com/sail-sg/zero-bubble-pipeline-parallelism](https://github.com/sail-sg/zero-bubble-pipeline-parallelism)
-Experiments shows zero bubble pipeline parallelism can accelerate training up to 30% with a similar memory comsumption. A detailed table of experiments is coming soon.

 ---
+title: Pipeline Parallelism with Controllable Memory
 emoji: 🏆
 colorFrom: indigo
 colorTo: red
 ---
+# Pipeline Parallelism with Controllable Memory
+Check out our paper at [Arxiv](https://arxiv.org/abs/2405.15362).
+Bubble Rate here is calculated as (longest stage time/(F+B+W)/m - 1).

adaptive_schedule.py ADDED Viewed

	@@ -0,0 +1,627 @@

+pattern_size = 6
+from collections import Counter
+from dataclasses import dataclass
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    # Immutable record for one scheduled pass on one pipeline stage.
+    # Pass kind; transform_schedule stores the upper-cased letter ('F', 'B' or 'W').
+    type: str
+    # Chunk marker; transform_schedule passes a truthy value for 'f', 'B' and 'W' cells.
+    chunk: int
+    # Index of the pipeline stage (row of the schedule) the pass runs on.
+    stage: int
+    # Microbatch index: how many passes with the same letter came earlier on this stage.
+    minibatch: int
+    # Start time of the pass (completion time minus the cost of the pass).
+    start_time: int
+    # Time at which the pass finishes.
+    completion_time: int
+def transform_schedule(schedule, f, b, w, c):
+    """Convert a character schedule into timed ``ScheduledNode`` rows.
+
+    Args:
+        schedule: one sequence per stage whose cells are single characters
+            from "FfBbWw"; whitespace cells are idle slots and are skipped.
+        f: cost of an 'F' / 'f' pass.
+        b: cost of a 'B' / 'b' pass.
+        w: cost of a 'W' / 'w' pass.
+        c: delay added when a pass waits on the same pass of a neighbouring stage.
+
+    Returns:
+        A list with one list of ``ScheduledNode`` per stage, in the order the
+        passes appear on that stage. An empty schedule yields ``[]``.
+    """
+    stage_order = []
+    # (stage, letter, microbatch) -> (letter, microbatch) executed right before it
+    # on the same stage.
+    local_prev = {}
+    for sid, stage in enumerate(schedule):
+        counter = Counter()
+        order = []
+        for p in stage:
+            if not p.strip():
+                continue
+            # The microbatch index is the number of earlier cells with this letter.
+            mb = counter.get(p, 0)
+            if order:
+                local_prev[(sid, p, mb)] = order[-1]
+            order.append((p, mb))
+            counter.update(p)
+        stage_order.append(order)
+    time_map = {}
+    cost = {
+        'F': f,
+        'B': b,
+        'W': w,
+        'f': f,
+        'b': b,
+        'w': w,
+    }
+    def get_time(stage, kind, mb):
+        # Memoised completion time of one pass.
+        key = (stage, kind, mb)
+        if key in time_map:
+            return time_map[key]
+        time = 0
+        if key in local_prev:
+            time = get_time(stage, *local_prev[key])
+        # 'F'/'B' wait for the same pass on the previous stage,
+        # 'f'/'b' wait for the same pass on the next stage.
+        if kind in ('F', 'B') and stage > 0:
+            time = max(time, get_time(stage - 1, kind, mb) + c)
+        if kind in ('f', 'b') and stage + 1 < len(schedule):
+            time = max(time, get_time(stage + 1, kind, mb) + c)
+        time_map[key] = time + cost[kind]
+        return time_map[key]
+    result = []
+    for sid, order in enumerate(stage_order):
+        result.append([
+            ScheduledNode(
+                p.upper(),
+                # Stored as an int to match the dataclass annotation.
+                int(p in ('f', 'B', 'W')),
+                sid,
+                mb,
+                get_time(sid, p, mb) - cost[p],
+                get_time(sid, p, mb),
+            )
+            for p, mb in order
+        ])
+    return result
+def evaluate_schedule(schedule, f, b, w, c):
+    """Score a character schedule by its longest per-stage span.
+
+    For every stage the span runs from the start of its first 'F' to the
+    completion of its last 'W' or 'w' microbatch; the largest span is returned.
+
+    Args:
+        schedule: one sequence per stage whose cells are single characters
+            from "FfBbWw"; whitespace cells are idle slots and are skipped.
+        f: cost of an 'F' / 'f' pass.
+        b: cost of a 'B' / 'b' pass.
+        w: cost of a 'W' / 'w' pass.
+        c: delay added when a pass waits on the same pass of a neighbouring stage.
+
+    Returns:
+        The largest span over all stages, or 0 for an empty schedule.
+    """
+    if len(schedule) == 0:
+        return 0
+    # (stage, letter, microbatch) -> (letter, microbatch) executed right before it
+    # on the same stage.
+    local_prev = {}
+    counter = Counter()
+    for sid, stage in enumerate(schedule):
+        counter = Counter()
+        previous = None
+        for p in stage:
+            if not p.strip():
+                continue
+            mb = counter.get(p, 0)
+            if previous is not None:
+                local_prev[(sid, p, mb)] = previous
+            previous = (p, mb)
+            counter.update(p)
+    # The microbatch count is read from the last stage's letter counts.
+    nmb = max(counter.values())
+    time_map = {}
+    cost = {
+        'F': f,
+        'B': b,
+        'W': w,
+        'f': f,
+        'b': b,
+        'w': w,
+    }
+    def get_time(stage, kind, mb):
+        # Memoised completion time of one pass.
+        key = (stage, kind, mb)
+        if key in time_map:
+            return time_map[key]
+        time = 0
+        if key in local_prev:
+            time = get_time(stage, *local_prev[key])
+        # 'F'/'B' wait for the same pass on the previous stage,
+        # 'f'/'b' wait for the same pass on the next stage.
+        if kind in ('F', 'B') and stage > 0:
+            time = max(time, get_time(stage - 1, kind, mb) + c)
+        if kind in ('f', 'b') and stage + 1 < len(schedule):
+            time = max(time, get_time(stage + 1, kind, mb) + c)
+        time_map[key] = time + cost[kind]
+        return time_map[key]
+    r = 0
+    for sid in range(len(schedule)):
+        r = max(get_time(sid, 'W', nmb - 1) - get_time(sid, 'F', 0) + f, r)
+        r = max(get_time(sid, 'w', nmb - 1) - get_time(sid, 'F', 0) + f, r)
+    return r
+def get_pattern_str(pos):
+    """Render slot positions as a pattern string of ``pattern_size`` characters.
+
+    ``pos[i]`` is the slot taken by the i-th letter of "FfBbWw"; negative
+    entries are skipped. Slots nobody claims stay as spaces.
+    """
+    slots = [" " for _ in range(pattern_size)]
+    for letter_idx, slot in enumerate(pos):
+        if slot >= 0:
+            slots[slot] = "FfBbWw"[letter_idx]
+    return "".join(slots)
+def get_peak_mem(schedules, return_all=False):
+    """Return the peak count of outstanding forward passes.
+
+    Walking each row left to right, 'F'/'f' add one and 'W'/'w' remove one;
+    the peak of a row is the highest running total (never below 0).
+
+    Returns the list of per-row peaks when ``return_all`` is true, otherwise
+    the largest peak over all rows (0 when there are no rows).
+    """
+    peaks = []
+    for row in schedules:
+        level = 0
+        highest = 0
+        for cell in row:
+            if cell in "Ff":
+                level += 1
+            elif cell in "Ww":
+                level -= 1
+            if level > highest:
+                highest = level
+        peaks.append(highest)
+    if return_all:
+        return peaks
+    return max(peaks, default=0)
+debug = False
+def print_schedules(schedules):
+    """Print every schedule row on its own line, but only when ``debug`` is set."""
+    if debug:
+        for row in schedules:
+            print("".join(row))
+def calc_bubble(schedules):
+    """Count the idle slots of every stage.
+
+    For stage ``i`` the result is (column after its last non-space cell)
+    minus (number of non-space cells) minus ``i``.
+    """
+    bubbles = []
+    for stage, row in enumerate(schedules):
+        busy_cols = [col for col, cell in enumerate(row) if cell != ' ']
+        span = busy_cols[-1] + 1 if busy_cols else 0
+        bubbles.append(span - len(busy_cols) - stage)
+    return bubbles
+def init_repeated_schedule(p, m, patterns):
+    """Build one row per stage by repeating its pattern string.
+
+    Each of the ``p`` rows is the pattern string of ``patterns[stage]``
+    repeated ``4 * p + m + 1`` times, returned as a list of characters.
+    """
+    repeats = 4 * p + m + 1
+    return [list(get_pattern_str(patterns[stage]) * repeats) for stage in range(p)]
+def clear_invalid(repeated, stage, pos, offset=-1):
+    """Blank every ``pattern_size``-th cell of a row, starting at ``pos``.
+
+    Moves in steps of ``offset * pattern_size`` until the position leaves the
+    row. Mutates ``repeated`` in place and returns it.
+    """
+    while True:
+        if pos < 0 or pos >= len(repeated[stage]):
+            return repeated
+        repeated[stage][pos] = ' '
+        pos += offset * pattern_size
+def clear_invalid_index(repeated, m):
+    # Trim the repeated pattern rows: for each letter handled below, pick one
+    # occurrence per stage and blank the same-phase cells before it and from
+    # m pattern lengths after it onwards. Mutates `repeated` and returns it.
+    # NOTE(review): this leaves m occurrences per letter only if every row is a
+    # pattern repeated every pattern_size columns (as init_repeated_schedule builds).
+    p = len(repeated)
+    # Search cursor shared by all letters and stages; it only ever moves right.
+    index = pattern_size
+    for identifier in ['F', 'f', 'B', 'b']:
+        # 'F'/'B' visit stages top-down, 'f'/'b' visit them bottom-up.
+        if identifier in ['F', 'B']:
+            _iter = range(p)
+        else:
+            _iter = range(p - 1, -1, -1)
+        for i in _iter:
+            # Look at up to pattern_size cells, advancing the cursor, for this letter.
+            for j in range(pattern_size):
+                if repeated[i][index] == identifier:
+                    # Blank the earlier repeats and the repeats from the m-th one on.
+                    clear_invalid(repeated, i, index - pattern_size, offset=-1)
+                    clear_invalid(repeated, i, index + pattern_size * m, offset=1)
+                    index += 1
+                    if identifier in ['B', 'b']:
+                        # Trim the matching 'W'/'w' the same way, anchored at its
+                        # first occurrence after the backward pass.
+                        w_identifier = {'B': 'W', 'b': 'w'}[identifier]
+                        for k in range(pattern_size):
+                            if repeated[i][index + k] == w_identifier:
+                                clear_invalid(repeated, i, index + k - pattern_size, offset=-1)
+                                clear_invalid(repeated, i, index + k + pattern_size * m, offset=1)
+                                break
+                    break
+                index += 1
+    return repeated
+def process_warmup_without_increasing_peak_mem(schedules, m):
+    """
+    FFFFFFFFFF     fBWfBWfBWfBWfBW  b
+     FFFFFFFFF    f fBWfBWfBWfBWFBWb
+      FFFFFFFF   f f fBWfBWfBWFBW b
+       FFFFFFF  f f f fBWfBWFBW Bb
+        FFFFFF f f f f fBWFBWFBWb
+         FFFFFfFf f f f  BWFBW b
+          FFFfFfFfFf f    BW Bb
+           FfFfFfFfFfF     BWb
+    We reorganize the warmup phase in the following way (i -> pipeline stage from 0):
+        1. Before the first B, we set #f = min(i+1, peak_mem//2), #F = peak_mem - #f
+        2. Before the first b, #f = peak_mem//2
+        3. The offset between the first B is 1
+        4. Before the first b, we use the pattern of (BWf)*j + (BWF)*k,
+           where j = max(0, peak_mem//2 - (i+1)), k = max(0, #W - j - 1)
+    """
+    # process warmup phase (before the first b)
+    p = len(schedules)
+    peak_mem = get_peak_mem(schedules)
+    # The budget used for the warmup counts is capped at 2 * p.
+    peak_mem = min(peak_mem, 2 * p)
+    # Per-stage target counts of 'F' (cnt_f) and 'f' (cnt_ff), each capped at m.
+    cnt_f, cnt_ff = [], []
+    for i in range(p):
+        cc_ff = min(i + 1, peak_mem // 2)
+        cc_ff = min(cc_ff, m)
+        cc_f = min(peak_mem - cc_ff, m)
+        cnt_f.append(cc_f)
+        cnt_ff.append(cc_ff)
+    # Column gap on the last stage between its first 'B' and the first 'b'
+    # after it; stays 0 when either is missing.
+    distance_b2bb = 0
+    for j in range(len(schedules[p - 1])):
+        if schedules[p - 1][j] == 'B':
+            for k in range(j, len(schedules[p - 1])):
+                if schedules[p - 1][k] == 'b':
+                    distance_b2bb = k - j
+                    break
+            break
+    for i in range(p):
+        # Pass 1: count and blank the cells that will be laid out again below.
+        c_f, c_ff, c_b, c_w = 0, 0, 0, 0
+        for j in range(len(schedules[i])):
+            char = schedules[i][j]
+            if char == 'F':
+                c_f += 1
+            elif char == 'f':
+                c_ff += 1
+            elif char == 'B':
+                c_b += 1
+            elif char == 'W':
+                c_w += 1
+            elif char == 'b':
+                # First 'b' of the row: from here on cells are removed only
+                # selectively, then the scan of this row stops.
+                bj = j
+                while j < len(schedules[i]):
+                    char = schedules[i][j]
+                    if char == 'f' and c_ff < cnt_ff[p - 1]:
+                        schedules[i][j] = ' '
+                        c_ff += 1
+                    if char == 'B' and c_b < c_ff:
+                        if c_b < (2 * (p - i) + distance_b2bb) // 3 or c_b < cnt_ff[p - 1] - cnt_ff[i]:
+                            # there is empty space, or the number of B is not enough to cover extra f
+                            schedules[i][j] = ' '
+                            c_b += 1
+                    if char == 'W' and c_w < c_b:
+                        if c_w < (2 * (p - i) + distance_b2bb - 1) // 3 or c_w < cnt_ff[p - 1] - cnt_ff[i]:
+                            # there is empty space, or the number of W is not enough to cover extra f
+                            schedules[i][j] = ' '
+                            c_w += 1
+                    j += 1
+                # Second sweep from the first 'b' onwards, this time collecting 'F'.
+                j = bj
+                while j < len(schedules[i]):
+                    if schedules[i][j] == 'F':
+                        if c_f < c_ff or c_f < cnt_f[i] or c_f - cnt_f[i] + c_ff - cnt_ff[i] < c_w - 1:
+                            # put enough F, or there are some unused BW
+                            schedules[i][j] = ' '
+                            c_f += 1
+                    j += 1
+                break
+            else:
+                assert char == ' '
+            # Every cell before the first 'b' is erased (the break above skips
+            # this line for the 'b' itself).
+            schedules[i][j] = ' '
+        assert c_f >= cnt_f[i] and c_ff >= cnt_ff[i]
+        assert c_w >= cnt_ff[p - 1] - cnt_ff[i] and c_b >= cnt_ff[p - 1] - cnt_ff[i]
+        # Pass 2: write the collected passes back, starting at column i.
+        j = i
+        u_f, u_ff, u_b, u_w = 0, 0, 0, 0
+        # Leading run of 'F' only; a column stays blank once the quota is used up.
+        for _ in range(2 * (p - 1 - i)):
+            if u_f < cnt_f[i] and u_f < c_f:
+                schedules[i][j] = 'F'
+                u_f += 1
+            j += 1
+        # Then i + 1 pairs of columns alternating 'F' and 'f'.
+        for _ in range(i + 1):
+            if u_f < cnt_f[i] and u_f < c_f:
+                schedules[i][j] = 'F'
+                u_f += 1
+            j += 1
+            if u_ff < cnt_ff[i] and u_ff < c_ff:
+                schedules[i][j] = 'f'
+                u_ff += 1
+            j += 1
+        # Everything left is emitted as repeating column triples: B, W, then f or F.
+        while u_f < c_f or u_ff < c_ff or u_b < c_b or u_w < c_w:
+            if u_b < c_b:
+                schedules[i][j] = 'B'
+                u_b += 1
+            j += 1
+            if u_w < c_w:
+                schedules[i][j] = 'W'
+                u_w += 1
+            j += 1
+            if u_ff < c_ff:
+                assert u_ff < u_f
+                schedules[i][j] = 'f'
+                u_ff += 1
+            elif u_f < c_f:
+                schedules[i][j] = 'F'
+                u_f += 1
+            j += 1
+    return schedules
+def squeeze_without_change_order(schedules, m):
+    """Shift every pass as far left as possible without reordering a stage.
+
+    Columns are visited left to right. In each column the upper-case cells
+    are placed walking stages top-down, then the lower-case cells walking
+    stages bottom-up. 'F'/'B' must land after the same microbatch on the
+    previous stage and 'f'/'b' after the same microbatch on the next stage;
+    'W'/'w' and the passes on the boundary stages only wait for their own
+    stage's previous cell.
+
+    Args:
+        schedules: one equally long list of single-character cells per stage,
+            using letters from "FfBbWw" and ' ' for idle slots.
+        m: number of microbatches; no letter may occur more than ``m`` times
+            on a stage.
+
+    Returns:
+        A new list of rows with the same shape as ``schedules``.
+
+    Raises:
+        RuntimeError: if a cell is classified by none of the letter groups.
+    """
+    p = len(schedules)
+    squeezed = [[' '] * len(row) for row in schedules]
+    max_len = 0
+    for seq in squeezed:
+        assert max_len == 0 or max_len == len(seq)
+        max_len = max(max_len, len(seq))
+    # Per stage: how many cells of each letter have been placed so far.
+    identifier_cnt = [{_id: 0 for _id in "FfBbWw"} for _ in range(p)]
+    # Entry [microbatch * p + stage][letter] is the column the cell was placed in.
+    identifier_index = [{_id: -1 for _id in "FfBbWw"} for _ in range(p * m)]
+    # Per stage: first column that is still free.
+    stage_index = [0 for _ in range(p)]
+    for j in range(max_len):
+        for _dir in range(2):
+            if _dir == 0:
+                _iter = range(p)
+            else:
+                _iter = range(p - 1, -1, -1)
+            for i in _iter:
+                identifier = schedules[i][j]
+                if identifier == ' ':
+                    continue
+                # Upper-case cells belong to the top-down sweep, lower-case to the bottom-up one.
+                if _dir == 0 and identifier in "fbw":
+                    continue
+                if _dir == 1 and identifier in "FBW":
+                    continue
+                _cnt = identifier_cnt[i][identifier]
+                assert _cnt < m, "{} - {}, {}".format(i, identifier, _cnt)
+                if identifier in "Ww" or (i == 0 and identifier in "FB") or (i == p - 1 and identifier in "fb"):
+                    if i == 0 and identifier == 'B':
+                        assert identifier_index[_cnt * p + i]['f'] >= 0
+                    if i == p - 1 and identifier == 'f':
+                        assert identifier_index[_cnt * p + i]['F'] >= 0
+                    if i == p - 1 and identifier == 'b':
+                        assert identifier_index[_cnt * p + i]['B'] >= 0
+                    index = stage_index[i]
+                elif identifier in "FB":
+                    assert identifier_index[_cnt * p + i - 1][identifier] >= 0, "{} {} {}".format(i,identifier,_cnt)
+                    index = max(identifier_index[_cnt * p + i - 1][identifier] + 1, stage_index[i])
+                elif identifier in "fb":
+                    assert identifier_index[_cnt * p + i + 1][identifier] >= 0, "{} {} {}".format(i,identifier,_cnt)
+                    index = max(identifier_index[_cnt * p + i + 1][identifier] + 1, stage_index[i])
+                else:
+                    # A bare ``raise`` has no active exception to re-raise here;
+                    # raise the same exception type with a useful message.
+                    raise RuntimeError(
+                        "unexpected schedule cell {!r} at stage {}, column {}".format(identifier, i, j)
+                    )
+                squeezed[i][index] = identifier
+                identifier_cnt[i][identifier] += 1
+                identifier_index[_cnt * p + i][identifier] = index
+                stage_index[i] = index + 1
+    return squeezed
+def process_cooldown(schedules, m):
+    """
+           fBW       bwbwbwbw
+          fBWBW     bwbwbwbw
+         fBWBWBW   bwbwbwbw
+        fBWBWBWBW bwbwbwbw
+       f  BWBWBWBbWbwbwbww
+      f    BWBWBbBbWbWbwwww
+     f      BWBbBbBbWbWWwwww
+    f        BbBbBbBbWWWWwwww
+    We reorganize the cooldown phase in the following way (i -> pipeline stage from 0):
+        1. After the last f, we set #b = (peak_mem+1)//2, and #B = min(i+1, peak_mem - #b)
+        2. After the last f, we make all the dependencies as tight as possible
+    """
+    p = len(schedules)
+    peak_mem = get_peak_mem(schedules)
+    assert peak_mem <= 2 * p
+    # Upper bounds on how many trailing 'b' (max_bb) and 'B' (max_b) cells are
+    # relocated per stage, each capped at m.
+    max_bb = (peak_mem + 1) // 2
+    max_bb = min(max_bb, m)
+    max_b = min(peak_mem - max_bb, m)
+    # 1: reorganize B/b and remove W/w in cooldown phase
+    # Column of the last 'f' on stage 0; anchors the new B/b columns of every stage.
+    starting_index = -1
+    for i in range(p):
+        c_b, c_bb, c_w, c_ww = 0, 0, 0, 0
+        last_ff_index = -1
+        # collect B/b which can be reorganized
+        # (scanning right to left, so the last cells of the row are taken first)
+        for j in range(len(schedules[i]) - 1, -1, -1):
+            char = schedules[i][j]
+            if char == 'f' and last_ff_index == -1:
+                last_ff_index = j
+            if char == 'B' and c_b < i + 1 and c_b < max_b:
+                schedules[i][j] = ' '
+                c_b += 1
+            if char == 'b' and c_bb < max_bb:
+                schedules[i][j] = ' '
+                c_bb += 1
+        # clear W in the tail (#W + #w = peak_mem)
+        for j in range(len(schedules[i]) - 1, -1, -1):
+            char = schedules[i][j]
+            if char == 'W' and c_w + c_ww < peak_mem:
+                schedules[i][j] = ' '
+                c_w += 1
+            if char == 'w' and c_w + c_ww < peak_mem:
+                schedules[i][j] = ' '
+                c_ww += 1
+        if i == 0:
+            starting_index = last_ff_index
+        # reorganize B/b in the tail
+        # (each removed cell is written back two columns apart, counting down
+        # from a stage-dependent column)
+        for k in range(c_bb):
+            index = starting_index - i + 2 * p - 2 * k
+            assert schedules[i][index] == ' ', "{} {} {}".format(schedules[i][index], k, i)
+            schedules[i][index] = 'b'
+        for k in range(c_b):
+            index = starting_index + 1 + i - 2 * k
+            assert schedules[i][index] == ' ', schedules[i][index]
+            schedules[i][index] = 'B'
+    # 2: squeeze cooldown phase without change order
+    schedules = squeeze_without_change_order(schedules, m)
+    # 3: add W back in cooldown phase
+    for i in range(p):
+        c_w, c_ww = 0, 0
+        # Blank the last two W/w cells of the row and remember the column of
+        # the third-from-last one; the value stays <= 0 if there are fewer.
+        last_w_index = -2
+        for j in range(len(schedules[i]) - 1, -1, -1):
+            if schedules[i][j] in "Ww":
+                if last_w_index < 0:
+                    schedules[i][j] = ' '
+                    last_w_index += 1
+                else:
+                    last_w_index = j
+                    break
+        # c_w / c_ww count the B / b cells seen so far that have no W / w yet;
+        # blanks after last_w_index are filled with outstanding W first, then w.
+        for j in range(len(schedules[i])):
+            char = schedules[i][j]
+            if char == 'B':
+                c_w += 1
+            elif char == 'b':
+                c_ww += 1
+            elif char == 'W':
+                c_w -= 1
+            elif char == 'w':
+                c_ww -= 1
+            if char == ' ' and j > last_w_index:
+                if c_w > 0:
+                    schedules[i][j] = 'W'
+                    c_w -= 1
+                elif c_ww > 0:
+                    schedules[i][j] = 'w'
+                    c_ww -= 1
+    # Squeeze once more now that the W/w cells are back in place.
+    schedules = squeeze_without_change_order(schedules, m)
+    return schedules
+def schedule_by_pattern(p, m, patterns):
+    """Build a full schedule for ``p`` stages and ``m`` microbatches from patterns.
+
+    The rows are first generated for ``max(m, 2 * p)`` microbatches, the
+    warmup is reorganized, surplus passes beyond ``m`` per letter are dropped,
+    and the result is squeezed and given its cooldown treatment.
+
+    Returns a ``(schedules, peak_mem, stage_bubbles)`` tuple. When a peak
+    memory check fails, ``schedules`` is ``None``, the peak is the initial
+    one and the bubble list is a placeholder.
+    """
+    padded_m = max(m, 2 * p)
+    schedules = init_repeated_schedule(p, padded_m, patterns)
+    schedules = clear_invalid_index(schedules, padded_m)
+    print_schedules(schedules)
+    init_peak_mem = get_peak_mem(schedules)
+    if init_peak_mem > 2 * p:
+        return None, init_peak_mem, [6 * padded_m] * p
+    schedules = process_warmup_without_increasing_peak_mem(schedules, padded_m)
+    # Keep only the first m occurrences of every letter on each stage.
+    for row in schedules:
+        seen = dict.fromkeys("FfBbWw", 0)
+        for col, cell in enumerate(row):
+            if cell == ' ':
+                continue
+            if seen[cell] < m:
+                seen[cell] += 1
+            else:
+                row[col] = ' '
+    print_schedules(schedules)
+    if get_peak_mem(schedules) > init_peak_mem:
+        return None, init_peak_mem, [6 * m] * p
+    schedules = squeeze_without_change_order(schedules, m)
+    print_schedules(schedules)
+    schedules = process_cooldown(schedules, m)
+    print_schedules(schedules)
+    peak_mem = get_peak_mem(schedules)
+    if peak_mem > init_peak_mem:
+        return None, init_peak_mem, [6 * m] * p
+    return schedules, peak_mem, calc_bubble(schedules)
+def fill_w_in_pattern(pattern):
+    """Assign slots to 'W' and 'w' in a pattern, in place, and return it.
+
+    ``pattern`` holds the slots of F, f, B, b, W, w in that order. 'W' takes
+    the first free slot found cyclically from B's slot, then 'w' does the
+    same from b's slot. B and b must already have slots.
+    """
+    b_idx, bb_idx, w_idx, ww_idx = 2, 3, 4, 5
+    taken = [False for _ in range(pattern_size)]
+    for slot in pattern:
+        if slot >= 0:
+            taken[slot] = True
+    assert pattern[b_idx] >= 0 and pattern[bb_idx] >= 0
+    for src, dst in ((b_idx, w_idx), (bb_idx, ww_idx)):
+        shift = 0
+        while shift < pattern_size:
+            slot = (pattern[src] + shift) % pattern_size
+            if not taken[slot]:
+                pattern[dst] = slot
+                taken[slot] = True
+                break
+            shift += 1
+    return pattern
+def get_whole_pattern(pattern_0, offset_0, offset_1, len_0, p):
+    """Derive the pattern of every stage from the pattern of stage 0.
+
+    Each next stage shifts the previous stage's slots by ``offset_0`` for the
+    first ``len_0`` steps and by ``offset_1`` afterwards (modulo
+    ``pattern_size``), then has its W/w slots filled in.
+
+    Returns the list of ``p`` patterns, or ``None`` as soon as two shifted
+    letters collide on the same slot.
+    """
+    patterns = [pattern_0]
+    for stage in range(p - 1):
+        offset = offset_0 if stage < len_0 else offset_1
+        previous = patterns[-1]
+        shifted = [-1] * pattern_size
+        used = set()
+        for letter_idx, delta in enumerate(offset):
+            slot = (previous[letter_idx] + delta + pattern_size) % pattern_size
+            assert 0 <= slot < pattern_size
+            if slot in used:
+                return None
+            used.add(slot)
+            shifted[letter_idx] = slot
+        patterns.append(fill_w_in_pattern(shifted))
+    return patterns
+def schedule(p, m, cost, max_mem):
+    """Search the pattern space for the schedule with the smallest evaluated span.
+
+    Every base pattern for stage 0 is combined with one or two per-stage
+    offsets; each combination is expanded with ``schedule_by_pattern`` and
+    scored with ``evaluate_schedule``. The first schedule with the lowest
+    score wins and is converted with ``transform_schedule``.
+
+    Args:
+        p: number of pipeline stages.
+        m: number of microbatches.
+        cost: sequence unpacked as the ``(f, b, w, c)`` arguments of
+            ``evaluate_schedule`` and ``transform_schedule``.
+        max_mem: candidates whose peak memory exceeds this (or ``2 * p``)
+            are discarded.
+
+    Returns:
+        The result of ``transform_schedule`` for the best schedule found.
+
+    Raises:
+        ValueError: if no candidate satisfies the memory limit (this is
+            always the case for ``p < 2``, where no candidate is generated).
+    """
+    # Stage-0 patterns: F sits in slot 0; f, B and b take distinct other slots.
+    available_patterns = []
+    for ff_i in range(1, pattern_size):
+        for b_i in range(1, pattern_size):
+            for bb_i in range(1, pattern_size):
+                if ff_i == b_i or ff_i == bb_i or b_i == bb_i:
+                    continue
+                pattern = [0, ff_i, b_i, bb_i, -1, -1]
+                available_patterns.append(fill_w_in_pattern(pattern))
+    # Per-stage slot shifts for F, f, B, b.
+    available_offsets = [
+        [1, -1, 1, -1],
+        [2, -1, 2, -1],
+        [3, -1, 3, -1],
+        [4, -1, 4, -1],
+        [5, -1, 5, -1]
+    ]
+    best_schedule = None
+    best_bubble = None
+    for pattern_0 in available_patterns:
+        for i_0 in range(len(available_offsets)):
+            for i_1 in range(i_0 + 1):
+                for len_0 in range(1, p):
+                    offset_0 = available_offsets[i_0]
+                    offset_1 = available_offsets[i_1]
+                    whole_pattern = get_whole_pattern(pattern_0, offset_0, offset_1, len_0, p)
+                    if whole_pattern is None:
+                        continue
+                    s, peak_mem, _ = schedule_by_pattern(p, m, whole_pattern)
+                    if s is None:
+                        continue
+                    if peak_mem > 2 * p or peak_mem > max_mem:
+                        continue
+                    max_bubble = evaluate_schedule(s, *cost)
+                    if best_schedule is None or max_bubble < best_bubble:
+                        best_schedule, best_bubble = s, max_bubble
+    if best_schedule is None:
+        raise ValueError(
+            "no feasible schedule for p={}, m={}, max_mem={}".format(p, m, max_mem)
+        )
+    return transform_schedule(best_schedule, *cost)

app.py CHANGED Viewed

@@ -1,15 +1,12 @@
 import gradio as gr
-import auto_schedule
-import v_schedule
 import hand_schedule
 from PIL import Image
 from svg_event import render_manual_graph
 import pathlib
-def greet(name, is_morning, temperature):
-    salutation = "Good morning" if is_morning else "Good evening"
-    greeting = f"{salutation} {name}. It is {temperature} degrees today"
-    celsius = (temperature - 32) * 5 / 9
-    return greeting, round(celsius, 2)
 def percentage(x):
   return f"{x*100:.2f}%"
@@ -25,6 +22,26 @@ def get_schedule_time(result):
   )
   return time
 img_queue = []
 def get_schedule_image(result, max_time):
   result = [
@@ -41,80 +58,87 @@ def get_schedule_image(result, max_time):
 def calculate(p, m, f, b, w, c, mem):
-  if mem < p:
-    baseline_time=None
-    baseline_bubble=None
-    baseline_acceleration=None
-    baseline_image=None
-    baseline_result=None
-  else:
-    baseline_result = hand_schedule.get_hand_schedule(p, m, f, b + w, 0, c)
-    baseline_result = [
-        list(filter(lambda x: x.type in {'F', 'B'}, r)) for r in baseline_result
-    ]
-    baseline_time = get_schedule_time(baseline_result)
-    baseline_bubble=percentage(baseline_time/(f+b+w)/m - 1)
-    baseline_acceleration=percentage(0)
-  zb_result = auto_schedule.auto_schedule(p, m, auto_schedule.GraphConfig(
-        cost_f=f,
-        cost_b=b,
-        cost_w=w,
-        cost_comm=c,
-        max_mem=mem * 2,
-        print_scaling=1000
-  ))
-  zb_time=get_schedule_time(zb_result)
-  zb_bubble=percentage(zb_time/(f+b+w)/m - 1)
-  zb_acceleration=percentage(baseline_time/zb_time - 1) if baseline_time is not None else None
-  if mem < p:
-    zbv_time=None
-    zbv_bubble=None
-    zbv_acceleration=None
-    zbv_image=None
-    zbv_result=None
-  else:
-    zbv_graph = v_schedule.PipelineGraph(
-                  n_stage=p,
-                  n_micro=m,
-                  f_cost=f/2,
-                  b_cost=b/2,
-                  w_cost=w/2,
-                  c_cost=c,
-                  f_mem=2,
-                  b_mem=-1,
-                  w_mem=-1,
-                  max_mem=mem * 4,
-    )
-    zbv_result = zbv_graph.get_v_schedule()
-    zbv_time = get_schedule_time(zbv_result)
-    zbv_bubble=percentage(zbv_time/(f+b+w)/m - 1)
-    zbv_acceleration=percentage(baseline_time/zbv_time - 1) if baseline_time is not None else None
-  max_time = max(filter(lambda x: x is not None, [baseline_time, zb_time, zbv_time]))
   print(max_time)
   if baseline_result is not None:
     baseline_image = get_schedule_image(baseline_result, max_time)
-  if zb_result is not None:
-    zb_image = get_schedule_image(zb_result, max_time)
-  if zbv_result is not None:
-    zbv_image = get_schedule_image(zbv_result, max_time)
-  return [baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image]
 with gr.Blocks() as demo:
   gr.Markdown(open("description1.md").read())
   gr.Markdown("# Pipeline Scheduler Playground")
   presets = {
-    'Ideal Case 1p': (4, 12, 20, 20, 20, 0, '1p (Same as 1F1B)'),
-    'Ideal Case 2p': (4, 12, 20, 20, 20, 0, '2p'),
-    'Real Case 1p': (4, 12, 1049, 1122, 903, 79, '1p (Same as 1F1B)'),
-    'Real Case 2p': (4, 12, 1049, 1122, 903, 79, '2p'),
   }
   preset_buttons = {}
@@ -129,25 +153,31 @@ with gr.Blocks() as demo:
       with gr.Group():
         gr.Markdown("Basic Parameters")
         with gr.Row():
-          p=gr.Number(label="Number of stages (p)", value=4, interactive=True, precision=0)
           m=gr.Number(label="Number of microbatches (m)", value=12, interactive=True, precision=0)
     with gr.Column(scale=2):
       with gr.Group():
-        gr.Markdown("Costs. All costs are used as integers. For ZBV schedules, this is the time of two virtual stages on a stage combined.")
         with gr.Row():
-          f=gr.Number(label="Time of F", value=100, interactive=True, precision=0)
-          b=gr.Number(label="Time of B", value=110, interactive=True, precision=0)
-          w=gr.Number(label="Time of W", value=90, interactive=True, precision=0)
-          c=gr.Number(label="Time of one P2P communication", value=5, interactive=True, precision=0)
   with gr.Group():
     gr.Markdown("Activation memory limit.")
     def update_mem(p, s, mem):
       print("update")
-      if s=="custom":
         return mem
-      return int(p*float(s.split('p')[0]) + 0.5)
-    memsel=gr.Radio(choices=["1p (Same as 1F1B)", "1.5p", "2p", "3p", "custom"], value="1p (Same as 1F1B)")
-    mem=gr.Number(label="Custom memory limit in terms of pending F on a stage. For ZBV schedules, this is relative to two virtual stages on a stage combined.", value=p.value, interactive=True, precision=0)
     memsel.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
     p.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
@@ -157,31 +187,53 @@ with gr.Blocks() as demo:
     gr.Markdown("1F1B")
     with gr.Row():
       with gr.Column(scale=1):
-        baseline_time=gr.Textbox("", label="Longest Stage Time")
-        baseline_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
         baseline_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
         baseline_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
   with gr.Group():
-    gr.Markdown("ZB Schedule")
     with gr.Row():
       with gr.Column(scale=1):
-        zb_time=gr.Textbox("", label="Longest Stage Time")
-        zb_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
-        zb_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
-        zb_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
   with gr.Group():
-    gr.Markdown("ZBV Schedule")
     with gr.Row():
       with gr.Column(scale=1):
-        zbv_time=gr.Textbox("", label="Longest Stage Time")
-        zbv_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
-        zbv_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
       with gr.Column(scale=4):
-        zbv_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
-    button.click(calculate, inputs=[p, m, f, b, w, c, mem], outputs=[baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image])
   for (k, v) in presets.items():
     def update_preset(pb, p, m, f, b, w, c, mem):
@@ -192,6 +244,10 @@ with gr.Blocks() as demo:
     preset_buttons[k].click(
        update_preset,
        inputs=[preset_buttons[k], p, m, f, b, w, c, mem],
-       outputs=[p, m, f, b, w, c, memsel, baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image])
-  gr.Markdown(open("description2.md").read())
 demo.launch()

 import gradio as gr
 import hand_schedule
+import adaptive_schedule
+import interleaved_variant
+import type2
+import schedule1f1bv
 from PIL import Image
 from svg_event import render_manual_graph
 import pathlib
 def percentage(x):
   return f"{x*100:.2f}%"
   )
   return time
+def get_memory_usage(result):
+  max_mem = 0
+  has_w = False
+  for r in result:
+    for x in r:
+      if x.type in ('W', 'w'):
+        has_w = True
+  for r in result:
+    cur = 0
+    for x in r:
+      if x.type in ('F', 'f'):
+        cur += 1
+      if x.type in ('W', 'w'):
+        cur -= 1
+      if has_w == False and x.type in ('B', 'b'):
+        cur -= 1
+      max_mem = max(max_mem, cur)
+  return max_mem
 img_queue = []
 def get_schedule_image(result, max_time):
   result = [
 def calculate(p, m, f, b, w, c, mem):
+  baseline_result = hand_schedule.get_hand_schedule(p, m, f, b + w, 0, c)
+  baseline_result = [
+      list(filter(lambda x: x.type in {'F', 'B'}, r)) for r in baseline_result
+  ]
+  baseline_time = get_schedule_time(baseline_result)
+  baseline_bubble=percentage(baseline_time/(f+b+w)/m - 1)
+  baseline_mem = get_memory_usage(baseline_result)
+  baseline_acceleration=percentage(0)
+  adapt_result = adaptive_schedule.schedule(
+                p,
+                m,
+                [f/2, b/2, w/2, c],
+                max_mem=mem * 2
+  )
+  adapt_time = get_schedule_time(adapt_result)
+  adapt_mem = get_memory_usage(adapt_result) / 2
+  adapt_bubble=percentage(adapt_time/(f+b+w)/m - 1)
+  adapt_acceleration=percentage(baseline_time/adapt_time - 1) if baseline_time is not None else None
+  schedule1f1bv_result = schedule1f1bv.schedule(
+                p,
+                m,
+                [f / 2, b / 2, w / 2, c]
+  )
+  schedule1f1bv_time = get_schedule_time(schedule1f1bv_result)
+  schedule1f1bv_mem = get_memory_usage(schedule1f1bv_result) / 2
+  schedule1f1bv_bubble=percentage(schedule1f1bv_time/(f+b+w)/m - 1)
+  schedule1f1bv_acceleration=percentage(baseline_time/schedule1f1bv_time - 1) if baseline_time is not None else None
+  type2_result = type2.schedule(
+                p,
+                m,
+                [f, b, w, c]
+  )
+  type2_time = get_schedule_time(type2_result)
+  type2_mem = get_memory_usage(type2_result)
+  type2_bubble=percentage(type2_time/(f+b+w)/m - 1)
+  type2_acceleration=percentage(baseline_time/type2_time - 1) if baseline_time is not None else None
+  interleaved_result = interleaved_variant.get_interleaved_variation(
+                p,
+                m,
+                [f/2, b/2, w/2, c]
+  )
+  interleaved_time = get_schedule_time(interleaved_result)
+  interleaved_mem = get_memory_usage(interleaved_result) / 2
+  interleaved_bubble=percentage(interleaved_time/(f+b+w)/m - 1)
+  interleaved_acceleration=percentage(baseline_time/interleaved_time - 1) if baseline_time is not None else None
+  max_time = max(filter(lambda x: x is not None, [baseline_time, adapt_time, interleaved_time, type2_time, schedule1f1bv_time]))
   print(max_time)
   if baseline_result is not None:
     baseline_image = get_schedule_image(baseline_result, max_time)
+  if adapt_result is not None:
+    adapt_image = get_schedule_image(adapt_result, max_time)
+  if interleaved_result is not None:
+    interleaved_image = get_schedule_image(interleaved_result, max_time)
+  if type2_result is not None:
+    type2_image = get_schedule_image(type2_result, max_time)
+  if schedule1f1bv_result is not None:
+    schedule1f1bv_image = get_schedule_image(schedule1f1bv_result, max_time)
+  return [baseline_acceleration, baseline_mem, baseline_bubble, baseline_image,
+          adapt_acceleration, adapt_mem, adapt_bubble, adapt_image,
+          schedule1f1bv_acceleration, schedule1f1bv_mem, schedule1f1bv_bubble, schedule1f1bv_image,
+          type2_acceleration, type2_mem, type2_bubble, type2_image,
+          interleaved_acceleration, interleaved_mem, interleaved_bubble, interleaved_image]
 with gr.Blocks() as demo:
   gr.Markdown(open("description1.md").read())
   gr.Markdown("# Pipeline Scheduler Playground")
   presets = {
+    'Real Case': (6, 12, 1049, 1122, 903, 79, 'V-Half'),
+    'Ideal Case': (6, 12, 20, 20, 20, 0, 'V-Min'),
+    'Zero Bubble Case': (6, 12, 1049, 1122, 903, 79, 'V-ZB')
   }
   preset_buttons = {}
       with gr.Group():
         gr.Markdown("Basic Parameters")
         with gr.Row():
+          p=gr.Number(label="Number of stages (p)", value=6, interactive=True, precision=0)
           m=gr.Number(label="Number of microbatches (m)", value=12, interactive=True, precision=0)
     with gr.Column(scale=2):
       with gr.Group():
+        gr.Markdown("Costs. All costs are used as integers. For chunked schedules, this is the time of two virtual stages on a stage combined.")
         with gr.Row():
+          f=gr.Number(label="Time of F", value=1049, interactive=True, precision=0)
+          b=gr.Number(label="Time of B", value=1122, interactive=True, precision=0)
+          w=gr.Number(label="Time of W", value=903, interactive=True, precision=0)
+          c=gr.Number(label="Time of one P2P communication", value=79, interactive=True, precision=0)
   with gr.Group():
     gr.Markdown("Activation memory limit.")
     def update_mem(p, s, mem):
       print("update")
+      if s == "custom":
         return mem
+      if s == "V-Min":
+        return (p + 4) // 3
+      if s == "V-Half":
+        return (p + 2) // 2
+      if s == "V-ZB":
+        return p
+      assert False
+    memsel=gr.Radio(choices=["V-Min", "V-Half", "V-ZB", "custom"], value="V-Half")
+    mem=gr.Number(label="Custom memory limit in terms of pending F on a stage. For chunked schedules, this is relative to two virtual stages on a stage combined.", value=(p.value + 2) // 2, interactive=True, precision=0)
     memsel.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
     p.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
     gr.Markdown("1F1B")
     with gr.Row():
       with gr.Column(scale=1):
         baseline_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+        baseline_mem=gr.Textbox("", label="Maximum memory usage")
+        baseline_bubble=gr.Textbox("", label="Bubble Rate")
       with gr.Column(scale=4):
         baseline_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
   with gr.Group():
+    gr.Markdown("Adaptive Scheduler")
+    with gr.Row():
+      with gr.Column(scale=1):
+        adapt_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+        adapt_mem=gr.Textbox("", label="Maximum memory usage")
+        adapt_bubble=gr.Textbox("", label="Bubble Rate")
+      with gr.Column(scale=4):
+        adapt_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
+  gr.Markdown(open("description2.md").read())
+  with gr.Group():
+    gr.Markdown("1F1B-V Schedule")
     with gr.Row():
       with gr.Column(scale=1):
+        schedule1f1bv_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+        schedule1f1bv_mem=gr.Textbox("", label="Maximum memory usage")
+        schedule1f1bv_bubble=gr.Textbox("", label="Bubble Rate")
       with gr.Column(scale=4):
+        schedule1f1bv_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
   with gr.Group():
+    gr.Markdown("Two microbatch in one building block schedule")
     with gr.Row():
       with gr.Column(scale=1):
+        type2_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+        type2_mem=gr.Textbox("", label="Maximum memory usage")
+        type2_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
       with gr.Column(scale=4):
+        type2_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
+  with gr.Group():
+    gr.Markdown("Interleaved 1F1B Schedule")
+    with gr.Row():
+      with gr.Column(scale=1):
+        interleaved_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+        interleaved_mem=gr.Textbox("", label="Maximum memory usage")
+        interleaved_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
+      with gr.Column(scale=4):
+        interleaved_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
+    button.click(calculate, inputs=[p, m, f, b, w, c, mem], outputs=[baseline_acceleration, baseline_mem, baseline_bubble, baseline_image,
+          adapt_acceleration, adapt_mem, adapt_bubble, adapt_image,
+          schedule1f1bv_acceleration, schedule1f1bv_mem, schedule1f1bv_bubble, schedule1f1bv_image,
+          type2_acceleration, type2_mem, type2_bubble, type2_image,
+          interleaved_acceleration, interleaved_mem, interleaved_bubble, interleaved_image])
   for (k, v) in presets.items():
     def update_preset(pb, p, m, f, b, w, c, mem):
     preset_buttons[k].click(
        update_preset,
        inputs=[preset_buttons[k], p, m, f, b, w, c, mem],
+       outputs=[p, m, f, b, w, c, memsel,
+          baseline_acceleration, baseline_mem, baseline_bubble, baseline_image,
+          adapt_acceleration, adapt_mem, adapt_bubble, adapt_image,
+          schedule1f1bv_acceleration, schedule1f1bv_mem, schedule1f1bv_bubble, schedule1f1bv_image,
+          type2_acceleration, type2_mem, type2_bubble, type2_image,
+          interleaved_acceleration, interleaved_mem, interleaved_bubble, interleaved_image])
 demo.launch()

auto_schedule.py DELETED Viewed

@@ -1,564 +0,0 @@
-from dataclasses import dataclass
-from typing import List, Set
-@dataclass
-class GraphConfig:
-    mem_f: float = 2
-    mem_b: float = -1
-    mem_w: float = -1
-    max_mem: float = None
-    cost_f: int = 1
-    cost_b: int = 1
-    cost_w: int = 1
-    cost_comm: int = 0
-    print_scaling: int = 1
-    def __post_init__(self):
-        assert type(self.cost_f) is int
-        assert type(self.cost_b) is int
-        assert type(self.cost_w) is int
-        assert type(self.cost_comm) is int
-        assert self.mem_f + self.mem_b + self.mem_w == 0
-@dataclass(eq=True, frozen=True)
-class ScheduledNode:
-    type: str
-    stage: int
-    minibatch: int
-    start_time: int
-    completion_time: int
-    rollback: bool = False
-@dataclass
-class Graph:
-    nstages: int
-    nmb: int
-    nnodes: int
-    config: GraphConfig
-    parents: List[Set[int]] = None
-    name: List[str] = None
-    # ID mapping:
-    # F[stage][minibatch]: 0..STAGE* MB
-    # B[stage][minibatch]: STAGE* MB .. 2 * STAGE * MB
-    # W[stage][minibatch]: 2 * STAGE* MB .. 3 * STAGE * MB
-    def get_id(self, type, stage, mb):
-        return type * (self.nstages * self.nmb) + stage * self.nmb + mb
-    def get_stage(self, id):
-        return (id // self.nmb) % self.nstages
-    def get_cost(self, id):
-        type = id // (self.nstages * self.nmb)
-        return [self.config.cost_f, self.config.cost_b, self.config.cost_w][type]
-    def get_mem(self, id):
-        type = id // (self.nstages * self.nmb)
-        return [self.config.mem_f, self.config.mem_b, self.config.mem_w][type]
-    @classmethod
-    def build_graph(cls, nstages, nmb, config):
-        nnodes = nstages * nmb * 3
-        g = Graph(nstages=nstages, nmb=nmb, nnodes=nnodes, config=config)
-        parents = []
-        name = []
-        for type in range(3):
-            for stage in range(nstages):
-                for mb in range(nmb):
-                    p = set()
-                    if type == 0:
-                        name.append(f'F{mb}')
-                        if stage > 0:
-                            p.add(g.get_id(type, stage - 1, mb))
-                        if mb > 0:
-                            p.add(g.get_id(type, stage, mb - 1))
-                    elif type == 1:
-                        name.append(f'B{mb}')
-                        if stage == nstages - 1:
-                            p.add(g.get_id(0, stage, mb))
-                        else:
-                            p.add(g.get_id(type, stage + 1, mb))
-                        if mb > 0:
-                            p.add(g.get_id(type, stage, mb - 1))
-                    elif type == 2:
-                        name.append(f'W{mb}')
-                        p.add(g.get_id(1, stage, mb))
-                        if mb > 0:
-                            p.add(g.get_id(type, stage, mb - 1))
-                    else:
-                        assert False
-                    parents.append(p)
-        g.name = name
-        g.parents = parents
-        return g
-    # Manual ordering producing this kind of schedule:
-    # fffffffbfbfbfbfbfbwbwbwbwbwbwbwwwwww
-    #  fffffbfbfbfbfbfbfbfbwbwbwbwbwwwwwwww
-    #   fffbfbfbfbfbfbfbfbfbfbwbwbwwwwwwwwww
-    #    fbfbfbfbfbfbfbfbfbfbfbfbwwwwwwwwwwww
-    # Returns the order index of each node on its own stage
-    def manual_order(
-        self, allow_bubble_before_first_b=False, prioritize_b=False, no_bubble_greedy=True
-    ):
-        order = [0] * self.nnodes
-        f = [0] * self.nstages
-        b = [0] * self.nstages
-        w = [0] * self.nstages
-        o = [0] * self.nstages
-        m = [0] * self.nstages
-        e = [0] * self.nstages
-        t = [0] * self.nnodes
-        max_mem = self.config.max_mem or self.get_mem(self.get_id(0, 0, 0)) * self.nmb * 3
-        comm = self.config.cost_comm
-        order_str = [""] * self.nstages
-        stage_bubble = [0] * self.nstages
-        def get_max_bubble():
-            max_bubble = 0
-            for bb in stage_bubble:
-                max_bubble = max(max_bubble, bb)
-            return max_bubble
-        def put(stage_j, type_k):
-            if type_k == 0:
-                _i = f[stage_j]
-            elif type_k == 1:
-                _i = b[stage_j]
-            else:
-                _i = w[stage_j]
-            _j = stage_j
-            _id = self.get_id(type_k, _j, _i)
-            _mem = self.get_mem(_id)
-            _cost = self.get_cost(_id)
-            assert m[_j] + _mem <= max_mem
-            tmp = e[_j] + _cost
-            no_bubble = tmp
-            if _j > 0 and type_k == 0:
-                tmp = max(tmp, t[self.get_id(0, _j - 1, _i)] + comm + _cost)
-            if _j < self.nstages - 1 and type_k == 1:
-                tmp = max(tmp, t[self.get_id(1, _j + 1, _i)] + comm + _cost)
-            if f[_j] > 0:
-                stage_bubble[_j] += tmp - no_bubble
-            e[_j] = tmp
-            t[_id] = tmp
-            m[_j] += _mem
-            order[_id] = o[_j]
-            if type_k == 0:
-                f[_j] += 1
-            elif type_k == 1:
-                b[_j] += 1
-            else:
-                w[_j] += 1
-            o[_j] += 1
-            fbw = "fbw"
-            order_str[stage_j] += fbw[type_k]
-        for i in range(self.nmb):
-            if i == 0:
-                for j in range(self.nstages):
-                    put(j, 0)
-                f_required = [0] * self.nstages
-                last_t = 0
-                for j in range(self.nstages - 1, -1, -1):
-                    if j == self.nstages - 1:
-                        last_t = t[self.get_id(0, j, i)] + self.get_cost(self.get_id(1, j, i))
-                        continue
-                    mem = m[j]
-                    cost = e[j]
-                    while True:
-                        f_id = self.get_id(0, j, f[j] + f_required[j])
-                        if f[j] + f_required[j] < self.nmb and mem + self.get_mem(f_id) <= max_mem:
-                            if allow_bubble_before_first_b:
-                                if cost + self.get_cost(f_id) > last_t + comm:
-                                    break
-                            else:
-                                if cost >= last_t + comm:
-                                    break
-                            mem += self.get_mem(f_id)
-                            cost += self.get_cost(f_id)
-                            f_required[j] += 1
-                        else:
-                            break
-                    last_t = max(cost, last_t + comm) + self.get_cost(self.get_id(1, j, i))
-                for j in range(self.nstages):
-                    while j > 0 and f_required[j] > 0 and f_required[j] >= f_required[j - 1] and f[j] + f_required[j] < self.nmb:
-                        f_required[j] -= 1
-                for j in range(self.nstages - 1, -1, -1):
-                    for _ in range(f_required[j]):
-                        put(j, 0)
-                    put(j, 1)
-                continue
-            f_required = [0] * self.nstages
-            for j in range(self.nstages):
-                if f[j] >= self.nmb:
-                    continue
-                if j + 1 < self.nstages and f[j] >= f[j + 1] + 2 and prioritize_b:
-                    next_plus_fw = (
-                        e[j + 1]
-                        + self.get_cost(self.get_id(0, j + 1, f[j + 1]))
-                        + self.get_cost(self.get_id(1, j + 1, b[j + 1]))
-                        + comm
-                    )
-                    if e[j] >= next_plus_fw:
-                        continue
-                    f_id = self.get_id(0, j, f[j])
-                    f_mem = self.get_mem(f_id)
-                    w_cost, w_cnt = 0, 0
-                    mem_with_w = m[j] + f_mem
-                    while mem_with_w > max_mem and w[j] + w_cnt < b[j]:
-                        w_id = self.get_id(2, j, w[j] + w_cnt)
-                        w_cost += self.get_cost(w_id)
-                        mem_with_w += self.get_mem(w_id)
-                        w_cnt += 1
-                    if e[j] + self.get_cost(f_id) + w_cost <= next_plus_fw:
-                        f_required[j] = 1
-                        continue
-                    w_cost, w_cnt = 0, 0
-                    # mem_with_w = m[j]
-                    # while w[j] + w_cnt < b[j]:
-                    #     w_id = self.get_id(2, j, w[j] + w_cnt)
-                    #     w_cost += self.get_cost(w_id)
-                    #     mem_with_w += self.get_mem(w_id)
-                    #     w_cnt += 1
-                    # if e[j] + w_cost >= next_plus_fw:
-                    #     continue
-                    if next_plus_fw - (e[j] + w_cost) <= get_max_bubble() - stage_bubble[j]:
-                        # TODO: can sample here
-                        continue
-                f_required[j] = 1
-            for j in range(self.nstages - 2, -1, -1):
-                f_required[j] = min(f_required[j], f_required[j + 1])
-            for j in range(self.nstages):
-                if f_required[j] == 0:
-                    continue
-                f_id = self.get_id(0, j, f[j])
-                mem = self.get_mem(f_id)
-                while m[j] + mem > max_mem:
-                    if w[j] >= b[j]:
-                        raise ValueError("Cannot fit memory")
-                    put(j, 2)
-                if j > 0:
-                    while (
-                        w[j] < b[j]
-                        and e[j] + self.get_cost(self.get_id(2, j, w[j]))
-                        <= t[self.get_id(0, j - 1, f[j])] + comm
-                    ):
-                        put(j, 2)
-                    if w[j] < b[j] and e[j] < t[self.get_id(0, j - 1, f[j])] + comm:
-                        # TODO: e[j] + self.get_cost(self.get_id(2, j, w[j])) > t[self.get_id(0, j - 1, f[j])] + comm
-                        if (
-                            t[self.get_id(0, j - 1, f[j])] + comm - e[j]
-                            <= get_max_bubble() - stage_bubble[j]
-                        ):
-                            # TODO: can sample here
-                            if no_bubble_greedy:
-                                put(j, 2)
-                        else:
-                            put(j, 2)
-                put(j, 0)
-            for j in range(self.nstages - 1, -1, -1):
-                assert b[j] == i
-                b_id = self.get_id(1, j, b[j])
-                mem = self.get_mem(b_id)
-                while m[j] + mem > max_mem:
-                    if w[j] >= b[j]:
-                        raise ValueError("Cannot fit memory")
-                    put(j, 2)
-                if j + 1 < self.nstages:
-                    while (
-                        w[j] < b[j]
-                        and e[j] + self.get_cost(self.get_id(2, j, w[j]))
-                        <= t[self.get_id(1, j + 1, i)] + comm
-                    ):
-                        put(j, 2)
-                    if w[j] < b[j] and e[j] < t[self.get_id(1, j + 1, i)] + comm:
-                        # TODO: e[j] + self.get_cost(self.get_id(2, j, w[j])) > t[self.get_id(1, j + 1, i)] + comm
-                        if (
-                            t[self.get_id(1, j + 1, i)] + comm - e[j]
-                            <= get_max_bubble() - stage_bubble[j]
-                        ):
-                            # TODO: can sample here
-                            if no_bubble_greedy:
-                                put(j, 2)
-                        else:
-                            put(j, 2)
-                if j == 0 and f[j] == self.nmb:
-                    while w[j] < b[j]:
-                        put(j, 2)
-                put(j, 1)
-        for i in range(self.nstages):
-            while w[i] < self.nmb:
-                put(i, 2)
-            # print(f"{' ' * i}{order_str[i]}  -> {e[i]}")
-        for i in range(self.nstages):
-            for j in range(self.nmb):
-                f_id = self.get_id(0, i, j)
-                b_id = self.get_id(1, i, j)
-                w_id = self.get_id(2, i, j)
-                f_cost = self.get_cost(f_id)
-                b_cost = self.get_cost(b_id)
-                w_cost = self.get_cost(w_id)
-                assert t[b_id] >= t[f_id] + b_cost
-                assert t[w_id] >= t[b_id] + w_cost, f"{i}-{j}, {t[w_id]} >= {t[b_id]} + {w_cost}"
-                if i > 0:
-                    assert t[f_id] >= t[self.get_id(0, i - 1, j)] + comm + f_cost, f"{i}-{j}"
-                if i < self.nstages - 1:
-                    assert t[b_id] >= t[self.get_id(1, i + 1, j)] + comm + b_cost
-        # print(order)
-        best_time = 0
-        for i in range(self.nstages):
-            time_i = (
-                t[self.get_id(2, i, self.nmb - 1)]
-                - t[self.get_id(0, i, 0)]
-                + self.get_cost(self.get_id(0, i, 0))
-            )
-            best_time = max(best_time, time_i)
-        return order, t, best_time
-def initial_solution(graph):
-    best_time, order, complete_time = None, None, None
-    for allow_bubble_before_first_b in [True, False]:
-        for prioritize_b in [True, False]:
-            for no_bubble_greedy in [True, False]:
-                order_t, complete_time_t, best_time_t = graph.manual_order(
-                    allow_bubble_before_first_b=allow_bubble_before_first_b,
-                    prioritize_b=prioritize_b,
-                    no_bubble_greedy=no_bubble_greedy,
-                )
-                if best_time is None or best_time_t < best_time:
-                    best_time = best_time_t
-                    order = order_t
-                    complete_time = complete_time_t
-    print_detail(graph, complete_time)
-    print("-" * 20, best_time, "-" * 20)
-    return best_time, order, complete_time
-def print_detail(graph, F):
-    typenames = ['F', 'B', 'W']
-    times = []
-    for stage in range(graph.nstages):
-        stage_str = ['.'] * int(F[graph.get_id(2, stage, graph.nmb - 1)] / graph.config.print_scaling)
-        for _type in range(3):
-            for _mb in range(graph.nmb):
-                _id = graph.get_id(_type, stage, _mb)
-                end = int(F[_id] / graph.config.print_scaling)
-                start = int((F[_id] - graph.get_cost(_id)) / graph.config.print_scaling)
-                for j in range(start, end):
-                    if j == start or j == end - 1:
-                        stage_str[j] = typenames[_type]
-                    elif j == start + 1:
-                        if _mb >= 10:
-                            stage_str[j] = str(_mb // 10)
-                        else:
-                            stage_str[j] = str(_mb)
-                    elif j == start + 2 and _mb >= 10:
-                        stage_str[j] = str(_mb % 10)
-                    else:
-                        stage_str[j] = "-"
-        _str = ""
-        for _c in stage_str:
-            _str += _c
-        times.append(
-            F[graph.get_id(2, stage, graph.nmb - 1)]
-            - F[graph.get_id(0, stage, 0)]
-            + graph.get_cost(graph.get_id(0, stage, 0))
-        )
-        print(_str)
-    print('Longest stage time: ', max(times))
-def ilp_results(graph, F):
-    typenames = ['F', 'B', 'W']
-    local_order = []
-    end_time = []
-    for i in range(graph.nnodes):
-        end_time.append(F[i])
-    for stage in range(graph.nstages):
-        order = []
-        for type in range(3):
-            for mb in range(graph.nmb):
-                id = graph.get_id(type, stage, mb)
-                order.append(
-                    ScheduledNode(
-                        type=typenames[type],
-                        stage=stage,
-                        minibatch=mb,
-                        start_time=end_time[id] - graph.get_cost(id),
-                        completion_time=F[id],
-                    )
-                )
-        local_order.append(order)
-    # For each F/B, append a send/recv node. The timestamp of the recv node is the same as that of the send node to guarantee a global order.
-    comm_id = {}
-    comm_id_counter = 0
-    post_validation_time = 0
-    for i in range(graph.nstages - 1, -1, -1):
-        warmup_f_count = -1
-        first_b_end = end_time[graph.get_id(1, i, 0)]
-        for j in range(graph.nmb):
-            if end_time[graph.get_id(0, i, j)] < first_b_end:
-                warmup_f_count += 1
-        assert warmup_f_count >= 0
-        pv_id = warmup_f_count
-        _id = graph.get_id(0, i, pv_id)
-        _cost = graph.get_cost(_id)
-        post_validation_time = max(post_validation_time, end_time[_id] - _cost - graph.config.cost_comm)
-        # post_validation_time = 0
-        # print(i, pv_id, post_validation_time)
-        for it in ["RECV_", "SEND_", ""]:
-            if i == 0 and it == "SEND_":
-                continue
-            if i == graph.nstages - 1 and it == "RECV_":
-                continue
-            # stage_ = i - 1 if it == "RECV_" else i
-            stage_ = i
-            local_order[stage_].append(ScheduledNode(
-                type=it + "POST_VALIDATION",
-                stage=stage_,
-                minibatch=0,
-                start_time=post_validation_time,
-                completion_time=post_validation_time,
-            ))
-            comm_id[local_order[stage_][-1]] = comm_id_counter
-            comm_id_counter += 1
-    for stage in range(graph.nstages):
-        for node in local_order[stage]:
-            if node.type == 'F' and node.stage != graph.nstages - 1:
-                local_order[stage].append(
-                    ScheduledNode(
-                        type='SEND_FORWARD',
-                        stage=stage,
-                        minibatch=node.minibatch,
-                        start_time=node.completion_time,
-                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
-                    )
-                )
-                local_order[stage + 1].append(
-                    ScheduledNode(
-                        type='RECV_FORWARD',
-                        stage=stage + 1,
-                        minibatch=node.minibatch,
-                        start_time=node.completion_time,
-                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
-                    )
-                )
-                comm_id[local_order[stage][-1]] = comm_id_counter
-                comm_id[local_order[stage + 1][-1]] = comm_id_counter
-                comm_id_counter += 1
-            if node.type == 'B' and node.stage != 0:
-                local_order[stage].append(
-                    ScheduledNode(
-                        type='SEND_BACKWARD',
-                        stage=stage,
-                        minibatch=node.minibatch,
-                        start_time=node.completion_time,
-                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
-                    )
-                )
-                local_order[stage - 1].append(
-                    ScheduledNode(
-                        type='RECV_BACKWARD',
-                        stage=stage - 1,
-                        minibatch=node.minibatch,
-                        start_time=node.completion_time,
-                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
-                    )
-                )
-                comm_id[local_order[stage][-1]] = comm_id_counter
-                comm_id[local_order[stage - 1][-1]] = comm_id_counter
-                comm_id_counter += 1
-    for stage in range(graph.nstages):
-        # For nodes with the same timestamp on the same stage, communication will be prioritized.
-        def even_breaker(x: ScheduledNode):
-            # Compute nodes are always delayed.
-            if x.type in ['F', 'B', 'W']:
-                return comm_id_counter
-            # For comm nodes, order by their unique comm id
-            return comm_id[x]
-        local_order[stage] = list(sorted(
-            local_order[stage], key=lambda x: (x.start_time, even_breaker(x))
-        ))
-        # If a recv intersects with the previous computation, reorder them so that the recv
-        # is executed before the computation and hence can be overlapped.
-        for i in range(len(local_order[stage])):
-            if i > 0 and local_order[stage][i - 1].type in {'F', 'B', 'W'} and \
-                local_order[stage][i].type.startswith('RECV') and \
-                "POST_VALIDATION" not in local_order[stage][i].type and \
-                local_order[stage][i].start_time <= local_order[stage][i - 1].completion_time:
-                (local_order[stage][i], local_order[stage][i - 1]) = (local_order[stage][i - 1], local_order[stage][i])
-        # print([(x.type, x.start_time, x.completion_time) for x in local_order[stage]])
-    local_order_with_rollback = [[] for _ in range(graph.nstages)]
-    for rank in range(graph.nstages):
-        rollback_comm = set()
-        if rank > 0:
-            for node in local_order[rank - 1]:
-                if node.type == "POST_VALIDATION":
-                    break
-                if node.type == "SEND_FORWARD":
-                    rollback_comm.add(node.minibatch)
-        for node in local_order[rank]:
-            if node.type == "RECV_FORWARD" and node.minibatch in rollback_comm:
-                rollback = True
-                rollback_comm.remove(node.minibatch)
-            else:
-                rollback = False
-            local_order_with_rollback[rank].append(ScheduledNode(
-                type=node.type,
-                stage=node.stage,
-                minibatch=node.minibatch,
-                start_time=node.start_time,
-                completion_time=node.completion_time,
-                rollback=rollback,
-            ))
-        assert len(rollback_comm) == 0
-        # for node in local_order_with_rollback[rank]:
-        #     print(f"{node.type}-{node.minibatch}-{int(node.rollback)}", end=', ')
-        # print()
-    print_detail(graph, end_time)
-    return local_order_with_rollback
-def auto_schedule(nstages, nmb, config):
-    graph = Graph.build_graph(nstages, nmb, config)
-    best_time, order, complete_time = initial_solution(graph)
-    return ilp_results(graph, complete_time)
-if __name__ == "__main__":
-    # auto_schedule(4, 12, GraphConfig(cost_f=5, cost_b=6, cost_w=4, cost_comm=0, max_mem=10))
-    # auto_schedule(4, 12, GraphConfig(cost_f=5, cost_b=6, cost_w=4, cost_comm=0, max_mem=14))
-    auto_schedule(24, 72, GraphConfig(cost_f=5, cost_b=6, cost_w=4, cost_comm=0, max_mem=100))
-    auto_schedule(4, 12, GraphConfig(
-        cost_f=5478,
-        cost_b=5806,
-        cost_w=3534,
-        cost_comm=200,
-        max_mem=32,
-        print_scaling=1000
-    ))
-    auto_schedule(32, 16, GraphConfig(
-        cost_f=1,
-        cost_b=1,
-        cost_w=1,
-        cost_comm=0,
-        max_mem=64,
-    ))

description1.md CHANGED Viewed

@@ -1,11 +1,5 @@
-# Zero Bubble Pipeline Parallelism
-Zero Bubble Pipeline Parallelism is a novel pipeline parallelism algorithm able to reduce the bubble of pipeline parallelism to almost zero while preserving synchronous semantics.
-Check out our paper at:
-* [Arxiv Version with ZBV](https://arxiv.org/abs/2401.10241)
-* [ICLR Accepted version with ZB1P and ZB2P](https://openreview.net/pdf?id=tuzTN0eIO5)
-Try out our implementation based on Megatron on [https://github.com/sail-sg/zero-bubble-pipeline-parallelism](https://github.com/sail-sg/zero-bubble-pipeline-parallelism)
-Experiments shows zero bubble pipeline parallelism can accelerate training up to 30% with a similar memory comsumption. A detailed table of experiments is coming soon.

+# Pipeline Parallelism with Controllable Memory
+Check out our paper at [Arxiv](https://arxiv.org/abs/2405.15362).
+Bubble Rate here is calculated as (1 - longest stage time/(F+B+W)/m).

description2.md CHANGED Viewed

@@ -1,33 +1,6 @@
-## Zero Bubble Schedules
-The key of achieving zero bubble is to breaking a backward pass into a B pass and W pass. B on one stage will only depend on the B on its next stage, compared to depending on both B and W of in 1F1B.
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/63510eea0b94548566dad923/8B9thyMiLgysNi_m_O3Qn.png)
-### Comparision of Schedules
-* 1F1B
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/63510eea0b94548566dad923/Q3yxf4BQIESQ_M7lKKlhf.png)
-* ZB1P
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/63510eea0b94548566dad923/EcTFvbjfM7soUXDYyn1Xu.png)
-* ZB2P
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/63510eea0b94548566dad923/8jFI_rO69BREKqiSFHIOL.png)
-* ZBV - Each device is assigned to exactly 2 chunks (virtual stages), where white text colors represent the first chunk and black text colors represent the second chunk. The sequence of dependencies among model chunks follows a ”V” shape pattern for both the forward and backward passes.
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/63510eea0b94548566dad923/VRfjNVXakAU3MQK3h6OKa.png)
-| Comparison assuming T_F=T_B=T_W                                                      | 1F1B    | ZB1P     | ZB2P | ZBV (Recommended) |
-| ----------------------------------------------------- | ------- | -------- | ---- | --- |
-| Bubble Rate                                           | (p-1)/m | (p-1)/3m | 0    | 0   |
-| Activation Memory <br> (Compared to 1F1B)             | 1x       | 1x        | 2x    | 1x   |
-| Pipeline Communication Volume <br> (Compared to 1F1B) | 1x       | 1x        | 1x    | 2x   |
-## Optimizer Post Validation
-In most practices of PP there's an all-reduce cross all pipeline stages for numerical robustness, e.g. global gradient norm for gradient clipping. INF/NAN check for mixed precision training, etc. This all-reduce breaks parallelogram and makes zero bubble impossible.
-Under the observation that during a stable training both the gradient clipping and INF/NAN rarely triggers, we replace the before-hand synchronizations with a post update validation.
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/63510eea0b94548566dad923/hRPFqaFxJ20wm2omwyKmO.png)
-We eagerly step the optimizers assuming the grad cliping, INF/NAN conditions are not triggered. In case an amendment to the gradient is required, a rollback will be issued and then we redo the optimizer step based on the fully reduced global state.

+## Alternative schedules
+By utilizing the building block, we can search for different types of schedules depending on the need. We illustrate a few of them below:
+* 1F1B-V schedule without doing any B-W split.
+* Schedule with two-thirds of the 1F1B memory, achieved by utilising the B-W split. Note that two microbatches are included in a single building block to avoid collisions.
+* Variation of interleaved 1F1B with lower memory

interleaved_variant.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from dataclasses import dataclass
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    type: str
+    chunk: int
+    stage: int
+    minibatch: int
+    start_time: int
+    completion_time: int
+def get_interleaved_variation(_p, _n, cost):
+    _f, _b, _w, _c = cost
+    schedule = []
+    local_prev = {}
+    f_order = []
+    b_order = []
+    left = [_n, _n]
+    for id in range(min(_n, _p)):
+        f_order.append(('F', id))
+    for id in range(min(_n, _p)):
+        f_order.append(('f', id))
+    left = [max(0, _n - _p), max(0, _n - _p)]
+    i = 0
+    cur = 0
+    for id in range(min(_n, _p)):
+        b_order.append(('B', id))
+    while left[0] > 0 or left[1] > 0:
+        if i >= _p and left[1 - cur] > 0:
+            cur = 1 - cur
+        if left[cur] > 0:
+            if cur == 0:
+                f_order.append(('F', _n - left[cur]))
+                b_order.append(('b', _n - left[cur] - _p))
+            else:
+                f_order.append(('f', _n - left[cur]))
+                b_order.append(('B', _n - left[cur]))
+            left[cur] -= 1
+        i += 3
+    for id in range(min(_n, _p)):
+        b_order.append(('b', _n - _p + id))
+    for stage in range(_p):
+        diff = min(_p + _p - stage, len(f_order))
+        stage_schedule = []
+        for i in range(diff):
+            stage_schedule.append(f_order[i])
+        for i in range(len(f_order) - diff):
+            stage_schedule.append(b_order[i])
+            stage_schedule.append(f_order[i + diff])
+        for i in range(diff):
+            stage_schedule.append(b_order[len(b_order) - diff + i])
+        for i in range(len(stage_schedule) - 1):
+            local_prev[(stage, *stage_schedule[i + 1])] = (stage, *stage_schedule[i])
+        schedule.append(stage_schedule)
+        # print(stage_schedule)
+    # return None
+    cost = {
+        'F': _f,
+        'f': _f,
+        'B': _b+_w,
+        'b': _b+_w
+    }
+    pred = {
+        'f': 'F',
+        'B': 'f',
+        'b': 'B'
+    }
+    time_map = {}
+    def get_time(stage, type, minibatch):
+        if (stage, type, minibatch) in time_map:
+            return time_map.get((stage, type, minibatch))
+        time = 0
+        if (stage, type, minibatch) in local_prev:
+            time = get_time(*local_prev[(stage, type, minibatch)])
+        if stage > 0 and type in ('F', 'f'):
+            time = max(time, get_time(stage - 1, type, minibatch) + _c)
+        if stage == 0 and type in ('f'):
+            time = max(time, get_time(_p - 1, pred[type], minibatch) + _c)
+        if stage != _p - 1 and type in ('B', 'b'):
+            time = max(time, get_time(stage + 1, type, minibatch) + _c)
+        if stage == _p - 1 and type in ('b'):
+            time = max(time, get_time(0, pred[type], minibatch) + _c)
+        if stage == _p - 1 and type in ('B'):
+            time = max(time, get_time(stage, pred[type], minibatch))
+        time_map[(stage, type, minibatch)] = time + cost[type]
+        return time_map[(stage, type, minibatch)]
+    result = []
+    for sid, stage in enumerate(schedule):
+        result_stage = []
+        for type, minibatch in stage:
+            result_stage.append(ScheduledNode(
+                type.upper(),
+                type in ('f', 'B', 'W'),
+                sid,
+                minibatch,
+                get_time(sid, type, minibatch) - cost[type],
+                get_time(sid, type, minibatch)
+            ))
+        result.append(result_stage)
+    return result

schedule1f1bv.py ADDED Viewed

	@@ -0,0 +1,271 @@

+pattern_size = 6
+from collections import Counter
+from dataclasses import dataclass
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    type: str
+    chunk: int
+    stage: int
+    minibatch: int
+    start_time: int
+    completion_time: int
+def transform_schedule(schedule, f, b, w, c):
+    result = []
+    stage_order = []
+    local_prev = {}
+    stages = len(schedule)
+    for sid, stage in enumerate(schedule):
+        counter = Counter()
+        order = []
+        for p in stage:
+            if not p.strip():
+                continue
+            mb = counter.get(p, 0)
+            if order:
+                local_prev[(sid, p, mb)] = order[-1]
+            order.append((p, mb))
+            counter.update(p)
+        stage_order.append(order)
+    nmb = max(counter.values())
+    time_map = {}
+    cost = {
+        'F': f,
+        'B': b + w,
+        'f': f,
+        'b': b + w,
+    }
+    def get_time(stage, type, mb):
+        if (stage, type, mb) in time_map:
+            return time_map.get((stage, type, mb))
+        time = 0
+        if (stage, type, mb) in local_prev:
+            time = get_time(stage, *local_prev[(stage, type, mb)])
+        if type in ('F', 'B') and stage > 0:
+            time = max(time, get_time(stage - 1, type, mb) + c)
+        if type in ('f', 'b') and stage + 1< len(schedule):
+            time = max(time, get_time(stage + 1, type, mb) + c)
+        time_map[(stage, type, mb)] = time + cost[type]
+        return time_map[(stage, type, mb)]
+    r = 0
+    for sid, stage in enumerate(schedule):
+        r = max(get_time(sid, 'b', nmb - 1) - get_time(sid, 'F', 0) + f, r)
+    for sid, stage in enumerate(stage_order):
+        result_stage = []
+        for p, mb in stage:
+            result_stage.append(ScheduledNode(
+                p.upper(),
+                p in ('f', 'B', 'W'),
+                sid,
+                mb,
+                get_time(sid, p, mb) - cost[p],
+                get_time(sid, p, mb)
+            )
+            )
+        result.append(result_stage)
+    return result
+def get_pattern_str(pos):
+    pattern = [" "] * pattern_size
+    notations = "FfBbWw"
+    for i, v in enumerate(pos):
+        if v < 0:
+            continue
+        pattern[v] = notations[i]
+    _str = ""
+    for v in pattern:
+        _str += v
+    return _str
+def init_repeated_schedule(p, m, patterns):
+    repeated = []
+    _len = 4 * p + m + 1
+    for i in range(p):
+        str_i = get_pattern_str(patterns[i]) * _len
+        repeated_i = []
+        for v in str_i:
+            repeated_i.append(v)
+        repeated.append(repeated_i)
+    return repeated
+def clear_invalid(repeated, stage, pos, offset=-1):
+    while 0 <= pos < len(repeated[stage]):
+        repeated[stage][pos] = ' '
+        pos += offset * pattern_size
+    return repeated
+def clear_invalid_index(repeated, m):
+    """Trim the repeated pattern grid so each located pass keeps m repetitions.
+    For every label, and stage by stage, the first occurrence at or after a
+    shared cursor is located. The repetitions of that slot before it, and
+    those from m periods after it onwards, are blanked. The grid is edited
+    in place and returned.
+    """
+    p = len(repeated)
+    # Cursor shared by all labels and stages; it only ever moves forward.
+    index = pattern_size
+    for identifier in ['F', 'f', 'B', 'b']:
+        # 'F' and 'B' visit stages in ascending order, 'f' and 'b' in descending order.
+        if identifier in ['F', 'B']:
+            _iter = range(p)
+        else:
+            _iter = range(p - 1, -1, -1)
+        for i in _iter:
+            # Scan at most one pattern period for the label on this stage.
+            for j in range(pattern_size):
+                if repeated[i][index] == identifier:
+                    # Blank the repetitions one period earlier and before ...
+                    clear_invalid(repeated, i, index - pattern_size, offset=-1)
+                    # ... and those m periods later and after.
+                    clear_invalid(repeated, i, index + pattern_size * m, offset=1)
+                    index += 1
+                    if identifier in ['B', 'b']:
+                        # Trim the matching 'W'/'w' found within the next period the same way.
+                        w_identifier = {'B': 'W', 'b': 'w'}[identifier]
+                        for k in range(pattern_size):
+                            if repeated[i][index + k] == w_identifier:
+                                clear_invalid(repeated, i, index + k - pattern_size, offset=-1)
+                                clear_invalid(repeated, i, index + k + pattern_size * m, offset=1)
+                                break
+                    break
+                index += 1
+    return repeated
+def process_warmup_without_increasing_peak_mem(schedules, m):
+    """Move passes to earlier free slots without raising the peak memory count.
+    schedules is a grid (one row of single-character slots per stage) that is
+    edited in place and returned. Memory is counted as +1 per 'F'/'f' and
+    -1 per 'W'/'w'. m sizes the per-occurrence position table.
+    """
+    peak_mem = 0
+    # mem[sid][i]: running memory count on stage sid after slot i.
+    mem = [[0 for _ in range(len(schedules[0]))] for _ in range(len(schedules))]
+    # loc[sid][k][label]: slot where the k-th occurrence of label ended up on stage sid (-1 if none yet).
+    loc = [[{key: -1 for key in ('F', 'f', 'B', 'b', 'W', 'w')} for _ in range(m + 2)] for _ in range(len(schedules))]
+    # cntr[sid][label]: occurrences of label seen so far on stage sid.
+    cntr = [{key: 0 for key in ('F', 'f', 'B', 'b', 'W', 'w')} for _ in range(len(schedules))]
+    for sid in range(len(schedules)):
+        cur = 0
+        for i in range(len(schedules[sid])):
+            if schedules[sid][i] in ('F', 'f'):
+                cur += 1
+            if schedules[sid][i] in ('W', 'w'):
+                cur -= 1
+            mem[sid][i] = cur
+            peak_mem = max(peak_mem, cur)
+    # Visit slots column by column, and stages in ascending order within a column.
+    for i in range(len(schedules[0])):
+        for sid in range(len(schedules)):
+            if schedules[sid][i] == ' ':
+                continue
+            cntr[sid][schedules[sid][i]] += 1
+            cnt = cntr[sid][schedules[sid][i]]
+            # pos collects the latest slot among the passes this one must follow.
+            pos = -1
+            if cnt > 1:
+                pos = loc[sid][cnt - 1][schedules[sid][i]]
+            if schedules[sid][i] == 'W':
+                pos = max(pos, loc[sid][cnt]['B'])
+            if schedules[sid][i] == 'w':
+                pos = max(pos, loc[sid][cnt]['b'])
+            if schedules[sid][i] == 'F' and sid > 0:
+                pos = max(pos, loc[sid - 1][cnt]['F'])
+            if schedules[sid][i] == 'f':
+                if sid != len(schedules) - 1:
+                    pos = max(pos, loc[sid + 1][cnt]['f'])
+                else :
+                    pos = max(pos, loc[sid][cnt]['F'])
+            if schedules[sid][i] == 'B':
+                if sid != 0:
+                    #Because B and W are always combined
+                    pos = max(pos, loc[sid - 1][cnt]['W'])
+                else :
+                    pos = max(pos, loc[sid][cnt]['f'])
+            if schedules[sid][i] == 'b':
+                if sid != len(schedules) - 1:
+                    #Because B and W are always combined
+                    pos = max(pos, loc[sid + 1][cnt]['w'])
+                else :
+                    pos = max(pos, loc[sid][cnt]['W'])
+            # Earliest candidate slot is the one right after the latest prerequisite.
+            pos += 1
+            # Skip occupied slots, but never go past the current slot i.
+            while schedules[sid][pos] != ' ' and pos < i:
+                pos += 1
+            if schedules[sid][i] in ('B', 'b'):
+                # 'B'/'b' additionally needs the following slot to be free.
+                while pos < i and (schedules[sid][pos] != ' ' or schedules[sid][pos + 1] != ' '):
+                    pos += 1
+            if pos == i:
+                # No earlier slot available: the pass stays where it is.
+                loc[sid][cnt][schedules[sid][i]] = i
+                continue
+            if schedules[sid][i] in ('B', 'b', 'W', 'w'):
+                schedules[sid][pos] = schedules[sid][i]
+                schedules[sid][i] = ' '
+                if schedules[sid][pos] in ('W', 'w'):
+                    # Moving a 'W'/'w' earlier lowers the count over the slots in between.
+                    for j in range(pos, i):
+                        mem[sid][j] -= 1
+                loc[sid][cnt][schedules[sid][pos]] = pos
+                continue
+            #If F or f:
+            # Walk back from i while the count of the preceding slot is below the peak.
+            place = i
+            while place > pos and mem[sid][place - 1] < peak_mem:
+                place -= 1
+            # Then move forward again to the first free slot.
+            while place < i and schedules[sid][place] != ' ':
+                place += 1
+            if place == i:
+                loc[sid][cnt][schedules[sid][i]] = i
+                continue
+            pos = place
+            schedules[sid][pos] = schedules[sid][i]
+            schedules[sid][i] = ' '
+            # Moving a forward pass earlier raises the count over the slots in between.
+            for j in range(pos, i):
+                mem[sid][j] += 1
+            loc[sid][cnt][schedules[sid][pos]] = pos
+    return schedules
+def schedule_by_pattern(p, m, patterns):
+    schedules = init_repeated_schedule(p, m, patterns)
+    schedules = clear_invalid_index(schedules, m)
+    schedules = process_warmup_without_increasing_peak_mem(schedules,  m)
+    for sid in range(len(schedules)):
+        cnt = {_id: 0 for _id in "FfBbWw"}
+        for i in range(len(schedules[sid])):
+            if(schedules[sid][i] == ' '):
+                continue
+            if cnt[schedules[sid][i]] >= m:
+                schedules[sid][i] = ' '
+            else:
+                cnt[schedules[sid][i]] += 1
+    return schedules
+def create_whole_pattern(p):
+    whole_pattern = [[0 for _ in range(6)] for _ in range(p)]
+    now = 0
+    for i in range(p):
+        now += 1
+        whole_pattern[i][0] = now
+    for i in range(p):
+        now += 1
+        whole_pattern[p - 1 - i][1] = now
+    now += 1
+    if p % 3 == 0:
+        now += 3
+    cyc = (3 - (p + 2) % 3) % 3
+    for i in range(p):
+        whole_pattern[i][2], whole_pattern[i][4] = now, now + 1
+        cyc += 1
+        now += 2
+        if(cyc == 3):
+            cyc = 0
+            now += 3
+    for i in range(p):
+        whole_pattern[p - 1 - i][3], whole_pattern[p - 1 - i][5] = now, now + 1
+        cyc += 1
+        now += 2
+        if(cyc == 3):
+            cyc = 0
+            now += 3
+    for sid in range(p):
+        for i in range(6):
+            whole_pattern[sid][i] %= 6
+    return whole_pattern
+def schedule(p, m, cost):
+    whole_pattern = create_whole_pattern(p)
+    s = schedule_by_pattern(p, m, whole_pattern)
+    for sid in range(len(s)):
+        for i in range(len(s[sid])):
+            if s[sid][i] in ('W', 'w'):
+                s[sid][i] = ' '
+    res = transform_schedule(s, *cost)
+    return res

svg_event.py CHANGED Viewed

@@ -234,7 +234,7 @@ def plot_events(ctx, events, title_text: str, canvas_info: CanvasInfo, include_w
             if ENABLE_BATCH_ID:
                 minibatch = str(e["minibatch"])
                 center = (start + end) // 2
-                data_ctx.text(h, center, minibatch, font_scale=0.6, fill='black' if e["chunk"] == 0 else 'white')
         if ENABLE_BORDER:
             data_ctx.line(h+SPAN_HEIGHT, 0, h+SPAN_HEIGHT+BORDER_SIZE, max_len - 1)

             if ENABLE_BATCH_ID:
                 minibatch = str(e["minibatch"])
                 center = (start + end) // 2
+                data_ctx.text(h, center, minibatch, font_scale=0.6, fill='white' if e["chunk"] == 0 else 'black')
         if ENABLE_BORDER:
             data_ctx.line(h+SPAN_HEIGHT, 0, h+SPAN_HEIGHT+BORDER_SIZE, max_len - 1)

type2.py ADDED Viewed

	@@ -0,0 +1,163 @@

+pattern_size = 6
+from collections import Counter
+from dataclasses import dataclass
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    type: str
+    stage: int
+    minibatch: int
+    start_time: int
+    completion_time: int
+def transform_schedule(schedule, f, b, w, c):
+    result = []
+    stage_order = []
+    local_prev = {}
+    stages = len(schedule)
+    for sid, stage in enumerate(schedule):
+        counter = Counter()
+        order = []
+        for p in stage:
+            if not p.strip():
+                continue
+            mb = counter.get(p, 0)
+            if order:
+                local_prev[(sid, p, mb)] = order[-1]
+            order.append((p, mb))
+            counter.update(p)
+        stage_order.append(order)
+    nmb = max(counter.values())
+    time_map = {}
+    cost = {
+        'F': f,
+        'B': b,
+        'W': w,
+    }
+    def get_time(stage, type, mb):
+        if (stage, type, mb) in time_map:
+            return time_map.get((stage, type, mb))
+        time = 0
+        if (stage, type, mb) in local_prev:
+            time = get_time(stage, *local_prev[(stage, type, mb)])
+        if type in ('F') and stage > 0:
+            time = max(time, get_time(stage - 1, type, mb) + c)
+        if type in ('B') and stage + 1< len(schedule):
+            time = max(time, get_time(stage + 1, type, mb) + c)
+        # print(f'{stage} {type}:{mb}', time + cost[type])
+        time_map[(stage, type, mb)] = time + cost[type]
+        return time_map[(stage, type, mb)]
+    r = 0
+    for sid, stage in enumerate(schedule):
+        r = max(get_time(sid, 'W', nmb - 1) - get_time(sid, 'F', 0) + f, r)
+    for sid, stage in enumerate(stage_order):
+        result_stage = []
+        for p, mb in stage:
+            result_stage.append(ScheduledNode(
+                p.upper(),
+                sid,
+                mb,
+                get_time(sid, p, mb) - cost[p],
+                get_time(sid, p, mb)
+            )
+            )
+        result.append(result_stage)
+    return result
+def process_warmup_without_increasing_peak_mem(schedules, m):
+    peak_mem = 0
+    mem = [[0 for _ in range(len(schedules[0]))] for _ in range(len(schedules))]
+    loc = [[{key: -1 for key in ('F', 'B', 'W')} for _ in range(m + 2)] for _ in range(len(schedules))]
+    cntr = [{key: 0 for key in ('F', 'B', 'W')} for _ in range(len(schedules))]
+    for sid in range(len(schedules)):
+        cur = 0
+        for i in range(len(schedules[sid])):
+            if schedules[sid][i] in ('F'):
+                cur += 1
+            if schedules[sid][i] in ('W'):
+                cur -= 1
+            mem[sid][i] = cur
+            peak_mem = max(peak_mem, cur)
+    for i in range(len(schedules[0])):
+        for sid in range(len(schedules)):
+            if schedules[sid][i] == ' ':
+                continue
+            cntr[sid][schedules[sid][i]] += 1
+            cnt = cntr[sid][schedules[sid][i]]
+            pos = -1
+            if cnt > 1:
+                pos = loc[sid][cnt - 1][schedules[sid][i]]
+            if schedules[sid][i] == 'W':
+                pos = max(pos, loc[sid][cnt]['B'])
+            if schedules[sid][i] == 'F' and sid > 0:
+                pos = max(pos, loc[sid - 1][cnt]['F'])
+            if schedules[sid][i] == 'B':
+                if sid != len(schedules) - 1:
+                    pos = max(pos, loc[sid + 1][cnt]['B'])
+                else :
+                    pos = max(pos, loc[sid][cnt]['F'])
+            pos += 1
+            while schedules[sid][pos] != ' ' and pos < i:
+                pos += 1
+            if pos == i:
+                loc[sid][cnt][schedules[sid][i]] = i
+                continue
+            if schedules[sid][i] in ('B', 'W'):
+                schedules[sid][pos] = schedules[sid][i]
+                schedules[sid][i] = ' '
+                if schedules[sid][pos] in ('W'):
+                    for j in range(pos, i):
+                        mem[sid][j] -= 1
+                loc[sid][cnt][schedules[sid][pos]] = pos
+                continue
+            #If F:
+            if (sid == 0):
+                print(cnt, pos, i)
+            place = i
+            while place > pos and mem[sid][place - 1] < peak_mem:
+                place -= 1
+            while place < i and schedules[sid][place] != ' ':
+                place += 1
+            if place == i:
+                loc[sid][cnt][schedules[sid][i]] = i
+                continue
+            if (sid == 0):
+                print(place)
+            pos = place
+            schedules[sid][pos] = schedules[sid][i]
+            schedules[sid][i] = ' '
+            for j in range(pos, i):
+                mem[sid][j] += 1
+            loc[sid][cnt][schedules[sid][pos]] = pos
+    return schedules
+def schedule(p, m, cost):
+    schedules = [[' ' for _ in range(6 * m + 2 * p + 6)] for _ in range(p)]
+    f_0, f_1, b_0, b_1= p-1, p+1, p, p + 2
+    for sid in range(p - 1, -1, -1):
+        for mid in range((m + 1) // 2):
+            if mid * 2 < m:
+                schedules[sid][f_0 + mid * 6], schedules[sid][b_0 + mid * 6] = 'F', 'B'
+            if mid * 2 + 1 < m:
+                schedules[sid][f_1 + mid * 6], schedules[sid][b_1 + mid * 6] = 'F', 'B'
+        f_0 -= 1
+        f_1 -= 1
+        b_0 += 1
+        b_1 += 1
+        cnt = 0
+        for i in range(len(schedules[0])):
+            if schedules[sid][i] == 'B':
+                cnt += 1
+            if schedules[sid][i] == ' ' and cnt > 0:
+                cnt -= 1
+                schedules[sid][i] = 'W'
+    schedules = process_warmup_without_increasing_peak_mem(schedules, m)
+    res = transform_schedule(schedules, *cost)
+    return res

v_schedule.py DELETED Viewed

@@ -1,474 +0,0 @@
-from collections import deque
-from dataclasses import dataclass
-@dataclass(eq=True, frozen=True)
-class ScheduledNode:
-    type: str
-    chunk: int
-    stage: int
-    minibatch: int
-    start_time: int
-    completion_time: int
-    rollback: bool = False
-class PipelineGraph(object):
-    def __init__(
-        self, n_stage, n_micro, f_cost, b_cost, w_cost, c_cost,
-        f_mem, b_mem, w_mem, max_mem=None,
-    ):
-        self.n_node = 6 * n_stage * n_micro
-        self.n_stage = n_stage
-        self.n_micro = n_micro
-        self.f_cost = f_cost
-        self.b_cost = b_cost
-        self.w_cost = w_cost
-        self.c_cost = c_cost
-        self.f_mem = f_mem
-        self.b_mem = b_mem
-        self.w_mem = w_mem
-        self.fbw_cost = [f_cost, b_cost, w_cost]
-        self.fbw_mem = [f_mem, b_mem, w_mem]
-        self.max_mem = max_mem or f_mem * self.n_stage * 2
-    def get_id(self, cat, chunk, stage, micro):
-        return cat * 2 * self.n_stage * self.n_micro + \
-               chunk * self.n_stage * self.n_micro + \
-               stage * self.n_micro + \
-               micro
-    def try_v_schedule(self, fill_f=True, fill_b=True, approved_bubble=None):
-        count = []
-        for i in range(self.n_stage):
-            count.append([0] * 6)
-        end_time = [-1] * self.n_node
-        cur_time = [0] * self.n_stage
-        mem = [0] * self.n_stage
-        stage_bubble = [0] * self.n_stage
-        pending_w = [deque() for _ in range(self.n_stage)]
-        schedule = [[] for _ in range(self.n_stage)]
-        stage_str = ["    " * i for i in range(self.n_stage)]
-        if approved_bubble is None:
-            approved_bubble = [-1] * self.n_stage
-        max_approved_bubble = max(approved_bubble)
-        def get_max_stage_bubble(stage=-1):
-            max_stage_bubble = 0
-            for bb in stage_bubble:
-                max_stage_bubble = max(max_stage_bubble, bb)
-            if stage >= 0:
-                max_stage_bubble = max(max_stage_bubble, max_approved_bubble - approved_bubble[stage])
-            return max_stage_bubble
-        def put_w(stage):
-            assert len(pending_w[stage]) > 0
-            _, chunk_, _ = pending_w[stage].popleft()
-            put(2, chunk_, stage)
-        def put(cat, chunk, stage, assert_cnt=True):
-            _tmp = _no_bubble = cur_time[stage] + self.fbw_cost[cat]
-            _cnt = count[stage][cat * 2 + chunk]
-            # assert _cnt < self.n_micro
-            if _cnt >= self.n_micro:
-                if not assert_cnt:
-                    stage_str[stage] += "    "
-                    cur_time[stage] = _tmp  # TODO
-                    return
-                assert False
-            assert mem[stage] + self.fbw_mem[cat] <= self.max_mem
-            stage_str[stage] += "FfBbWw"[cat * 2 + chunk] + str(_cnt + 1) + " " * (3 - len(str(_cnt + 1)))
-            if cat > 0 or chunk > 0:
-                last_id = cat * 2 + chunk - 1
-                if cat < 2:
-                    # if end_time[self.get_id(last_id // 2, last_id % 2, stage, _cnt)] < 0:
-                    #     print(cat, chunk, stage, _cnt)
-                    #     self.print_details(end_time)
-                    assert end_time[self.get_id(last_id // 2, last_id % 2, stage, _cnt)] >= 0
-                else:
-                    assert end_time[self.get_id(1, chunk, stage, _cnt)] >= 0
-            if chunk == 1 and cat < 2:
-                if stage < self.n_stage - 1:
-                    _fa_id = self.get_id(cat, chunk, stage + 1, _cnt)
-                    assert end_time[_fa_id] >= 0
-                    _tmp = max(_tmp, end_time[_fa_id] + self.c_cost + self.fbw_cost[cat])
-            if chunk == 0 and cat < 2:
-                if stage > 0:
-                    _fa_id = self.get_id(cat, chunk, stage - 1, _cnt)
-                    # if end_time[_fa_id] < 0:
-                    #     print(cat, chunk, stage, _cnt)
-                    #     self.print_details(end_time)
-                    assert end_time[_fa_id] >= 0, f"{cat}, {chunk}, {stage}, {_cnt}"
-                    _tmp = max(_tmp, end_time[_fa_id] + self.c_cost + self.fbw_cost[cat])
-            _id = self.get_id(cat, chunk, stage, _cnt)
-            if count[stage][0] > 0:
-                stage_bubble[stage] += _tmp - _no_bubble
-            end_time[_id] = _tmp
-            cur_time[stage] = _tmp
-            mem[stage] += self.fbw_mem[cat]
-            # noinspection PyTypeChecker
-            schedule[stage].append((cat, chunk, _cnt))
-            if cat == 1:
-                pending_w[stage].append((2, chunk, _cnt))
-            count[stage][cat * 2 + chunk] += 1
-        # for _ in range(2 * self.n_stage):
-        #     for i in range(self.n_stage):
-        #         if count[i][1] >= count[i][0]:
-        #             put(0, 0, i, assert_cnt=False)
-        #             continue
-        #         if i == self.n_stage - 1:
-        #             put(0, 1, i, assert_cnt=False)
-        #             continue
-        #         fa_id = self.get_id(0, 1, i + 1, count[i][1])
-        #         if 0 <= end_time[fa_id] < cur_time[i + 1]:  # TODO
-        #             put(0, 1, i, assert_cnt=False)
-        #         else:
-        #             put(0, 0, i, assert_cnt=False)
-        for i in range(self.n_stage):
-            put(0, 0, i)
-        for i in range(self.n_stage - 1, -1, -1):
-            if i == self.n_stage - 1:
-                put(0, 1, i)
-                continue
-            tmp = end_time[self.get_id(0, 1, i + 1, 0)] + self.c_cost
-            while mem[i] + self.fbw_mem[0] * (2 + i * 2) <= self.max_mem and cur_time[i] + self.fbw_cost[0] <= tmp and count[i][0] < self.n_micro:
-                for j in range(i + 1):
-                    put(0, 0, j)
-            put(0, 1, i)
-        iter_chunk_ = 0
-        end_tmp = 0
-        for i in range(self.n_stage):
-            if i == 0:
-                end_tmp = cur_time[0] + self.fbw_cost[1]
-                continue
-            tmp = end_tmp + self.c_cost
-            while count[i][0] + count[i][1] < count[i - 1][0] + count[i - 1][1] or count[i][1] <= count[i - 1][1] < self.n_micro:
-                for j in range(self.n_stage - 1, i - 1, -1):
-                    if count[j][iter_chunk_] < self.n_micro:
-                        put(0, iter_chunk_, j)
-                iter_chunk_ = 1 - iter_chunk_
-            # while mem[i] + self.fbw_mem[0] <= self.max_mem and cur_time[i] + self.fbw_cost[0] <= tmp:
-            #     if iter_chunk_ == 0 and count[i][0] >= count[i - 1][0]:
-            #         break
-            #     for j in range(self.n_stage - 1, i - 1, -1):
-            #         if count[j][iter_chunk_] < self.n_micro:
-            #             put(0, iter_chunk_, j)
-            #     iter_chunk_ = 1 - iter_chunk_
-            # end_tmp = max(tmp, cur_time[i]) + self.fbw_cost[1]
-        # init_bubble = get_max_stage_bubble()
-        # print(stage_bubble)
-        for _ in range(2 * self.n_micro):
-            # check mem before putting b
-            for i in range(self.n_stage):
-                while mem[i] + self.fbw_mem[1] > self.max_mem:
-                    assert len(pending_w[i]) > 0
-                    put_w(i)
-            b0_ranks, b1_ranks = [], []
-            for i in range(self.n_stage):
-                if count[i][3] >= count[i][2]:
-                    b0_ranks.append(i)
-                elif i == self.n_stage - 1:
-                    b1_ranks.append(i)
-                else:
-                    fa_id = self.get_id(1, 1, i + 1, count[i][3])
-                    if end_time[fa_id] >= 0 or count[i][2] >= self.n_micro:
-                        b1_ranks.append(i)
-                    else:
-                        b0_ranks.append(i)
-            b_ranks = []
-            # put b1
-            for i in reversed(b1_ranks):
-                b_ranks.append((i, 1))
-            # put b0
-            for i in b0_ranks:
-                b_ranks.append((i, 0))
-            for i, _chunk_ in b_ranks:
-                fa_id = -1
-                if _chunk_ == 1 and i < self.n_stage - 1:
-                    fa_id = self.get_id(1, 1, i + 1, count[i][3])
-                if _chunk_ == 0 and i > 0:
-                    fa_id = self.get_id(1, 0, i - 1, count[i][2])
-                while len(pending_w[i]) > 0 and fa_id >= 0 and end_time[fa_id] + self.c_cost >= cur_time[i] + self.fbw_cost[2]:
-                    # fill the bubble
-                    put_w(i)
-                if len(pending_w[i]) > 0 and end_time[fa_id] + self.c_cost - cur_time[i] > get_max_stage_bubble(i) - stage_bubble[i]:
-                    if _chunk_ == 1:
-                        put_w(i)
-                    elif fill_b:
-                        put_w(i)
-                put(1, _chunk_, i)
-            # put f
-            for i in range(self.n_stage):
-                if count[i][1] >= self.n_micro:
-                    continue
-                put_item = None
-                if count[i][1] >= count[i][0]:
-                    put_item = 0
-                elif i == self.n_stage - 1:
-                    put_item = 1
-                else:
-                    if end_time[self.get_id(0, 1, i + 1, count[i][1])] >= 0:
-                        put_item = 1
-                    elif count[i][0] < self.n_micro:
-                        if i == 0:
-                            put_item = 0
-                        elif end_time[self.get_id(0, 0, i - 1, count[i][0])] >= 0:
-                            put_item = 0
-                if put_item is None:
-                    continue
-                # check mem before putting f
-                while mem[i] + self.fbw_mem[0] > self.max_mem:
-                    assert len(pending_w[i]) > 0
-                    put_w(i)
-                fa_id = -1
-                if put_item == 0 and i > 0:
-                    fa_id = self.get_id(0, 0, i - 1, count[i][0])
-                if put_item == 1 and i < self.n_stage - 1:
-                    fa_id = self.get_id(0, 1, i + 1, count[i][1])
-                while len(pending_w[i]) > 0 and fa_id >= 0 and end_time[fa_id] + self.c_cost >= cur_time[i] + self.fbw_cost[2]:
-                    # fill the bubble
-                    put_w(i)
-                if len(pending_w[i]) > 0 and end_time[fa_id] + self.c_cost - cur_time[i] > get_max_stage_bubble(i) - stage_bubble[i]:
-                    if fill_f:
-                        put_w(i)
-                put(0, put_item, i)
-        for i in range(self.n_stage):
-            while len(pending_w[i]) > 0:
-                put_w(i)
-        # for i in range(self.n_stage):
-        #     print(stage_str[i])
-        max_bubble = get_max_stage_bubble()
-        expected_time = sum(self.fbw_cost) * self.n_micro * 2
-        bubble_rate = max_bubble / expected_time
-        # print("%6.4f" % bubble_rate, "->", stage_bubble)
-        if max_approved_bubble < 0 or max_bubble < max_approved_bubble:
-            _schedule, _end_time, _max_bubble = self.try_v_schedule(
-                fill_f=fill_f, fill_b=fill_b,
-                approved_bubble=stage_bubble,
-            )
-            if _max_bubble < max_bubble:
-                return _schedule, _end_time, _max_bubble
-        # print("%2d %3d, [%5d %5d %5d], %6d -> %6.4f %6.4f" % \
-        #       (self.n_stage, self.n_micro, *self.fbw_cost, self.max_mem // self.f_mem, init_bubble / expected_time, bubble_rate), max_bubble)
-        return schedule, end_time, max_bubble
-    def print_details(self, end_time, print_scaling=1):  # Print one text timeline row per stage; end_time is indexed by self.get_id(cat, chunk, stage, micro), one character cell covers print_scaling time units.
-        for stage in range(self.n_stage):
-            stage_str = ['.'] * int(max(end_time) / print_scaling)  # '.' marks a cell that no pass occupies
-            for _cat in range(3):  # _cat indexes self.fbw_cost and selects the F/B/W letter pair below
-                for _chunk in range(2):
-                    for _micro in range(self.n_micro):
-                        _id = self.get_id(_cat, _chunk, stage, _micro)
-                        if end_time[_id] < 0:  # negative end time: presumably this pass was never scheduled -- verify against try_v_schedule
-                            continue
-                        end = int(end_time[_id] / print_scaling)
-                        start = int((end_time[_id] - self.fbw_cost[_cat]) / print_scaling)  # start time = end time minus the cost of this pass type
-                        for j in range(start, end):
-                            if j == start or j == end - 1:  # checked first, so on short spans the letter wins over the digits
-                                stage_str[j] = "FfBbWw"[_cat * 2 + _chunk]  # uppercase letter for chunk 0, lowercase for chunk 1
-                            elif j == start + 1:
-                                if _micro >= 10:
-                                    stage_str[j] = str(_micro // 10)  # NOTE(review): for _micro >= 100 this writes a two-character string into a single cell
-                                else:
-                                    stage_str[j] = str(_micro)
-                            elif j == start + 2 and _micro >= 10:
-                                stage_str[j] = str(_micro % 10)  # last digit of the micro-batch index
-                            else:
-                                stage_str[j] = "-"  # filler for the remainder of the span
-            _str = ""
-            for _c in stage_str:
-                _str += _c
-            print(_str)  # one output line per stage
-    def get_v_schedule(self, only_run_time=False):  # Pick the lowest-bubble try_v_schedule() result and expand it into per-stage ScheduledNode lists (compute, send/recv, post-validation, rollback flag); with only_run_time it returns max_bubble + expected_time instead.
-        schedule, end_time, max_bubble = None, None, None
-        expected_time = sum(self.fbw_cost) * self.n_micro * 2  # summed F/B/W cost times n_micro; the factor 2 presumably covers the two chunks per stage -- TODO confirm
-        for fill_b in [True, False]:
-            for fill_f in [True, False]:
-                _schedule, _end_time, _max_bubble = self.try_v_schedule(
-                    fill_b=fill_b, fill_f=fill_f
-                )
-                # print("")
-                if max_bubble is None or _max_bubble < max_bubble:  # strict '<': the first combination tried wins ties
-                    max_bubble = _max_bubble
-                    schedule = _schedule
-                    end_time = _end_time
-        if only_run_time:
-            return max_bubble + expected_time  # early exit: a number, not a node list
-        # self.print_details(end_time, print_scaling=1)
-        bubble_rate = max_bubble / (expected_time + max_bubble)  # bubble as a fraction of the total (work + bubble) time
-        print("%2d %3d, [%5d %5d %5d %5d], %6d -> %6.4f" % \
-              (self.n_stage, self.n_micro, *self.fbw_cost, self.c_cost, self.max_mem // self.f_mem, bubble_rate))
-        local_order = [[] for _ in range(self.n_stage)]  # one node list per stage
-        comm_id = {}  # communication node -> integer id, used as sort tie-breaker below (requires ScheduledNode to be hashable)
-        comm_id_counter = 0
-        post_validation_time = 0
-        for i in range(self.n_stage - 1, -1, -1):  # walk stages from last to first so the time below is a running maximum
-            pv_id = min(2 * (self.n_stage - 1 - i), self.n_micro - 1)  # micro-batch index used as the post-validation anchor on stage i
-            post_validation_time = max(post_validation_time, end_time[self.get_id(0, 0, i, pv_id)] - self.fbw_cost[0] - self.c_cost)  # start time of that chunk-0 forward, minus c_cost
-            # post_validation_time = 0
-            # print(i, pv_id, post_validation_time)
-            for it in ["RECV_", "SEND_", ""]:
-                if i == 0 and it == "SEND_":  # stage 0 gets no SEND_POST_VALIDATION node
-                    continue
-                if i == self.n_stage - 1 and it == "RECV_":  # the last stage gets no RECV_POST_VALIDATION node
-                    continue
-                # stage_ = i - 1 if it == "RECV_" else i
-                stage_ = i
-                local_order[stage_].append(ScheduledNode(
-                    type=it + "POST_VALIDATION",
-                    chunk=0,
-                    stage=stage_,
-                    minibatch=0,
-                    start_time=post_validation_time,
-                    completion_time=post_validation_time,
-                ))
-                comm_id[local_order[stage_][-1]] = comm_id_counter  # every post-validation node gets its own id
-                comm_id_counter += 1
-        for i in range(self.n_stage):
-            for _cat_, _chunk_, _micro_ in schedule[i]:
-                complete_time = end_time[self.get_id(_cat_, _chunk_, i, _micro_)]
-                local_order[i].append(ScheduledNode(
-                    type="FBW"[_cat_],
-                    chunk=_chunk_ if _cat_ == 0 else 1 - _chunk_,  # chunk index is flipped for B and W nodes
-                    stage=i,
-                    minibatch=_micro_,
-                    start_time=complete_time - self.fbw_cost[_cat_],
-                    completion_time=complete_time,
-                ))
-                if _cat_ == 2: # no communication for W
-                    continue
-                cat_str = "FORWARD" if _cat_ == 0 else "BACKWARD"
-                def communicate(send_recv, stage_):  # append a zero-duration SEND_/RECV_ node at complete_time to stage_; reads the enclosing loop variables at call time
-                   # noinspection PyTypeChecker
-                    local_order[stage_].append(ScheduledNode(
-                        type=send_recv + cat_str,
-                        chunk=_chunk_ if _cat_ == 0 else 1 - _chunk_,
-                        stage=stage_,
-                        minibatch=_micro_,
-                        start_time=complete_time,
-                        completion_time=complete_time,
-                    ))
-                    comm_id[local_order[stage_][-1]] = comm_id_counter  # the SEND and its matching RECV share one id
-                if _chunk_ == 1 and i > 0:  # schedule chunk 1 sends to the previous stage
-                    communicate("SEND_", i)
-                    communicate("RECV_", i - 1)
-                if _chunk_ == 0 and i < self.n_stage - 1:  # schedule chunk 0 sends to the next stage
-                    communicate("SEND_", i)
-                    communicate("RECV_", i + 1)
-                comm_id_counter += 1  # advanced once per F/B node, whether or not a send/recv pair was emitted
-        for rank in range(self.n_stage):
-            # For nodes with the same timestamp on the same stage, communication will be prioritized.
-            def even_breaker(x: ScheduledNode):
-                # Compute nodes are always delayed.
-                if x.type in ['F', 'B', 'W']:
-                    return comm_id_counter  # larger than every id stored in comm_id
-                # For comm nodes, order by their unique comm id
-                return comm_id[x]
-            local_order[rank] = list(sorted(
-                local_order[rank],
-                key=lambda x: (x.start_time, even_breaker(x))
-            ))
-            # If a recv intersects with the previous computation, reorder them so that the recv
-            # is executed before the computation and hence can be overlapped.
-            for i in range(len(local_order[rank])):  # single forward pass; a swapped compute node is compared again with the next entry
-                if i > 0 and local_order[rank][i - 1].type in {'F', 'B', 'W'} and \
-                    local_order[rank][i].type.startswith('RECV') and \
-                    "POST_VALIDATION" not in local_order[rank][i].type and \
-                    local_order[rank][i].start_time <= local_order[rank][i - 1].completion_time:
-                    local_order[rank][i], local_order[rank][i - 1] = local_order[rank][i - 1], local_order[rank][i]
-        local_order_with_rollback = [[] for _ in range(self.n_stage)]
-        for rank in range(self.n_stage):
-            rollback_comm = set()  # micro-batch ids the previous stage sends forward before its POST_VALIDATION node
-            if rank > 0:
-                for node in local_order[rank - 1]:
-                    if node.type == "POST_VALIDATION":  # stop scanning at the previous stage's post-validation point
-                        break
-                    if node.type == "SEND_FORWARD":
-                        assert node.chunk == 0
-                        rollback_comm.add(node.minibatch)
-            for node in local_order[rank]:
-                if node.type == "RECV_FORWARD" and node.chunk == 0 and node.minibatch in rollback_comm:  # the receive matching one of those early sends
-                    rollback = True
-                    rollback_comm.remove(node.minibatch)
-                else:
-                    rollback = False
-                local_order_with_rollback[rank].append(ScheduledNode(
-                    type=node.type,
-                    chunk=node.chunk,
-                    stage=node.stage,
-                    minibatch=node.minibatch,
-                    start_time=node.start_time,
-                    completion_time=node.completion_time,
-                    rollback=rollback,
-                ))
-            assert len(rollback_comm) == 0  # every early send must have been matched by a receive on this stage
-            for node in local_order_with_rollback[rank]:
-                print(f"{node.type}-{node.minibatch}-{int(node.rollback)}", end=', ')
-            print()  # end the printed line for this stage
-        return local_order_with_rollback  # list indexed by stage; each entry is that stage's ordered node list
-if __name__ == '__main__':  # Script entry point: build a PipelineGraph for each setting below and print its V schedule.
-    settings = [
-        # p,   n,     f,     b,     w,   c,    h,  a,  l
-        # (8, 24, 18522, 18086, 9337, 601, 2304, 24, 24),
-        # (8, 32, 18513, 18086, 9331, 626, 2304, 24, 24),
-        # (8, 64, 18546, 18097, 9321, 762, 2304, 24, 24),
-        # (8, 24, 29718, 29444, 19927, 527, 4096, 32, 32),
-        # (8, 32, 29802, 29428, 19530, 577, 4096, 32, 32),
-        # (8, 64, 29935, 29621, 19388, 535, 4096, 32, 32),
-        # (16, 48, 11347, 11248, 8132, 377, 5120, 40, 48),
-        # (16, 64, 11307, 11254, 8101, 379, 5120, 40, 48),
-        # (16, 128, 11325, 11308, 8109, 378, 5120, 40, 48),
-        # (32, 96, 10419, 10207, 7715, 408, 6144, 48, 64),
-        # (32, 128, 10408, 10204, 7703, 408, 6144, 48, 64),
-        # (32, 256, 10402, 10248, 7698, 460, 6144, 48, 64),
-        # (4, 8, 6, 4, 4, 1, 4096, 32, 32),
-        # (8, 24, 29444, 29718, 19927, 527, 4096, 32, 32),
-        # ( 8, 32, 16099, 16504,  7589,  540, 2304, 24, 16),
-        (16, 48, 14407, 14380,  9676, 1610, 4096, 32, 32),
-        (16, 64, 14412, 14393,  9688, 1621, 4096, 32, 32),
-        (16, 128,14316, 14306,  9639, 1619, 4096, 32, 32),
-        (24, 72,  6763,  6969,  5251,  755, 5120, 40, 48),
-        (24, 96,  6783,  6984,  5259,  758, 5120, 40, 48),
-        (24, 192, 6785,  6990,  5260,  770, 5120, 40, 48),
-        (32,  96, 9458,  9748,  7288,  879, 6144, 48, 64),
-        (32, 128, 9469,  9744,  7306,  892, 6144, 48, 64),
-        (32, 256, 9447,  9644,  7193,  887, 6144, 48, 64),
-    ]
-    s = 1024  # presumably the sequence length used in the memory estimate -- TODO confirm
-    # h, a, s = 4096, 32, 1024
-    # cost_f, cost_b, cost_w, cost_c = 29718, 29444, 19927, 527
-    for p, n, f, b, w, c, h, a, _ in settings:  # the ninth column (l) is ignored
-        mem_f = 34 * h + 5 * a * s  # memory delta passed as f_mem
-        mem_w = - 32 * h  # negative delta passed as w_mem
-        mem_b = - mem_w - mem_f  # chosen so that mem_f + mem_b + mem_w == 0
-        for m_offset in range(p + 1):
-            graph = PipelineGraph(
-                n_stage=p,
-                n_micro=n,
-                f_cost=f,
-                b_cost=b,
-                w_cost=w,
-                c_cost=c,
-                f_mem=mem_f,
-                b_mem=mem_b,
-                w_mem=mem_w,
-                max_mem=mem_f * (p * 2 + m_offset),  # memory limit expressed in multiples of mem_f
-            )
-            graph.get_v_schedule()
-            break  # only m_offset == 0 is ever evaluated