Zhu-FaceOnLive commited on
Commit
81efcf0
·
1 Parent(s): 675ebfd

Upload 72 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +13 -0
  2. openvino/cache.json +0 -0
  3. openvino/libgna.so +3 -0
  4. openvino/libgna.so.2 +3 -0
  5. openvino/libgna.so.3.0.0.1455 +3 -0
  6. openvino/libopenvino.so +3 -0
  7. openvino/libopenvino_auto_batch_plugin.so +0 -0
  8. openvino/libopenvino_auto_plugin.so +0 -0
  9. openvino/libopenvino_c.so +0 -0
  10. openvino/libopenvino_gapi_preproc.so +3 -0
  11. openvino/libopenvino_hetero_plugin.so +0 -0
  12. openvino/libopenvino_intel_cpu_plugin.so +3 -0
  13. openvino/libopenvino_intel_gna_plugin.so +3 -0
  14. openvino/libopenvino_intel_hddl_plugin.so +3 -0
  15. openvino/libopenvino_intel_myriad_plugin.so +3 -0
  16. openvino/libopenvino_ir_frontend.so +0 -0
  17. openvino/libopenvino_onnx_frontend.so +3 -0
  18. openvino/libopenvino_paddle_frontend.so +0 -0
  19. openvino/libopenvino_tensorflow_fe.so +3 -0
  20. openvino/pcie-ma2x8x.mvcmd +3 -0
  21. openvino/plugins.xml +27 -0
  22. openvino/usb-ma2x8x.mvcmd +3 -0
  23. openvino/vpu_custom_kernels/binarization.bin +3 -0
  24. openvino/vpu_custom_kernels/binarization.cl +67 -0
  25. openvino/vpu_custom_kernels/binary_convolution.bin +3 -0
  26. openvino/vpu_custom_kernels/binary_convolution.cl +95 -0
  27. openvino/vpu_custom_kernels/binary_convolution1x1.bin +3 -0
  28. openvino/vpu_custom_kernels/binary_convolution1x1.cl +117 -0
  29. openvino/vpu_custom_kernels/binary_convolution3x3.bin +3 -0
  30. openvino/vpu_custom_kernels/binary_convolution3x3.cl +278 -0
  31. openvino/vpu_custom_kernels/convolution1x1_chw.bin +3 -0
  32. openvino/vpu_custom_kernels/convolution1x1_chw.cl +114 -0
  33. openvino/vpu_custom_kernels/convolution1x1_hwc.bin +3 -0
  34. openvino/vpu_custom_kernels/convolution1x1_hwc.cl +126 -0
  35. openvino/vpu_custom_kernels/convolution3x3.bin +3 -0
  36. openvino/vpu_custom_kernels/convolution3x3.cl +158 -0
  37. openvino/vpu_custom_kernels/correlate.bin +3 -0
  38. openvino/vpu_custom_kernels/correlate.cl +453 -0
  39. openvino/vpu_custom_kernels/ctc.bin +3 -0
  40. openvino/vpu_custom_kernels/ctc.cl +94 -0
  41. openvino/vpu_custom_kernels/customLayerBindings.xml +507 -0
  42. openvino/vpu_custom_kernels/cvtf32f16.bin +3 -0
  43. openvino/vpu_custom_kernels/cvtf32f16.cl +17 -0
  44. openvino/vpu_custom_kernels/cvtu8f16.bin +3 -0
  45. openvino/vpu_custom_kernels/cvtu8f16.cl +48 -0
  46. openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin +3 -0
  47. openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl +65 -0
  48. openvino/vpu_custom_kernels/fakequantize.bin +3 -0
  49. openvino/vpu_custom_kernels/fakequantize.cl +111 -0
  50. openvino/vpu_custom_kernels/grn.bin +3 -0
.gitattributes CHANGED
@@ -34,3 +34,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  facewrapper/libs/libttvfaceengine6.so filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  facewrapper/libs/libttvfaceengine6.so filter=lfs diff=lfs merge=lfs -text
37
+ openvino/libgna.so filter=lfs diff=lfs merge=lfs -text
38
+ openvino/libgna.so.2 filter=lfs diff=lfs merge=lfs -text
39
+ openvino/libgna.so.3.0.0.1455 filter=lfs diff=lfs merge=lfs -text
40
+ openvino/libopenvino_gapi_preproc.so filter=lfs diff=lfs merge=lfs -text
41
+ openvino/libopenvino_intel_cpu_plugin.so filter=lfs diff=lfs merge=lfs -text
42
+ openvino/libopenvino_intel_gna_plugin.so filter=lfs diff=lfs merge=lfs -text
43
+ openvino/libopenvino_intel_hddl_plugin.so filter=lfs diff=lfs merge=lfs -text
44
+ openvino/libopenvino_intel_myriad_plugin.so filter=lfs diff=lfs merge=lfs -text
45
+ openvino/libopenvino_onnx_frontend.so filter=lfs diff=lfs merge=lfs -text
46
+ openvino/libopenvino_tensorflow_fe.so filter=lfs diff=lfs merge=lfs -text
47
+ openvino/libopenvino.so filter=lfs diff=lfs merge=lfs -text
48
+ openvino/pcie-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
49
+ openvino/usb-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
openvino/cache.json ADDED
The diff for this file is too large to render. See raw diff
 
openvino/libgna.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3
+ size 3120536
openvino/libgna.so.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3
+ size 3120536
openvino/libgna.so.3.0.0.1455 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3
+ size 3120536
openvino/libopenvino.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46
3
+ size 13495416
openvino/libopenvino_auto_batch_plugin.so ADDED
Binary file (391 kB). View file
 
openvino/libopenvino_auto_plugin.so ADDED
Binary file (371 kB). View file
 
openvino/libopenvino_c.so ADDED
Binary file (305 kB). View file
 
openvino/libopenvino_gapi_preproc.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0
3
+ size 1312920
openvino/libopenvino_hetero_plugin.so ADDED
Binary file (367 kB). View file
 
openvino/libopenvino_intel_cpu_plugin.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18
3
+ size 33299880
openvino/libopenvino_intel_gna_plugin.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab
3
+ size 4067016
openvino/libopenvino_intel_hddl_plugin.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b
3
+ size 5894680
openvino/libopenvino_intel_myriad_plugin.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de
3
+ size 6120168
openvino/libopenvino_ir_frontend.so ADDED
Binary file (343 kB). View file
 
openvino/libopenvino_onnx_frontend.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441
3
+ size 3781640
openvino/libopenvino_paddle_frontend.so ADDED
Binary file (987 kB). View file
 
openvino/libopenvino_tensorflow_fe.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9
3
+ size 2723864
openvino/pcie-ma2x8x.mvcmd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d
3
+ size 2099248
openvino/plugins.xml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <ie>
2
+ <plugins>
3
+ <plugin name="AUTO" location="libopenvino_auto_plugin.so">
4
+ <properties>
5
+ <property key="MULTI_WORK_MODE_AS_AUTO" value="YES"/>
6
+ </properties>
7
+ </plugin>
8
+ <plugin name="BATCH" location="libopenvino_auto_batch_plugin.so">
9
+ </plugin>
10
+ <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
11
+ </plugin>
12
+ <plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
13
+ </plugin>
14
+ <plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
15
+ </plugin>
16
+ <plugin name="HETERO" location="libopenvino_hetero_plugin.so">
17
+ </plugin>
18
+ <plugin name="MULTI" location="libopenvino_auto_plugin.so">
19
+ </plugin>
20
+ <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
21
+ </plugin>
22
+ <plugin name="HDDL" location="libopenvino_intel_hddl_plugin.so">
23
+ </plugin>
24
+ <plugin name="VPUX" location="libopenvino_intel_vpux_plugin.so">
25
+ </plugin>
26
+ </plugins>
27
+ </ie>
openvino/usb-ma2x8x.mvcmd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e
3
+ size 2298632
openvino/vpu_custom_kernels/binarization.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac
3
+ size 19632
openvino/vpu_custom_kernels/binarization.cl ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void binarization(
9
+ const __global half *__restrict src_data,
10
+ const __global half *__restrict input_low_high,
11
+ const __global half *__restrict dst_data,
12
+ int switch_out,
13
+ int input_low_high_size,
14
+ int W,
15
+ int H)
16
+ {
17
+ __local half local_src[15 * 1024];
18
+ __local half local_dst[15 * 1024];
19
+
20
+ event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
21
+ wait_group_events(1, &e1);
22
+
23
+ int c = get_global_id(2);
24
+ int C = get_global_size(2);
25
+
26
+ half dst_low = switch_out ? 1.h : -1.h;
27
+ half dst_high = switch_out ? -1.h : 1.h;
28
+
29
+ half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
30
+
31
+ for (int h = 0; h < H; h++) {
32
+
33
+ __local const half *__restrict addr_src = local_src + h * W;
34
+ __local half *__restrict addr_dst = local_dst + h * W;
35
+
36
+ #if 1
37
+ for (int w = 0; w < W / 8; w++) {
38
+
39
+ half8 h_src_val8 = (*((__local half8 *)addr_src + w));
40
+
41
+ short8 cond1;
42
+ cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
43
+ cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
44
+ cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
45
+ cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
46
+ cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
47
+ cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
48
+ cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
49
+ cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
50
+
51
+ cond1 = ~(cond1 - (short8)1);
52
+
53
+ short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
54
+
55
+ *((__local half8 *)addr_dst + w) = as_half8(res);
56
+ }
57
+ #endif
58
+ for (int w = W & (~0x7); w < W; w++) {
59
+ addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
60
+ }
61
+ }
62
+
63
+ barrier(CLK_LOCAL_MEM_FENCE);
64
+
65
+ event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
66
+ wait_group_events(1, &e2);
67
+ }
openvino/vpu_custom_kernels/binary_convolution.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a
3
+ size 10764
openvino/vpu_custom_kernels/binary_convolution.cl ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+
7
+ int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
8
+
9
+ __kernel void binary_convolution(
10
+ const __global half *restrict src_data,
11
+ const __global uchar *restrict weights_data,
12
+ __global half *restrict dst_data,
13
+ float pad_value,
14
+
15
+ int IW,
16
+ int IH,
17
+ int IC,
18
+
19
+ int DW,
20
+ int DH,
21
+
22
+ int GC,
23
+
24
+ int KW,
25
+ int KH,
26
+
27
+ int PW,
28
+ int PH,
29
+
30
+ int SW,
31
+ int SH)
32
+ {
33
+ int ipad_value = ((pad_value > 0.f) ? 1 : 0);
34
+ int c = get_global_id(2);
35
+ int y = get_global_id(1);
36
+ int x = get_global_id(0);
37
+
38
+ int OC = get_global_size(2);
39
+ int OH = get_global_size(1);
40
+ int OW = get_global_size(0);
41
+
42
+ int KD = 1;
43
+ int SD = 0;
44
+ int DD = 0;
45
+ int PD = 0;
46
+ int ID = 1;
47
+ int OD = 1;
48
+
49
+ int nbits = 8;
50
+
51
+ int g = c % GC;
52
+ int oc = c / GC;
53
+ int oh = y;
54
+ int ow = x;
55
+
56
+ for (int od = 0; od < OD; od++) {
57
+ int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;
58
+
59
+ int res = 0;
60
+
61
+ for (int ic = 0; ic < IC / GC; ic++) {
62
+ for (int kd = 0; kd < KD; kd++) {
63
+ for (int kh = 0; kh < KH; kh++) {
64
+ for (int kw = 0; kw < KW; kw++) {
65
+ int widx = g * OC / GC * IC / GC * KD * KH * KW
66
+ + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
67
+ + kh * KW + kw;
68
+
69
+ int w = extract_weights(weights_data[widx / nbits], (widx % nbits));
70
+
71
+ int s;
72
+
73
+ int iw = ow * SW - PW + kw * DW;
74
+ int ih = oh * SH - PH + kh * DH;
75
+ int id = od * SD - PD + kd * DD;
76
+
77
+ if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
78
+ || id >= (int)ID) {
79
+ s = ipad_value;
80
+ } else {
81
+ int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
82
+ + ih * IW + iw;
83
+
84
+ s = ((src_data[iidx] > 0.f) ? 1 : 0);
85
+ }
86
+
87
+ res += s ^ w;
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res);
94
+ }
95
+ }
openvino/vpu_custom_kernels/binary_convolution1x1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945
3
+ size 76316
openvino/vpu_custom_kernels/binary_convolution1x1.cl ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
9
+
10
+ __kernel void binary_convolution(
11
+ const __global half *restrict src_data,
12
+ const __global uchar *restrict weights_data,
13
+ __global half *restrict dst_data,
14
+ float pad_value,
15
+
16
+ int IW,
17
+ int IH,
18
+ int IC,
19
+
20
+ int DW,
21
+ int DH,
22
+
23
+ int GC,
24
+
25
+ int KW,
26
+ int KH,
27
+
28
+ int PW,
29
+ int PH,
30
+
31
+ int SW,
32
+ int SH,
33
+
34
+ int OW)
35
+ {
36
+ __local half src_local[32 * 1024];
37
+ __local half dst_local[2 * 1024];
38
+
39
+ const int oh = get_group_id(0);
40
+ const int oc = get_group_id(1);
41
+ const int OH = get_global_size(0);
42
+ const int OC = get_global_size(1);
43
+
44
+ const int gc = oc / (OC / GC);
45
+
46
+ if (oh * SH >= 0 && oh * SH <= IH - 1) {
47
+ const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
48
+
49
+ event_t e1 = async_work_group_copy_2D2D(
50
+ src_local, // dst
51
+ src, // src
52
+ IW, // num_elements_per_line,
53
+ IC / GC, // num_lines,
54
+ IH * IW - IW, // src_line_stride,
55
+ 0, // dst_line_stride,
56
+ 0);
57
+ wait_group_events(1, &e1);
58
+ }
59
+
60
+ half pad_value_half = convert_half(pad_value);
61
+
62
+ //padding row
63
+ if (oh * SH > IH - 1) {
64
+ __local half *dst = src_local;
65
+ for (int c = 0; c < IC / GC; c++) {
66
+ #pragma unroll 8
67
+ for (int j = 0; j < IW; j++) {
68
+ dst[j] = pad_value_half;
69
+ }
70
+ dst += IW;
71
+ }
72
+ }
73
+
74
+ int OWS = SW * OW;
75
+ ushort8 in;
76
+
77
+ for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
78
+ ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
79
+ for (int ic = 0; ic < IC / GC; ++ic) {
80
+ __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
81
+ int weight_pos = oc * IC / GC + ic;
82
+ ushort w =
83
+ extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
84
+
85
+ if ((ows8 * 8) <= IW - 1) {
86
+ in = *((__local ushort8 *)(src));
87
+ }
88
+
89
+ //padding column
90
+ if (ows8 * 8 + 7 > IW - 1) {
91
+ int boundary = (IW - 1) - ows8 * 8 + 1;
92
+ boundary = boundary < 0 ? 0 : boundary;
93
+ for (int offset = boundary; offset < 8; offset++) {
94
+ *((half *)(&in) + offset) = pad_value_half;
95
+ }
96
+ }
97
+
98
+ ushort8 w8 = (ushort8)(w);
99
+
100
+ ushort8 cond =
101
+ (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
102
+
103
+ val += (cond ^ w8);
104
+ }
105
+
106
+ ushort8 val_shift = val << 1;
107
+ int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
108
+ for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
109
+ *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
110
+ }
111
+ }
112
+
113
+ barrier(CLK_LOCAL_MEM_FENCE);
114
+
115
+ event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
116
+ wait_group_events(1, &e2);
117
+ }
openvino/vpu_custom_kernels/binary_convolution3x3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb
3
+ size 104136
openvino/vpu_custom_kernels/binary_convolution3x3.cl ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
9
+
10
+ __kernel void binary_convolution(
11
+ const __global half *restrict src_data,
12
+ const __global uchar *restrict weights_data,
13
+ const __global half *restrict dst_data,
14
+ float pad_value,
15
+
16
+ int IW,
17
+ int IH,
18
+ int IC,
19
+
20
+ int DW,
21
+ int DH,
22
+
23
+ int GC,
24
+
25
+ int KW,
26
+ int KH,
27
+
28
+ int PW,
29
+ int PH,
30
+
31
+ int SW,
32
+ int SH,
33
+
34
+ int OW)
35
+ {
36
+ __local half src_local[32 * 1024];
37
+ __local half dst_local[2 * 1024];
38
+
39
+ const int oh = get_group_id(0);
40
+ const int oc = get_group_id(1);
41
+ const int OH = get_global_size(0);
42
+ const int OC = get_global_size(1);
43
+
44
+ const int gc = oc / (OC / GC);
45
+
46
+ if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
47
+ {
48
+ event_t e = async_work_group_copy_3D3D(
49
+ src_local, // dst
50
+ src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
51
+ IW, // num_elements_per_line
52
+ 3, // num_lines
53
+ DH * IW - IW, // src_line_stride
54
+ 0, // dst_line_stride
55
+ IC / GC, // num planes
56
+ IH * IW - 3 * DH * IW, // src plane stride
57
+ 0, // dst plane stride
58
+ 0);
59
+ wait_group_events(1, &e);
60
+ } else {
61
+ int ih = oh * SH - 1;
62
+ if (ih >= 0 && ih <= IH - 1) //dma for first row
63
+ {
64
+ event_t e = async_work_group_copy_2D2D(
65
+ src_local, // dst
66
+ src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
67
+ IW, // num_elements_per_line,
68
+ IC / GC, // num_lines,
69
+ IH * IW - IW, // src_line_stride,
70
+ 2 * IW, // dst_line_stride,
71
+ 0);
72
+
73
+ wait_group_events(1, &e);
74
+ }
75
+ ih = oh * SH - 1 + DH;
76
+ if (ih >= 0 && ih <= IH - 1) //dma for second row
77
+ {
78
+ event_t e = async_work_group_copy_2D2D(
79
+ src_local + IW, // dst
80
+ src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
81
+ IW, // num_elements_per_line,
82
+ IC / GC, // num_lines,
83
+ IH * IW - IW, // src_line_stride,
84
+ 2 * IW, // dst_line_stride,
85
+ 0);
86
+ wait_group_events(1, &e);
87
+ }
88
+ ih = oh * SH - 1 + 2 * DH;
89
+ if (ih >= 0 && ih <= IH - 1) //dma for third row
90
+ {
91
+ event_t e = async_work_group_copy_2D2D(
92
+ src_local + 2 * IW, // dst
93
+ src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
94
+ IW, // num_elements_per_line,
95
+ IC / GC, // num_lines,
96
+ IH * IW - IW, // src_line_stride,
97
+ 2 * IW, // dst_line_stride,
98
+ 0);
99
+ wait_group_events(1, &e);
100
+ }
101
+ }
102
+
103
+ half pad_value_half = convert_half(pad_value);
104
+
105
+ //padding row
106
+ if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
107
+ __local half *dst = src_local;
108
+ for (int c = 0; c < IC / GC; c++) {
109
+ #pragma unroll 8
110
+ for (int j = 0; j < IW; j++) {
111
+ dst[j] = pad_value_half;
112
+ }
113
+ dst += 3 * IW;
114
+ }
115
+ }
116
+ if (oh * SH + DH - 1 > IH - 1) {
117
+ __local half *dst = src_local + IW;
118
+ for (int c = 0; c < IC / GC; c++) {
119
+ #pragma unroll 8
120
+ for (int j = 0; j < IW; j++) {
121
+ dst[j] = pad_value_half;
122
+ }
123
+ dst += 3 * IW;
124
+ }
125
+ }
126
+ if (oh * SH + DH + DH - 1 > IH - 1) {
127
+ __local half *dst = src_local + 2 * IW;
128
+ for (int c = 0; c < IC / GC; c++) {
129
+ #pragma unroll 8
130
+ for (int j = 0; j < IW; j++) {
131
+ dst[j] = pad_value_half;
132
+ }
133
+ dst += 3 * IW;
134
+ }
135
+ }
136
+
137
+ int OWS = SW * OW;
138
+
139
+ ushort8 in00;
140
+ ushort8 in01;
141
+ ushort8 in02;
142
+ ushort8 in10;
143
+ ushort8 in11;
144
+ ushort8 in12;
145
+ ushort8 in20;
146
+ ushort8 in21;
147
+ ushort8 in22;
148
+
149
+ for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
150
+ ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
151
+ for (int ic = 0; ic < IC / GC; ++ic) {
152
+ __local half *src =
153
+ (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
154
+ int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
155
+ ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
156
+ ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
157
+ ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
158
+ ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8));
159
+ ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8));
160
+ ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8));
161
+ ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8));
162
+ ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
163
+ ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));
164
+
165
+ if ((ows8 * 8) - 1 <= IW - 1) {
166
+ in00 = *((__local ushort8 *)(src - IW - DW));
167
+ in01 = *((__local ushort8 *)(src - IW));
168
+ in02 = *((__local ushort8 *)(src - IW + DW));
169
+
170
+ in10 = *((__local ushort8 *)(src - DW));
171
+ in11 = *((__local ushort8 *)(src));
172
+ in12 = *((__local ushort8 *)(src + DW));
173
+
174
+ in20 = *((__local ushort8 *)(src + IW - DW));
175
+ in21 = *((__local ushort8 *)(src + IW));
176
+ in22 = *((__local ushort8 *)(src + IW + DW));
177
+ }
178
+
179
+ //padding column
180
+ if (ows8 * 8 - 1 < 0) {
181
+ int boundary = 1 - ows8 * 8;
182
+ boundary = boundary > 8 ? 8 : boundary;
183
+ for (int offset = 0; offset < boundary; offset++) {
184
+ *((half *)(&in00) + offset) = pad_value_half;
185
+ *((half *)(&in10) + offset) = pad_value_half;
186
+ *((half *)(&in20) + offset) = pad_value_half;
187
+ }
188
+ }
189
+ if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
190
+ int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
191
+ boundary = boundary < 0 ? 0 : boundary;
192
+ for (int offset = boundary; offset < 8; offset++) {
193
+ *((half *)(&in02) + offset) = pad_value_half;
194
+ *((half *)(&in12) + offset) = pad_value_half;
195
+ *((half *)(&in22) + offset) = pad_value_half;
196
+ }
197
+ }
198
+ if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
199
+ int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
200
+ boundary = boundary < 0 ? 0 : boundary;
201
+ for (int offset = boundary; offset < 8; offset++) {
202
+ *((half *)(&in01) + offset) = pad_value_half;
203
+ *((half *)(&in11) + offset) = pad_value_half;
204
+ *((half *)(&in21) + offset) = pad_value_half;
205
+ }
206
+ }
207
+ if ((ows8 * 8 + 7) - 1 > IW - 1) {
208
+ int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
209
+ boundary = boundary < 0 ? 0 : boundary;
210
+ for (int offset = boundary; offset < 8; offset++) {
211
+ *((half *)(&in00) + offset) = pad_value_half;
212
+ *((half *)(&in10) + offset) = pad_value_half;
213
+ *((half *)(&in20) + offset) = pad_value_half;
214
+ }
215
+ }
216
+
217
+ ushort8 w00 = (ushort8)(w0);
218
+ ushort8 w01 = (ushort8)(w1);
219
+ ushort8 w02 = (ushort8)(w2);
220
+ ushort8 w10 = (ushort8)(w3);
221
+ ushort8 w11 = (ushort8)(w4);
222
+ ushort8 w12 = (ushort8)(w5);
223
+ ushort8 w20 = (ushort8)(w6);
224
+ ushort8 w21 = (ushort8)(w7);
225
+ ushort8 w22 = (ushort8)(w8);
226
+
227
+ ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
228
+ (ushort8)(1) :
229
+ (ushort8)(0);
230
+ ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
231
+ (ushort8)(1) :
232
+ (ushort8)(0);
233
+ ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
234
+ (ushort8)(1) :
235
+ (ushort8)(0);
236
+ ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
237
+ (ushort8)(1) :
238
+ (ushort8)(0);
239
+ ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
240
+ (ushort8)(1) :
241
+ (ushort8)(0);
242
+ ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
243
+ (ushort8)(1) :
244
+ (ushort8)(0);
245
+ ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
246
+ (ushort8)(1) :
247
+ (ushort8)(0);
248
+ ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
249
+ (ushort8)(1) :
250
+ (ushort8)(0);
251
+ ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
252
+ (ushort8)(1) :
253
+ (ushort8)(0);
254
+
255
+ val += (cond0 ^ w00);
256
+ val += (cond1 ^ w01);
257
+ val += (cond2 ^ w02);
258
+ val += (cond3 ^ w10);
259
+ val += (cond4 ^ w11);
260
+ val += (cond5 ^ w12);
261
+ val += (cond6 ^ w20);
262
+ val += (cond7 ^ w21);
263
+ val += (cond8 ^ w22);
264
+ }
265
+
266
+ ushort8 val_shift = val << 1;
267
+ int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
268
+ for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
269
+ *(dst_local + ow) =
270
+ (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
271
+ }
272
+ }
273
+
274
+ barrier(CLK_LOCAL_MEM_FENCE);
275
+
276
+ event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
277
+ wait_group_events(1, &e2);
278
+ }
openvino/vpu_custom_kernels/convolution1x1_chw.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f
3
+ size 9220
openvino/vpu_custom_kernels/convolution1x1_chw.cl ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void Convolution1x1_NCHW(
9
+ const __global half *in,
10
+ const __global half *out,
11
+ const __global half *w,
12
+ int IW,
13
+ int IH,
14
+ int IC,
15
+ int OW,
16
+ int OH,
17
+ int OC)
18
+ {
19
+ __local half in_local[8 * 1024];
20
+ __local half out_local[8 * 1024];
21
+
22
+ event_t e1 = async_work_group_copy_2D2D(
23
+ in_local, // dst
24
+ in + get_group_id(0) * IW, // src
25
+ IW, // num_elements_per_line,
26
+ IC, // num_lines,
27
+ IW * IH - IW, // src_line_stride,
28
+ 0, // dst_line_stride,
29
+ 0);
30
+ wait_group_events(1, &e1);
31
+
32
+ int oh = get_global_id(0);
33
+ int oc = get_global_id(1);
34
+
35
+ int stride;
36
+ int write_output = 0;
37
+ __global half *src;
38
+
39
+ __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
40
+ __global half *w1 = (__global half *)(&w[oc * IC]);
41
+
42
+ for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
43
+ uint iw = ow;
44
+ uint ih = oh;
45
+
46
+ half8 val8_0 = 0.0f;
47
+
48
+ __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
49
+ __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
50
+ __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
51
+ __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
52
+ __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
53
+ __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
54
+ __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
55
+ __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
56
+
57
+ for (uint ic = 0; ic < IC / 8; ic++) {
58
+ val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
59
+ val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
60
+ val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
61
+ val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
62
+ val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
63
+ val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
64
+ val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
65
+ val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
66
+ }
67
+
68
+ for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
69
+ val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
70
+ }
71
+ *((__local half8 *)&out_local[ow + 0]) = (val8_0);
72
+ }
73
+
74
+ uint iw = (OW & (~0x7));
75
+ uint ih = oh;
76
+
77
+ half8 val8_0 = 0.0f;
78
+
79
+ __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
80
+ __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
81
+ __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
82
+ __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
83
+ __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
84
+ __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
85
+ __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
86
+ __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
87
+
88
+ for (uint ic = 0; ic < IC / 8; ic++) {
89
+ val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
90
+ val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
91
+ val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
92
+ val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
93
+ val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
94
+ val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
95
+ val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
96
+ val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
97
+ }
98
+
99
+ for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
100
+ val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
101
+ }
102
+ for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
103
+ out_local[ow + 0] = (val8_0[ow % 8]);
104
+ }
105
+
106
+ barrier(CLK_LOCAL_MEM_FENCE);
107
+
108
+ event_t e2 = async_work_group_copy(
109
+ out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
110
+ out_local,
111
+ OW,
112
+ 0);
113
+ wait_group_events(1, &e2);
114
+ }
openvino/vpu_custom_kernels/convolution1x1_hwc.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c
3
+ size 4304
openvino/vpu_custom_kernels/convolution1x1_hwc.cl ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void Convolution1x1_NHWC(
9
+ const __global half *in,
10
+ const __global half *out,
11
+ const __global half *w,
12
+ int IW,
13
+ int IH,
14
+ int IC,
15
+ int OW,
16
+ int OH,
17
+ int OC)
18
+ {
19
+
20
+ __local half in_local[8 * 1024];
21
+ __local half out_local[8 * 1024];
22
+
23
+ const int sizeAct = IW * IC;
24
+
25
+ event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
26
+ wait_group_events(1, &e1);
27
+
28
+ int oh = get_global_id(0);
29
+ int oc = get_global_id(1);
30
+
31
+ int stride;
32
+ int write_output = 0;
33
+ __global half *src;
34
+
35
+ __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
36
+ __global half *w1 = (__global half *)(&w[oc * IC]);
37
+
38
+ for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
39
+ uint iw = ow;
40
+ uint ih = oh;
41
+
42
+ half8 val8_0 = 0.0f;
43
+ half8 val8_1 = 0.0f;
44
+ half8 val8_2 = 0.0f;
45
+ half8 val8_3 = 0.0f;
46
+ half8 val8_4 = 0.0f;
47
+ half8 val8_5 = 0.0f;
48
+ half8 val8_6 = 0.0f;
49
+ half8 val8_7 = 0.0f;
50
+
51
+ __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
52
+ __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
53
+ __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
54
+ __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
55
+ __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
56
+ __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
57
+ __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
58
+ __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
59
+
60
+ for (uint ic = 0; ic < IC / 8; ++ic) {
61
+ val8_0 += (in8_0[ic]) * (w8[ic]);
62
+ val8_1 += (in8_1[ic]) * (w8[ic]);
63
+ val8_2 += (in8_2[ic]) * (w8[ic]);
64
+ val8_3 += (in8_3[ic]) * (w8[ic]);
65
+ val8_4 += (in8_4[ic]) * (w8[ic]);
66
+ val8_5 += (in8_5[ic]) * (w8[ic]);
67
+ val8_6 += (in8_6[ic]) * (w8[ic]);
68
+ val8_7 += (in8_7[ic]) * (w8[ic]);
69
+ }
70
+
71
+ half val_0 = 0.0f;
72
+ half val_1 = 0.0f;
73
+ half val_2 = 0.0f;
74
+ half val_3 = 0.0f;
75
+ half val_4 = 0.0f;
76
+ half val_5 = 0.0f;
77
+ half val_6 = 0.0f;
78
+ half val_7 = 0.0f;
79
+ for (uint ic = IC & (~0x7); ic < IC; ++ic) {
80
+ val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic));
81
+ val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic));
82
+ val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic));
83
+ val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic));
84
+ val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic));
85
+ val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic));
86
+ val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic));
87
+ val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic));
88
+ }
89
+ out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
90
+ out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
91
+ out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
92
+ out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
93
+ out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
94
+ out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
95
+ out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
96
+ out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
97
+ }
98
+ for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
99
+
100
+ uint iw = ow;
101
+ uint ih = oh;
102
+
103
+ half8 val8 = 0.0f;
104
+
105
+ __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
106
+
107
+ for (uint ic = 0; ic < IC / 8; ++ic) {
108
+ val8 += (in8[ic]) * (w8[ic]);
109
+ }
110
+
111
+ half val = 0.0f;
112
+ for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
113
+ val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic));
114
+ }
115
+ out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
116
+ }
117
+
118
+ barrier(CLK_LOCAL_MEM_FENCE);
119
+
120
+ event_t e2 = async_work_group_copy(
121
+ out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
122
+ out_local,
123
+ OW,
124
+ 0);
125
+ wait_group_events(1, &e2);
126
+ }
openvino/vpu_custom_kernels/convolution3x3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:021bb40840ff35506972e6f6a7dea1b5f40a8db0927aaa9a6c116b152e386851
3
+ size 5748
openvino/vpu_custom_kernels/convolution3x3.cl ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void Convolution3x3(
9
+ const __global half *in_param,
10
+ const __global half *out,
11
+ const __global half *w,
12
+ int IW,
13
+ int IH,
14
+ int IC,
15
+ int OW,
16
+ int OH,
17
+ int OC,
18
+ int KX,
19
+ int KY,
20
+ int stride_x,
21
+ int stride_y,
22
+ int pad_x,
23
+ int pad_y,
24
+ int dilation_x,
25
+ int dilation_y)
26
+ {
27
+ __local half in_local[8 * 1024];
28
+ __local half out_local[8 * 1024];
29
+ __local half w_local[8 * 1024];
30
+
31
+ const int sizePlane = IW * IH;
32
+ event_t e1 = async_work_group_copy_2D2D(
33
+ in_local, // dst
34
+ in_param + get_group_id(0) * stride_y * IW, // src
35
+ 3 * IW, // num_elements_per_line,
36
+ IC, // num_lines,
37
+ IW * IH - 3 * IW, // src_line_stride,
38
+ 0, // dst_line_stride,
39
+ 0);
40
+ wait_group_events(1, &e1);
41
+
42
+ const int sizeWeight = IC * 3 * 3;
43
+ e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
44
+ wait_group_events(1, &e1);
45
+
46
+ int oh = get_global_id(0);
47
+ int oc = get_global_id(1);
48
+
49
+ __local half *in = (__local half *)in_local + 1;
50
+
51
+ int stride;
52
+ int write_output = 0;
53
+ __local half *src;
54
+
55
+ if ((stride_x == 1) && (stride_y == 1)) {
56
+ stride = OW / 8;
57
+ write_output = 1;
58
+ }
59
+ if ((stride_x == 2) && (stride_y == 2)) {
60
+ stride = OW / 4;
61
+ write_output = 2;
62
+ }
63
+
64
+ for (int ow = 0; ow < stride; ow++) {
65
+ float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
66
+ for (int ic = 0; ic < IC; ++ic) {
67
+ src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
68
+ __local half *k = (__local half *)(w_local + ic * 3 * 3);
69
+
70
+ half8 aux_in00 = *((__local half8 *)src - 1);
71
+ half8 aux_in01 = *((__local half8 *)src + 0);
72
+ half8 aux_in02 = *((__local half8 *)src + 1);
73
+ half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
74
+ half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
75
+ half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
76
+ half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
77
+ half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
78
+ half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);
79
+
80
+ short8 in00 = *((short8 *)&aux_in00);
81
+ short8 in01 = *((short8 *)&aux_in01);
82
+ short8 in02 = *((short8 *)&aux_in02);
83
+ short8 in10 = *((short8 *)&aux_in10);
84
+ short8 in11 = *((short8 *)&aux_in11);
85
+ short8 in12 = *((short8 *)&aux_in12);
86
+ short8 in20 = *((short8 *)&aux_in20);
87
+ short8 in21 = *((short8 *)&aux_in21);
88
+ short8 in22 = *((short8 *)&aux_in22);
89
+
90
+ short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
91
+ short8 aux_aux01 = in01;
92
+ short8 aux_aux02 = __builtin_shave_cmu_alignvec_rri_short8(in01, in02, 2);
93
+ short8 aux_aux10 = __builtin_shave_cmu_alignvec_rri_short8(in10, in11, 14);
94
+ short8 aux_aux11 = in11;
95
+ short8 aux_aux12 = __builtin_shave_cmu_alignvec_rri_short8(in11, in12, 2);
96
+ short8 aux_aux20 = __builtin_shave_cmu_alignvec_rri_short8(in20, in21, 14);
97
+ short8 aux_aux21 = in21;
98
+ short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);
99
+
100
+ half8 aux00 = *((half8 *)&aux_aux00);
101
+ half8 aux01 = *((half8 *)&aux_aux01);
102
+ half8 aux02 = *((half8 *)&aux_aux02);
103
+ half8 aux10 = *((half8 *)&aux_aux10);
104
+ half8 aux11 = *((half8 *)&aux_aux11);
105
+ half8 aux12 = *((half8 *)&aux_aux12);
106
+ half8 aux20 = *((half8 *)&aux_aux20);
107
+ half8 aux21 = *((half8 *)&aux_aux21);
108
+ half8 aux22 = *((half8 *)&aux_aux22);
109
+
110
+ half8 w00 = (half8)(*(k + 0));
111
+ half8 w01 = (half8)(*(k + 1));
112
+ half8 w02 = (half8)(*(k + 2));
113
+ half8 w10 = (half8)(*(k + 3));
114
+ half8 w11 = (half8)(*(k + 4));
115
+ half8 w12 = (half8)(*(k + 5));
116
+ half8 w20 = (half8)(*(k + 6));
117
+ half8 w21 = (half8)(*(k + 7));
118
+ half8 w22 = (half8)(*(k + 8));
119
+
120
+ val += convert_float8(aux00) * convert_float8(w00);
121
+ val += convert_float8(aux01) * convert_float8(w01);
122
+ val += convert_float8(aux02) * convert_float8(w02);
123
+ val += convert_float8(aux10) * convert_float8(w10);
124
+ val += convert_float8(aux11) * convert_float8(w11);
125
+ val += convert_float8(aux12) * convert_float8(w12);
126
+ val += convert_float8(aux20) * convert_float8(w20);
127
+ val += convert_float8(aux21) * convert_float8(w21);
128
+ val += convert_float8(aux22) * convert_float8(w22);
129
+ }
130
+ if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246);
131
+ if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val);
132
+ }
133
+
134
+ for (int ow = OW & ~(0x7); ow < OW; ow++) {
135
+ float val = 0.0f;
136
+ for (int ic = 0; ic < IC; ++ic) {
137
+ for (int ky = 0; ky < 3; ++ky) {
138
+ for (int kx = 0; kx < 3; ++kx) {
139
+ int iw = ow * stride_x - pad_x + kx * dilation_x;
140
+ int ih = oh * stride_y - pad_y + ky * dilation_y;
141
+
142
+ val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
143
+ * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
144
+ }
145
+ }
146
+ }
147
+ out_local[ow] = convert_half(val);
148
+ }
149
+
150
+ barrier(CLK_LOCAL_MEM_FENCE);
151
+
152
+ event_t e2 = async_work_group_copy(
153
+ out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
154
+ out_local,
155
+ OW,
156
+ 0);
157
+ wait_group_events(1, &e2);
158
+ }
openvino/vpu_custom_kernels/correlate.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2b24b1b5bfd1786128682ee814230653b4b63aad5b472feec9c6f4a4c833e2f
3
+ size 14336
openvino/vpu_custom_kernels/correlate.cl ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+
7
+ #define MAX_OPENCL_BUFF_SIZE 64 * 1024
8
+
9
+ #define USE_DMA 1
10
+
11
+ #if defined(USE_DMA)
12
+ void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride)
13
+ {
14
+ item_dma_event_t copyEvent =
15
+ WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
16
+ WaitWorkItemDmaEvents(1, &copyEvent);
17
+ }
18
+
19
+ void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride)
20
+ {
21
+ item_dma_event_t copyEvent =
22
+ WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
23
+ WaitWorkItemDmaEvents(1, &copyEvent);
24
+ }
25
+ #endif
26
+
27
+ void memzero(void *ptr, size_t num)
28
+ {
29
+ float4 *line0_ = (float4 *)ptr;
30
+ #pragma unroll 16
31
+ for (int i = 0; i < num / 16; i++) {
32
+ line0_[i] = (float4){0.f, 0.f, 0.f, 0.f};
33
+ }
34
+ uchar *ptr_ = (uchar *)ptr;
35
+ for (int i = num / 16 * 16; i < num; i++) {
36
+ ptr_[i] = 0;
37
+ }
38
+ }
39
+
40
+ void __attribute__((noinline)) crosscorrh(
41
+ __private const half *restrict line0,
42
+ __private const half *restrict line1,
43
+ __private half *restrict dline,
44
+ int topwidth,
45
+ int max_displacement,
46
+ int neighborhood_grid_radius,
47
+ int kernel_size,
48
+ int padding,
49
+ int bottomwidth,
50
+ int stride1,
51
+ int stride2,
52
+ int max_channels,
53
+ int cur_subchannels)
54
+ {
55
+ if (max_channels == 64) {
56
+ for (int i = 0; i < kernel_size; i++) {
57
+ int x1 = max_displacement - padding + i;
58
+ int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1;
59
+ x1 += offset1 * stride1;
60
+
61
+ for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) {
62
+ int x2 = x1 - neighborhood_grid_radius * stride2;
63
+ int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2;
64
+ x2 += offset2 * stride2;
65
+
66
+ for (int top_channel_x = offset2 - neighborhood_grid_radius;
67
+ top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth;
68
+ top_channel_x++, x2 += stride2) {
69
+ half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
70
+
71
+ half8 *src0 = (half8 *)(line0 + x1 * max_channels);
72
+ half8 *src1 = (half8 *)(line1 + x2 * max_channels);
73
+
74
+ #pragma unroll 8
75
+ for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]);
76
+
77
+ half sum = __builtin_shave_sau_sumx_f16_r(sum4);
78
+ dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum);
79
+ }
80
+ }
81
+ }
82
+ } else {
83
+ int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;
84
+
85
+ for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) {
86
+ for (int i = 0; i < kernel_size; i++) {
87
+ int x1 = blockIdx_x * stride1 + max_displacement + i - padding;
88
+
89
+ if ((x1 >= 0) && (x1 < bottomwidth)) {
90
+ int o_min = -neighborhood_grid_radius * stride2;
91
+ int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2;
92
+ if ((o_min) < (-x1)) {
93
+ o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
94
+ }
95
+ if ((o_max) >= (bottomwidth + stride2 - x1)) {
96
+ o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
97
+ }
98
+
99
+ int o = o_min;
100
+ for (; o <= o_max - 4 * stride2; o += 4 * stride2) {
101
+ half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
102
+ half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels);
103
+ half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels);
104
+ half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels);
105
+ half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels);
106
+
107
+ int c = 0;
108
+
109
+ half8 sum40 = 0;
110
+ half8 sum41 = 0;
111
+ half8 sum42 = 0;
112
+ half8 sum43 = 0;
113
+
114
+ for (; c <= cur_subchannels / 8 - 4; c += 4) {
115
+ sum40 += bottom0[c + 0] * bottom1_0[c + 0];
116
+ sum40 += bottom0[c + 1] * bottom1_0[c + 1];
117
+ sum40 += bottom0[c + 2] * bottom1_0[c + 2];
118
+ sum40 += bottom0[c + 3] * bottom1_0[c + 3];
119
+
120
+ sum41 += bottom0[c + 0] * bottom1_1[c + 0];
121
+ sum41 += bottom0[c + 1] * bottom1_1[c + 1];
122
+ sum41 += bottom0[c + 2] * bottom1_1[c + 2];
123
+ sum41 += bottom0[c + 3] * bottom1_1[c + 3];
124
+
125
+ sum42 += bottom0[c + 0] * bottom1_2[c + 0];
126
+ sum42 += bottom0[c + 1] * bottom1_2[c + 1];
127
+ sum42 += bottom0[c + 2] * bottom1_2[c + 2];
128
+ sum42 += bottom0[c + 3] * bottom1_2[c + 3];
129
+
130
+ sum43 += bottom0[c + 0] * bottom1_3[c + 0];
131
+ sum43 += bottom0[c + 1] * bottom1_3[c + 1];
132
+ sum43 += bottom0[c + 2] * bottom1_3[c + 2];
133
+ sum43 += bottom0[c + 3] * bottom1_3[c + 3];
134
+ }
135
+
136
+ for (; c < cur_subchannels / 8; c++) {
137
+ sum40 += bottom0[c] * bottom1_0[c];
138
+ sum41 += bottom0[c] * bottom1_1[c];
139
+ sum42 += bottom0[c] * bottom1_2[c];
140
+ sum43 += bottom0[c] * bottom1_3[c];
141
+ }
142
+
143
+ half sum0 = __builtin_shave_sau_sumx_f16_r(sum40);
144
+ half sum1 = __builtin_shave_sau_sumx_f16_r(sum41);
145
+ half sum2 = __builtin_shave_sau_sumx_f16_r(sum42);
146
+ half sum3 = __builtin_shave_sau_sumx_f16_r(sum43);
147
+
148
+ for (c = c * 8; c < cur_subchannels; c++) {
149
+ sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c];
150
+ sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c];
151
+ sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c];
152
+ sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c];
153
+ }
154
+
155
+ dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] +=
156
+ sum0;
157
+ dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] +=
158
+ sum1;
159
+ dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] +=
160
+ sum2;
161
+ dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] +=
162
+ sum3;
163
+ }
164
+
165
+ for (; o < o_max; o += 1 * stride2) {
166
+ half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
167
+ half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels);
168
+
169
+ int c = 0;
170
+
171
+ half8 sum4 = 0;
172
+ for (; c <= cur_subchannels / 8 - 4; c += 4) {
173
+ sum4 += bottom0[c + 0] * bottom1[c + 0];
174
+ sum4 += bottom0[c + 1] * bottom1[c + 1];
175
+ sum4 += bottom0[c + 2] * bottom1[c + 2];
176
+ sum4 += bottom0[c + 3] * bottom1[c + 3];
177
+ }
178
+ for (; c < cur_subchannels / 8; c++) {
179
+ sum4 += bottom0[c] * bottom1[c];
180
+ }
181
+
182
+ half sum = __builtin_shave_sau_sumx_f16_r(sum4);
183
+
184
+ for (c = c * 8; c < cur_subchannels; c++) {
185
+ sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
186
+ }
187
+
188
+ dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum;
189
+ }
190
+ }
191
+ }
192
+ }
193
+ }
194
+ }
195
+
196
+ __kernel void correlate2_half(
197
+ __global const half *restrict bottom0,
198
+ __global const half *restrict bottom1,
199
+ __global half *restrict top,
200
+ int topwidth,
201
+ int topheight,
202
+ int bottomwidth,
203
+ int bottomheight,
204
+ int bottomchannels,
205
+ int max_displacement,
206
+ int padding,
207
+ int neighborhood_grid_radius,
208
+ int neighborhood_grid_width,
209
+ int kernel_size,
210
+ int stride1,
211
+ int stride2)
212
+ {
213
+ int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
214
+ if (max_channels > 64) max_channels = 64;
215
+ int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
216
+ int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count;
217
+ if (subchannels < max_channels) subchannels = max_channels;
218
+
219
+ const int sumelems = kernel_size * kernel_size * bottomchannels;
220
+
221
+ __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];
222
+
223
+ __private half *line0 = cmx;
224
+ __private half *line1 = line0 + bottomwidth * subchannels;
225
+ __private half *dline = line1 + bottomwidth * subchannels;
226
+
227
+ int blockIdx_y = get_global_id(0);
228
+
229
+ #if defined(USE_DMA)
230
+ __private half *dmabuf = dline + topwidth * neighborhood_grid_width;
231
+ #endif
232
+
233
+ int y1 = blockIdx_y * stride1 + max_displacement;
234
+
235
+ for (int j = 0; j < kernel_size; j++) {
236
+ for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) {
237
+ // configure channel batching
238
+ int startchannel = bottomchannel;
239
+ int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels;
240
+ int deltachannels = endchannel - startchannel;
241
+
242
+ // load line form blob 0 with repackaging
243
+ if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) {
244
+ #if defined(USE_DMA)
245
+ __global const half *curr =
246
+ bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth;
247
+ dmacpyLineSrcStrideStart(
248
+ curr,
249
+ dmabuf,
250
+ bottomwidth * deltachannels * sizeof(half),
251
+ bottomwidth * sizeof(half),
252
+ bottomwidth * bottomheight * sizeof(half));
253
+
254
+ for (int ch = 0; ch < deltachannels; ch++) {
255
+ for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
256
+ half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
257
+ line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
258
+ line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
259
+ line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
260
+ line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
261
+
262
+ line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
263
+ line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
264
+ line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
265
+ line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
266
+ }
267
+
268
+ for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
269
+ line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
270
+ }
271
+ }
272
+
273
+ if (deltachannels < subchannels)
274
+ for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
275
+ memzero(
276
+ line0 + blockIdx_x * max_channels + deltachannels,
277
+ (subchannels - deltachannels) * sizeof(half));
278
+ #else
279
+ for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
280
+ for (int ch = 0; ch < deltachannels; ch++)
281
+ line0[blockIdx_x * max_channels + ch] = bottom0
282
+ [(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth
283
+ + blockIdx_x];
284
+
285
+ if (deltachannels < subchannels)
286
+ memzero(
287
+ line0 + blockIdx_x * max_channels + deltachannels,
288
+ (subchannels - deltachannels) * sizeof(half));
289
+ }
290
+ #endif
291
+ } else
292
+ memzero(line0, max_channels * bottomwidth * sizeof(half));
293
+
294
+ for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
295
+ int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;
296
+
297
+ if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) {
298
+ #if defined(USE_DMA)
299
+ __global const half *curr =
300
+ bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth;
301
+ dmacpyLineSrcStrideStart(
302
+ curr,
303
+ dmabuf,
304
+ bottomwidth * deltachannels * sizeof(half),
305
+ bottomwidth * sizeof(half),
306
+ bottomwidth * bottomheight * sizeof(half));
307
+
308
+ for (int ch = 0; ch < deltachannels; ch++) {
309
+ for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
310
+ half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
311
+ line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
312
+ line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
313
+ line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
314
+ line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
315
+
316
+ line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
317
+ line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
318
+ line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
319
+ line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
320
+ }
321
+
322
+ for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
323
+ line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
324
+ }
325
+ }
326
+ #else
327
+ for (int ch = 0; ch < deltachannels; ch++) {
328
+ for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
329
+ half8 val = ((
330
+ __global half8
331
+ *)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth))
332
+ [blockIdx_x];
333
+ line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
334
+ line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
335
+ line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
336
+ line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
337
+
338
+ line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
339
+ line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
340
+ line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
341
+ line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
342
+ }
343
+ for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
344
+ half val =
345
+ (bottom1 + (ch + startchannel) * bottomheight * bottomwidth
346
+ + (y2 + j - padding) * bottomwidth)[blockIdx_x];
347
+ line1[(blockIdx_x)*max_channels + ch] = val;
348
+ }
349
+ }
350
+ #endif
351
+ for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
352
+ if (deltachannels < subchannels)
353
+ memzero(
354
+ line1 + blockIdx_x * max_channels + deltachannels,
355
+ (subchannels - deltachannels) * sizeof(half));
356
+ }
357
+ } else
358
+ memzero(line1, max_channels * bottomwidth * sizeof(half));
359
+
360
+ if (j == 0 && startchannel == 0) {
361
+ memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
362
+ } else {
363
+ #if defined(USE_DMA)
364
+ dmacpyLineSrcStrideStart(
365
+ top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
366
+ dline,
367
+ topwidth * neighborhood_grid_width * sizeof(half),
368
+ topwidth * sizeof(half),
369
+ topwidth * topheight * sizeof(half));
370
+ #else
371
+ for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
372
+ for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
373
+ half8 val = ((
374
+ __global half8
375
+ *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
376
+ [blockIdx_x];
377
+ ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val;
378
+ }
379
+ for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
380
+ dline[top_channel_x * topwidth + blockIdx_x] =
381
+ top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
382
+ + blockIdx_y * topwidth + blockIdx_x];
383
+ }
384
+ }
385
+ #endif
386
+ }
387
+
388
+ if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0
389
+ && y2 + j - padding < bottomheight) {
390
+ crosscorrh(
391
+ line0,
392
+ line1,
393
+ dline,
394
+ topwidth,
395
+ max_displacement,
396
+ neighborhood_grid_radius,
397
+ kernel_size,
398
+ padding,
399
+ bottomwidth,
400
+ stride1,
401
+ stride2,
402
+ max_channels,
403
+ subchannels);
404
+ }
405
+
406
+ if (j == kernel_size - 1 && endchannel == bottomchannels) {
407
+ half8 scale = (half8){
408
+ (half)sumelems,
409
+ (half)sumelems,
410
+ (half)sumelems,
411
+ (half)sumelems,
412
+ (half)sumelems,
413
+ (half)sumelems,
414
+ (half)sumelems,
415
+ (half)sumelems};
416
+ for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
417
+ for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
418
+ ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] =
419
+ ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale;
420
+ }
421
+ for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
422
+ dline[top_channel_x * topwidth + blockIdx_x] =
423
+ dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems;
424
+ }
425
+ }
426
+ }
427
+
428
+ #if defined(USE_DMA)
429
+ dmacpyLineDstStrideStart(
430
+ dline,
431
+ top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
432
+ topwidth * neighborhood_grid_width * sizeof(half),
433
+ topwidth * sizeof(half),
434
+ topwidth * topheight * sizeof(half));
435
+ #else
436
+ for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
437
+ for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
438
+ ((__global half8
439
+ *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
440
+ [blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x]
441
+ + (half8){0, 0, 0, 0, 0, 0, 0, 0};
442
+ }
443
+ for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
444
+ top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
445
+ + blockIdx_y * topwidth + blockIdx_x] =
446
+ dline[top_channel_x * topwidth + blockIdx_x] + (half)0;
447
+ }
448
+ }
449
+ #endif
450
+ }
451
+ }
452
+ }
453
+ }
openvino/vpu_custom_kernels/ctc.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:292de0fbb8dc6ead6970576d1b9a26a323fc9febfceb92c3af6b84496d523def
3
+ size 10196
openvino/vpu_custom_kernels/ctc.cl ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __global half *find(__global const half *begin, __global const half *end, half value)
9
+ {
10
+ while (begin != end) {
11
+ if (*begin == value) {
12
+ return begin;
13
+ }
14
+ ++begin;
15
+ }
16
+ return end;
17
+ }
18
+
19
+ __kernel void CTCDecoder(
20
+ __global half *restrict probabilities,
21
+ __global half *restrict sequence_indicators,
22
+ __global half *restrict output,
23
+ int width,
24
+ int height,
25
+ int channels)
26
+ {
27
+ __local half local_src[88 * 1 * 77];
28
+ __local half local_dst[88 * 1];
29
+
30
+ event_t e1 = async_work_group_copy_2D2D(
31
+ local_src, // dst
32
+ probabilities, // src
33
+ width, // num_elements_per_line,
34
+ height * channels, // num_lines,
35
+ width * (height - 1), // src_line_stride,
36
+ width * (height - 1), // dst_line_stride,
37
+ 0);
38
+
39
+ wait_group_events(1, &e1);
40
+
41
+ const int T = channels; // Time
42
+ const int B = height; // Batches
43
+ const int C = width; // Chars
44
+
45
+ #pragma unroll 4
46
+ for (int i = 0; i < B * T; i++) {
47
+ local_dst[i] = -1.h;
48
+ }
49
+
50
+ int output_index = 0;
51
+
52
+ for (int b = 0; b < B; ++b) {
53
+ __global const half *restrict seq_ind = sequence_indicators + b * T;
54
+ const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
55
+ const int time = min(seq_len, T);
56
+
57
+ int prev_class_idx = -1;
58
+
59
+ #pragma unroll 4
60
+ for (int t = 0; t < time; ++t) {
61
+ __local const half *restrict probs = local_src + b * C + t * C * B;
62
+
63
+ int max_class_idx = 0;
64
+ half max_prob = probs[0];
65
+ for (int c = 1; c < C; ++c) {
66
+ const half prob = probs[c];
67
+ if (prob > max_prob) {
68
+ max_class_idx = c;
69
+ max_prob = prob;
70
+ }
71
+ }
72
+
73
+ if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
74
+ local_dst[b * T + output_index] = (half)max_class_idx;
75
+ output_index++;
76
+ }
77
+
78
+ prev_class_idx = max_class_idx;
79
+ }
80
+ }
81
+
82
+ barrier(CLK_LOCAL_MEM_FENCE);
83
+
84
+ event_t e2 = async_work_group_copy_2D2D(
85
+ output, // dst
86
+ local_dst, // src
87
+ channels, // num_elements_per_line,
88
+ height, // num_lines,
89
+ 0, // src_line_stride,
90
+ 0, // dst_line_stride,
91
+ 0);
92
+
93
+ wait_group_events(1, &e2);
94
+ }
openvino/vpu_custom_kernels/customLayerBindings.xml ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <CustomLayer name="ReorgYolo" type="MVCL" version="1">
2
+ <Kernel entry="reorg_hwc_naive">
3
+ <Source filename="reorg_hwc_naive.bin"/>
4
+ <Parameters>
5
+ <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
6
+ <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
7
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
8
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
9
+ <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
10
+ <Scalar arg-name="stride" type="int" source="stride"/>
11
+ </Parameters>
12
+ <WorkSizes dim="input,0" global="F,1,1" local="stride*stride,1,1"/>
13
+ </Kernel>
14
+ </CustomLayer>
15
+
16
+ <CustomLayer name="ReorgYolo" type="MVCL" version="1">
17
+ <Kernel entry="reorg_chw">
18
+ <Source filename="reorg_chw.bin"/>
19
+ <Parameters>
20
+ <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
21
+ <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
22
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
23
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
24
+ <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
25
+ <Scalar arg-name="stride" type="int" source="stride"/>
26
+ </Parameters>
27
+ <WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="stride,stride,1"/>
28
+ </Kernel>
29
+ </CustomLayer>
30
+
31
+ <CustomLayer name="RegionYolo" type="MVCL" version="1">
32
+ <Where do_softmax="1"/>
33
+ <Kernel entry="region_chw">
34
+ <Source filename="region_chw.bin"/>
35
+ <Parameters>
36
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
37
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
38
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
39
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
40
+ <Scalar arg-name="classes" type="int" source="classes"/>
41
+ <Scalar arg-name="coords" type="int" source="coords"/>
42
+ <Scalar arg-name="num" type="int" source="num"/>
43
+ <Scalar arg-name="maskSize" type="int" source="3"/>
44
+ <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
45
+ </Parameters>
46
+ <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
47
+ </Kernel>
48
+ </CustomLayer>
49
+
50
+ <CustomLayer name="RegionYolo" type="MVCL" version="1">-->
51
+ <Where do_softmax="0" mask="0,1,2"/>
52
+ <Kernel entry="region_chw">
53
+ <Source filename="region_chw.bin"/>
54
+ <Parameters>
55
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
56
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
57
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
58
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
59
+ <Scalar arg-name="classes" type="int" source="classes"/>
60
+ <Scalar arg-name="coords" type="int" source="coords"/>
61
+ <Scalar arg-name="num" type="int" source="num"/>
62
+ <Scalar arg-name="maskSize" type="int" source="3"/>
63
+ <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
64
+ </Parameters>
65
+ <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
66
+ </Kernel>
67
+ </CustomLayer>
68
+
69
+ <CustomLayer name="RegionYolo" type="MVCL" version="1">
70
+ <Where do_softmax="1"/>
71
+ <Kernel entry="region_hwc">
72
+ <Source filename="region_hwc.bin"/>
73
+ <Parameters>
74
+ <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
75
+ <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
76
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
77
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
78
+ <Scalar arg-name="classes" type="int" source="classes"/>
79
+ <Scalar arg-name="coords" type="int" source="coords"/>
80
+ <Scalar arg-name="num" type="int" source="num"/>
81
+ <Scalar arg-name="maskSize" type="int" source="3"/>
82
+ <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
83
+ </Parameters>
84
+ <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
85
+ </Kernel>
86
+ </CustomLayer>
87
+
88
+ <CustomLayer name="RegionYolo" type="MVCL" version="1">
89
+ <Where do_softmax="0" mask="0,1,2"/>
90
+ <Kernel entry="region_hwc">
91
+ <Source filename="region_hwc.bin"/>
92
+ <Parameters>
93
+ <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
94
+ <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
95
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
96
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
97
+ <Scalar arg-name="classes" type="int" source="classes"/>
98
+ <Scalar arg-name="coords" type="int" source="coords"/>
99
+ <Scalar arg-name="num" type="int" source="num"/>
100
+ <Scalar arg-name="maskSize" type="int" source="3"/>
101
+ <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
102
+ </Parameters>
103
+ <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
104
+ </Kernel>
105
+ </CustomLayer>
106
+
107
+ <!-- Pixel-wise kernel binding, local work group config is per line in the input tensor -->
108
+ <CustomLayer name="GRN" type="MVCL" version="1">
109
+ <Kernel entry="grn">
110
+ <Source filename="grn.bin"/>
111
+ <Parameters>
112
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
113
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
114
+ <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
115
+ <Scalar arg-name="bias" type="float" source="bias"/>
116
+ </Parameters>
117
+ <WorkSizes dim="input,0" global="X,Y,1" local="X,1,1"/>
118
+ </Kernel>
119
+ </CustomLayer>
120
+
121
+ <!-- Two stage layer binding, first kernel computes mean and variance, the second one normalizes input tensor-->
122
+ <CustomLayer name="MVN" type="MVCL" version="1">
123
+ <Kernel entry="reduction_mean" stage="0">
124
+ <Source filename="mvn_reduction.bin"/>
125
+ <Parameters>
126
+ <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
127
+ <Tensor arg-name="mean" type="output_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
128
+ <Tensor arg-name="variance" type="output_buffer" port-index="1" dim="output,0" size="Y*F*4"/>
129
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
130
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
131
+ <Scalar arg-name="across_channels" type="int" source="across_channels"/>
132
+ </Parameters>
133
+ <WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
134
+ </Kernel>
135
+ <Kernel entry="mvn_scale" stage="1">
136
+ <Source filename="mvn_scale.bin"/>
137
+ <Parameters>
138
+ <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
139
+ <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
140
+ <Tensor arg-name="mean_part" type="input_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
141
+ <Tensor arg-name="power_mean" type="input_buffer" port-index="1" dim="output,0" size="Y*F*4"/>
142
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
143
+ <Scalar arg-name="H1" type="int" port-index="0" source="I.Y"/>
144
+ <Scalar arg-name="across_channels" type="int" source="across_channels"/>
145
+ <Scalar arg-name="normalize_variance" type="int" source="normalize_variance"/>
146
+ <Scalar arg-name="nparts" type="int" port-index="0" source="I.Y"/>
147
+ </Parameters>
148
+ <WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
149
+ </Kernel>
150
+ </CustomLayer>
151
+
152
+ <!-- Single work group kernel for not embarrassingly-parallel use-case -->
153
+ <CustomLayer name="CTCGreedyDecoder" type="MVCL" version="1" max-shaves="1">
154
+ <Kernel entry="CTCDecoder">
155
+ <Source filename="ctc.bin"/>
156
+ <Parameters>
157
+ <Tensor arg-name="probabilities" type="input" port-index="0" format="FYX"/>
158
+ <Tensor arg-name="sequence_indicators" type="input" port-index="1" format="BF"/>
159
+ <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
160
+ <Scalar arg-name="width" type="int" port-index="0" source="I.X"/>
161
+ <Scalar arg-name="height" type="int" port-index="0" source="I.Y"/>
162
+ <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
163
+ </Parameters>
164
+ <WorkSizes dim="output,0" global="1,1,1" local="1,1,1"/>
165
+ </Kernel>
166
+ </CustomLayer>
167
+
168
+ <CustomLayer name="ShuffleChannel" type="MVCL" version="1">
169
+ <!-- artificially added where closure for testing reasons, kernel itself supports arbitrary grouping -->
170
+ <!-- <Where group="2"/> -->
171
+ <Kernel entry="ShuffleChannel">
172
+ <Source filename="shuffle_channels.bin"/>
173
+ <Parameters>
174
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
175
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
176
+ <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
177
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
178
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
179
+ <Scalar arg-name="G" type="int" source="group"/>
180
+ </Parameters>
181
+ <WorkSizes dim="input,0" global="F,1,1" local="1,1,1"/>
182
+ </Kernel>
183
+ </CustomLayer>
184
+
185
+ <!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
186
+ <CustomLayer name="FakeQuantize" type="MVCL" version="1">
187
+ <Kernel entry="quantize">
188
+ <Source filename="fakequantize.bin"/>
189
+ <Parameters>
190
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
191
+ <Tensor arg-name="input_low" type="input" port-index="1" format="ANY"/>
192
+ <Tensor arg-name="input_high" type="input" port-index="2" format="ANY"/>
193
+ <Tensor arg-name="output_low" type="input" port-index="3" format="ANY"/>
194
+ <Tensor arg-name="output_high" type="input" port-index="4" format="ANY"/>
195
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
196
+ <Scalar arg-name="levels" type="int" source="levels"/>
197
+ <Scalar arg-name="input_low_size" type="int" port-index="1" source="I.F"/>
198
+ <Scalar arg-name="input_high_size" type="int" port-index="2" source="I.F"/>
199
+ <Scalar arg-name="output_low_size" type="int" port-index="3" source="I.F"/>
200
+ <Scalar arg-name="output_high_size" type="int" port-index="4" source="I.F"/>
201
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
202
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
203
+ </Parameters>
204
+ <WorkSizes dim="input,0" global="1,Y,F" local="1,Y,1"/>
205
+ </Kernel>
206
+ </CustomLayer>
207
+
208
+ <!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
209
+ <CustomLayer name="FakeQuantizeBin" type="MVCL" version="1">
210
+ <Where levels="2"/>
211
+ <Kernel entry="binarization">
212
+ <Source filename="binarization.bin"/>
213
+ <Parameters>
214
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
215
+ <Tensor arg-name="input_low_high" type="input" port-index="1" format="BFYX"/>
216
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
217
+ <Scalar arg-name="switch_out" type="int" source="switch_out"/>
218
+ <Scalar arg-name="input_low_high_size" type="int" source="input_low_size"/>
219
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
220
+ <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
221
+ <Tensor arg-name="input_high" type="input" port-index="2" format="BFYX"/>
222
+ <Tensor arg-name="output_low" type="input" port-index="3" format="BFYX"/>
223
+ <Tensor arg-name="output_high" type="input" port-index="4" format="BFYX"/>
224
+ <Scalar arg-name="input_high_size" type="int" source="input_high_size"/>
225
+ <Scalar arg-name="output_low_size" type="int" source="output_low_size"/>
226
+ <Scalar arg-name="output_high_size" type="int" source="output_high_size"/>
227
+ <Data arg-name="src_local" type="local_data" dim="input,0" size="X*Y*2"/>
228
+ <Data arg-name="dst_local" type="local_data" dim="input,0" size="X*Y*2"/>
229
+ </Parameters>
230
+ <WorkSizes dim="input,0" global="1,1,F" local="1,1,1"/>
231
+ </Kernel>
232
+ </CustomLayer>
233
+
234
+ <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
235
+ <Where kernel="3,3"/>
236
+ <Kernel entry="binary_convolution">
237
+ <Source filename="binary_convolution3x3.bin"/>
238
+ <Parameters>
239
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
240
+ <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
241
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
242
+ <Scalar arg-name="pad_value" type="float" source="pad_value"/>
243
+ <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
244
+ <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
245
+ <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
246
+ <Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
247
+ <Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
248
+ <Scalar arg-name="GC" type="int" source="group"/>
249
+ <Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
250
+ <Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
251
+ <Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
252
+ <Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
253
+ <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
254
+ <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
255
+ <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
256
+ </Parameters>
257
+ <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
258
+ </Kernel>
259
+ </CustomLayer>
260
+
261
+ <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
262
+ <Where kernel="1,1"/>
263
+ <Kernel entry="binary_convolution">
264
+ <Source filename="binary_convolution1x1.bin"/>
265
+ <Parameters>
266
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
267
+ <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
268
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
269
+ <Scalar arg-name="pad_value" type="float" source="pad_value"/>
270
+ <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
271
+ <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
272
+ <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
273
+ <Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
274
+ <Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
275
+ <Scalar arg-name="GC" type="int" source="group"/>
276
+ <Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
277
+ <Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
278
+ <Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
279
+ <Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
280
+ <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
281
+ <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
282
+ <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
283
+ </Parameters>
284
+ <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
285
+ </Kernel>
286
+ </CustomLayer>
287
+
288
+ <!-- Reference version of generic quantize binary convolution -->
289
+ <!-- An example of a kernel binding that uses data blob from IR -->
290
+ <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
291
+ <Kernel entry="binary_convolution">
292
+ <Source filename="binary_convolution.bin"/>
293
+ <Parameters>
294
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
295
+ <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
296
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
297
+ <Scalar arg-name="pad_value" type="float" source="pad_value"/>
298
+ <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
299
+ <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
300
+ <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
301
+ <Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
302
+ <Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
303
+ <Scalar arg-name="GC" type="int" source="group"/>
304
+ <Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
305
+ <Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
306
+ <Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
307
+ <Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
308
+ <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
309
+ <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
310
+ </Parameters>
311
+ <WorkSizes dim="output,0" global="X,Y,F" local="1,1,1"/>
312
+ </Kernel>
313
+ </CustomLayer>
314
+
315
+ <CustomLayer name="Resample" type="MVCL" version="1">
316
+ <Where antialias="0"/>
317
+ <Kernel entry="resample_nearest">
318
+ <Source filename="resample_noAA.bin"/>
319
+ <Parameters>
320
+ <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
321
+ <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
322
+ <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
323
+ <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
324
+ <Scalar arg-name="factor" type="float" source="factor"/>
325
+ <Scalar arg-name="ow" type="int" port-index="0" source="O.X"/>
326
+ <Scalar arg-name="oh" type="int" port-index="0" source="O.Y"/>
327
+ <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
328
+ </Parameters>
329
+ <WorkSizes global="1,Y,1" local="1,1,1" dim="output,0"/>
330
+ </Kernel>
331
+ </CustomLayer>
332
+
333
+ <CustomLayer name="Resample" type="MVCL" version="1">
334
+ <Where antialias="1"/>
335
+ <Kernel entry="resample_with_antialias">
336
+ <Source filename="resample_AA.bin"/>
337
+ <Parameters>
338
+ <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
339
+ <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
340
+ <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
341
+ <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
342
+ <Scalar arg-name="factor" type="float" source="factor"/>
343
+ <Scalar arg-name="ow" type="int" port-index="0" source="O.X"/>
344
+ <Scalar arg-name="oh" type="int" port-index="0" source="O.Y"/>
345
+ <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
346
+ </Parameters>
347
+ <WorkSizes global="1,round(Y*factor),F" local="1,1,F" dim="input,0"/>
348
+ </Kernel>
349
+ </CustomLayer>
350
+
351
+ <CustomLayer name="Convolution" type="MVCL" version="1">
352
+ <Where kernel="1,1" dilation="1,1"/>
353
+ <Kernel entry="Convolution1x1_NCHW">
354
+ <Source filename="convolution1x1_chw.bin"/>
355
+ <Parameters>
356
+ <Tensor arg-name="in" type="input" port-index="0" format="BFYX"/>
357
+ <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
358
+ <Data arg-name="w" type="data" source="weights" format="ANY"/>
359
+ <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
360
+ <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
361
+ <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
362
+ <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
363
+ <Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
364
+ <Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
365
+
366
+ <Scalar arg-name="stride-x" type="int" port-index="0" source="stride"/>
367
+ <Scalar arg-name="stride-y" type="int" port-index="1" source="stride"/>
368
+ <Scalar arg-name="pad-x" type="int" port-index="0" source="pads_begin"/>
369
+ <Scalar arg-name="pad-y" type="int" port-index="1" source="pads_begin"/>
370
+ <Scalar arg-name="kernel-x" type="int" port-index="0" source="kernel"/>
371
+ <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
372
+ <Scalar arg-name="output" type="int" port-index="0" source="output"/>
373
+ <Scalar arg-name="group" type="int" port-index="0" source="group"/>
374
+ </Parameters>
375
+ <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
376
+ </Kernel>
377
+ </CustomLayer>
378
+
379
+ <CustomLayer name="Convolution" type="MVCL" version="1">
380
+ <Where kernel="1,1" dilation="1,1"/>
381
+ <Kernel entry="Convolution1x1_NHWC">
382
+ <Source filename="convolution1x1_hwc.bin"/>
383
+ <Parameters>
384
+ <Tensor arg-name="in" type="input" port-index="0" format="BYXF"/>
385
+ <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
386
+ <Data arg-name="w" type="data" source="weights" format="ANY"/>
387
+ <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
388
+ <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
389
+ <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
390
+ <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
391
+ <Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
392
+ <Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
393
+
394
+ <Scalar arg-name="stride-x" type="int" port-index="0" source="stride"/>
395
+ <Scalar arg-name="stride-y" type="int" port-index="1" source="stride"/>
396
+ <Scalar arg-name="pad-x" type="int" port-index="0" source="pads_begin"/>
397
+ <Scalar arg-name="pad-y" type="int" port-index="1" source="pads_begin"/>
398
+ <Scalar arg-name="kernel-x" type="int" port-index="0" source="kernel"/>
399
+ <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
400
+ <Scalar arg-name="output" type="int" port-index="0" source="output"/>
401
+ <Scalar arg-name="group" type="int" port-index="0" source="group"/>
402
+ </Parameters>
403
+ <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
404
+ </Kernel>
405
+ </CustomLayer>
406
+
407
+ <CustomLayer name="Convolution" type="MVCL" version="1">
408
+ <Where kernel="3,3" dilation="1,1"/>
409
+ <Kernel entry="Convolution3x3">
410
+ <Source filename="convolution3x3.bin"/>
411
+ <Parameters>
412
+ <Tensor arg-name="in_param" type="input" port-index="0" format="BFYX"/>
413
+ <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
414
+ <Data arg-name="w" type="data" source="weights" format="BFYX"/>
415
+ <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
416
+ <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
417
+ <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
418
+ <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
419
+ <Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
420
+ <Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
421
+ <Scalar arg-name="KX" type="int" port-index="0" source="kernel"/>
422
+ <Scalar arg-name="KY" type="int" port-index="1" source="kernel"/>
423
+ <Scalar arg-name="stride_x" type="int" port-index="0" source="stride"/>
424
+ <Scalar arg-name="stride_y" type="int" port-index="1" source="stride"/>
425
+ <Scalar arg-name="pad_x" type="int" port-index="0" source="pads_begin"/>
426
+ <Scalar arg-name="pad_y" type="int" port-index="1" source="pads_begin"/>
427
+ <Scalar arg-name="dilation_x" type="int" port-index="0" source="dilation"/>
428
+ <Scalar arg-name="dilation_y" type="int" port-index="1" source="dilation"/>
429
+ <Scalar arg-name="output" type="int" port-index="0" source="output"/>
430
+
431
+ <Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*3*2"/>
432
+ <Data arg-name="out_local" type="local_data" dim="output,0" size="X*F*2"/>
433
+ <Data arg-name="w_local" type="local_data" dim="input,0" size="3*3*F*2"/>
434
+ </Parameters>
435
+ <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
436
+ </Kernel>
437
+ </CustomLayer>
438
+
439
+ <CustomLayer name="ExperimentalDetectronPriorGridGenerator" type="MVCL" version="1">
440
+ <Kernel entry="experimental_detectron_prior_grid_generator">
441
+ <Source filename="detectron_prior_grid_gen.bin"/>
442
+ <Parameters>
443
+ <Tensor arg-name="input_priors" type="input" port-index="0" format="BFYX"/>
444
+ <Tensor arg-name="input_feature_map" type="input" port-index="1" format="BFYX"/>
445
+ <Tensor arg-name="input_rois" type="input" port-index="2" format="BFYX"/>
446
+ <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
447
+ <Scalar arg-name="grid_h" type="int" port-index="1" source="I.Y"/>
448
+ <Scalar arg-name="grid_w" type="int" port-index="1" source="I.X"/>
449
+ <Scalar arg-name="stride_h" type="float" source="stride_h"/>
450
+ <Scalar arg-name="stride_w" type="float" source="stride_w"/>
451
+ <Scalar arg-name="num_priors" type="int" port-index="0" source="I.Y"/>
452
+ <Scalar arg-name="num_anchors_per_prior" type="int" port-index="0" source="I.X"/>
453
+ </Parameters>
454
+ <WorkSizes dim="input,1" global="((X+31)/32)*32,Y,1" local="32,1,1"/>
455
+ </Kernel>
456
+ </CustomLayer>
457
+
458
+ <CustomLayer name="Convert" type="MVCL" version="1">
459
+ <Kernel entry="cvtu8f16">
460
+ <Source filename="cvtu8f16.bin"/>
461
+ <Parameters>
462
+ <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
463
+ <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
464
+ <Scalar arg-name="scale" type="float" source="scale"/>
465
+ <Scalar arg-name="bias" type="float" source="bias"/>
466
+ </Parameters>
467
+ <WorkSizes dim="input,0" global="X,Y,F" local="X,1,1"/>
468
+ </Kernel>
469
+ </CustomLayer>
470
+
471
+ <CustomLayer name="Correlate" type="MVCL" version="1">
472
+ <Kernel entry="correlate2_half">
473
+ <Source filename="correlate.bin"/>
474
+ <Parameters>
475
+ <Tensor arg-name="bottom0" type="input" port-index="0" format="BFYX"/>
476
+ <Tensor arg-name="bottom1" type="input" port-index="1" format="BFYX"/>
477
+ <Tensor arg-name="top" type="output" port-index="0" format="BFYX"/>
478
+ <Scalar arg-name="topwidth" type="int" source="top_width"/>
479
+ <Scalar arg-name="topheight" type="int" source="top_height"/>
480
+ <Scalar arg-name="bottomwidth" type="int" port-index="0" source="I.X"/>
481
+ <Scalar arg-name="bottomheight" type="int" port-index="0" source="I.Y"/>
482
+ <Scalar arg-name="bottomchannels" type="int" port-index="0" source="I.F"/>
483
+ <Scalar arg-name="max_displacement" type="int" source="displacement"/>
484
+ <Scalar arg-name="padding" type="int" source="pad"/>
485
+ <Scalar arg-name="neighborhood_grid_radius" type="int" source="neighborhood_grid_radius"/>
486
+ <Scalar arg-name="neighborhood_grid_width" type="int" source="neighborhood_grid_width"/>
487
+ <Scalar arg-name="kernel_size" type="int" source="kernel_size"/>
488
+ <Scalar arg-name="stride1" type="int" port-index="0" source="stride"/>
489
+ <Scalar arg-name="stride2" type="int" port-index="1" source="stride"/>
490
+ </Parameters>
491
+ <WorkSizes dim="input,0" global="top_height,1,1" local="1,1,1"/>
492
+ </Kernel>
493
+ </CustomLayer>
494
+
495
+ <CustomLayer name="SpatialTransform" type="MVCL" version="1">
496
+ <Kernel entry="ocl_st">
497
+ <Source filename="st.bin"/>
498
+ <Parameters>
499
+ <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
500
+ <Tensor arg-name="theta" type="input" port-index="1" format="ANY"/>
501
+ <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
502
+ <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
503
+ <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
504
+ </Parameters>
505
+ <WorkSizes dim="input,0" global="(X+511)/512,Y,1" local="1,1,1"/>
506
+ </Kernel>
507
+ </CustomLayer>
openvino/vpu_custom_kernels/cvtf32f16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:703ef56f84299e76d36b3ba5a632ae3d5e3ecd54761dcfe0006ca69ddce4bc6d
3
+ size 2664
openvino/vpu_custom_kernels/cvtf32f16.cl ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+
7
+ __kernel void cvtf32f16(const __global float* restrict inImage,
8
+ __global half* restrict outImage,
9
+ float scale,
10
+ float bais)
11
+ {
12
+ int idx = get_global_id(0)
13
+ + get_global_id(1) * get_global_size(0)
14
+ + get_global_id(2) * get_global_size(0) * get_global_size(1);
15
+
16
+ outImage[idx] = convert_half(inImage[idx]*scale+bais);
17
+ }
openvino/vpu_custom_kernels/cvtu8f16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46c943e08f37cedac77f727f55835637d4878edcc20aaa24f16ed5888d13bd43
3
+ size 4588
openvino/vpu_custom_kernels/cvtu8f16.cl ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
9
+ {
10
+ __local uchar local_src[8 * 1024];
11
+ __local half local_dst[8 * 1024];
12
+
13
+ event_t e1 = async_work_group_copy_3D3D(
14
+ local_src, // dst
15
+ src + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
16
+ + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // src
17
+ get_local_size(0), // num_elements_per_line
18
+ get_local_size(0) * get_local_size(1) / (get_local_size(0)), // num_lines
19
+ get_global_size(0) - get_local_size(0), // src_line_stride
20
+ 0, // dst_line_stride
21
+ get_local_size(2), // num planes
22
+ get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src plane stride
23
+ 0, // dst plane stride
24
+ 0);
25
+ wait_group_events(1, &e1);
26
+
27
+ size_t idx = get_local_id(0)
28
+ + get_local_id(1) * get_local_size(0)
29
+ + get_local_id(2) * get_local_size(0) * get_local_size(1);
30
+
31
+ local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;
32
+
33
+ barrier(CLK_LOCAL_MEM_FENCE);
34
+
35
+ event_t e2 = async_work_group_copy_3D3D(
36
+ dst + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
37
+ + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // dst
38
+ local_dst, // src
39
+ get_local_size(0), // num_elements_per_line
40
+ get_local_size(1), // num_lines
41
+ 0, // src_line_stride
42
+ get_global_size(0) - get_local_size(0), // dst_line_stride
43
+ get_local_size(2), // num_planes
44
+ 0, // src_plane_stride
45
+ get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
46
+ 0);
47
+ wait_group_events(1, &e2);
48
+ }
openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c80d556d23f1c959fa10c00ff1cd9c3ae10aba607b37c7a0620d903fc7cedd8
3
+ size 6972
openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void experimental_detectron_prior_grid_generator(
9
+ __global const half *restrict input_priors,
10
+ __global const half *restrict input_feature_map,
11
+ __global const half *restrict input_rois,
12
+ __global half *restrict output,
13
+ int grid_h,
14
+ int grid_w,
15
+ float stride_h,
16
+ float stride_w,
17
+ int num_priors,
18
+ int num_anchors_per_prior)
19
+ {
20
+ __local half local_input_priors[8 * 1024];
21
+ __local half local_output[8 * 1024];
22
+
23
+ event_t e1 = async_work_group_copy(
24
+ local_input_priors,
25
+ input_priors,
26
+ num_anchors_per_prior * num_priors,
27
+ 0);
28
+ wait_group_events(1, &e1);
29
+
30
+ int width_start = get_group_id(0) * get_local_size(0);
31
+ int width_end = min(width_start + get_local_size(0), (unsigned)grid_w);
32
+ int width = width_end - width_start;
33
+
34
+ int h = get_group_id(1);
35
+ int w_idx = get_group_id(0) * get_local_size(0);
36
+ for (int w = 0; w < width; ++w) {
37
+ #pragma unroll 4
38
+ for (int p = 0; p < num_priors; ++p) {
39
+ local_output[(w * num_priors + p) * num_anchors_per_prior + 0] =
40
+ local_input_priors[4 * p + 0]
41
+ + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
42
+ local_output[(w * num_priors + p) * num_anchors_per_prior + 1] =
43
+ local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5);
44
+ local_output[(w * num_priors + p) * num_anchors_per_prior + 2] =
45
+ local_input_priors[4 * p + 2]
46
+ + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
47
+ local_output[(w * num_priors + p) * num_anchors_per_prior + 3] =
48
+ local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5);
49
+ }
50
+ }
51
+
52
+ barrier(CLK_LOCAL_MEM_FENCE);
53
+
54
+ event_t e2 = async_work_group_copy_2D2D(
55
+ output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
56
+ + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
57
+ * num_priors, // dst
58
+ local_output, // src
59
+ width * num_anchors_per_prior * num_priors, // num_elements_per_line
60
+ 1, // num_lines
61
+ (grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride
62
+ (grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride
63
+ 0);
64
+ wait_group_events(1, &e2);
65
+ }
openvino/vpu_custom_kernels/fakequantize.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17659bbf12a172849085003a055bfb4b91d3bb5bdc7f820395820eaa90b46ef
3
+ size 15688
openvino/vpu_custom_kernels/fakequantize.cl ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (C) 2018-2022 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+
5
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
+ #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
+
8
+ __kernel void quantize(
9
+ __global const half *restrict src_data,
10
+ __global const half *restrict input_low,
11
+ __global const half *restrict input_high,
12
+ __global const half *restrict output_low,
13
+ __global const half *restrict output_high,
14
+ __global half *restrict dst_data,
15
+ int levels,
16
+ int input_low_size,
17
+ int input_high_size,
18
+ int output_low_size,
19
+ int output_high_size,
20
+ int W,
21
+ int H)
22
+ {
23
+ __local half local_src[15 * 1024];
24
+ __local half local_dst[15 * 1024];
25
+
26
+ event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
27
+ wait_group_events(1, &e1);
28
+
29
+ int c = get_group_id(2);
30
+
31
+ half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]);
32
+ half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]);
33
+ half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]);
34
+ half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);
35
+
36
+ half const1 = (half)(
37
+ !(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
38
+ half const2 =
39
+ (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));
40
+
41
+ __local const half *restrict src = local_src + W * get_local_id(1);
42
+ __local half *restrict dst = local_dst + W * get_local_id(1);
43
+
44
+ for (int w = 0; w < W / 8; w++) {
45
+ half8 val = *((__local half8 *)src + w);
46
+ half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;
47
+
48
+ aux = (half8){
49
+ (half)(short)(aux.s0),
50
+ (half)(short)(aux.s1),
51
+ (half)(short)(aux.s2),
52
+ (half)(short)(aux.s3),
53
+ (half)(short)(aux.s4),
54
+ (half)(short)(aux.s5),
55
+ (half)(short)(aux.s6),
56
+ (half)(short)(aux.s7)};
57
+
58
+ aux = aux * (half8)const2 + (half8)h_olow;
59
+
60
+ short8 a;
61
+ short8 b;
62
+ a.s0 = (val.s0 <= h_ilow);
63
+ a.s1 = (val.s1 <= h_ilow);
64
+ a.s2 = (val.s2 <= h_ilow);
65
+ a.s3 = (val.s3 <= h_ilow);
66
+ a.s4 = (val.s4 <= h_ilow);
67
+ a.s5 = (val.s5 <= h_ilow);
68
+ a.s6 = (val.s6 <= h_ilow);
69
+ a.s7 = (val.s7 <= h_ilow);
70
+
71
+ b.s0 = (val.s0 > h_ihigh);
72
+ b.s1 = (val.s1 > h_ihigh);
73
+ b.s2 = (val.s2 > h_ihigh);
74
+ b.s3 = (val.s3 > h_ihigh);
75
+ b.s4 = (val.s4 > h_ihigh);
76
+ b.s5 = (val.s5 > h_ihigh);
77
+ b.s6 = (val.s6 > h_ihigh);
78
+ b.s7 = (val.s7 > h_ihigh);
79
+
80
+ a = ~(a - (short8)1);
81
+ b = ~(b - (short8)1);
82
+
83
+ short8 c1 = (~a & b);
84
+ short8 c2 = (~a & ~b);
85
+
86
+ short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux));
87
+
88
+ *((__local half8 *)dst + w) = as_half8(res);
89
+ }
90
+
91
+ for (int w = W & (~0x7); w < W; w++) {
92
+ half val = src[w];
93
+ short a = val <= h_ilow;
94
+ a = ~(a - 1);
95
+ short b = val > h_ihigh;
96
+ b = ~(b - 1);
97
+
98
+ short c1 = (~a & b);
99
+ short c2 = (~a & ~b);
100
+
101
+ short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh))
102
+ | (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow)));
103
+
104
+ dst[w] = as_half(res);
105
+ }
106
+
107
+ barrier(CLK_LOCAL_MEM_FENCE);
108
+
109
+ event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
110
+ wait_group_events(1, &e2);
111
+ }
openvino/vpu_custom_kernels/grn.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e3dbe5173ca93f39fecaf29f820e1704bcb485affc1a09554e4c86f8de46214
3
+ size 7972