Spaces:
Running
Running
Zhu-FaceOnLive
commited on
Commit
·
81efcf0
1
Parent(s):
675ebfd
Upload 72 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +13 -0
- openvino/cache.json +0 -0
- openvino/libgna.so +3 -0
- openvino/libgna.so.2 +3 -0
- openvino/libgna.so.3.0.0.1455 +3 -0
- openvino/libopenvino.so +3 -0
- openvino/libopenvino_auto_batch_plugin.so +0 -0
- openvino/libopenvino_auto_plugin.so +0 -0
- openvino/libopenvino_c.so +0 -0
- openvino/libopenvino_gapi_preproc.so +3 -0
- openvino/libopenvino_hetero_plugin.so +0 -0
- openvino/libopenvino_intel_cpu_plugin.so +3 -0
- openvino/libopenvino_intel_gna_plugin.so +3 -0
- openvino/libopenvino_intel_hddl_plugin.so +3 -0
- openvino/libopenvino_intel_myriad_plugin.so +3 -0
- openvino/libopenvino_ir_frontend.so +0 -0
- openvino/libopenvino_onnx_frontend.so +3 -0
- openvino/libopenvino_paddle_frontend.so +0 -0
- openvino/libopenvino_tensorflow_fe.so +3 -0
- openvino/pcie-ma2x8x.mvcmd +3 -0
- openvino/plugins.xml +27 -0
- openvino/usb-ma2x8x.mvcmd +3 -0
- openvino/vpu_custom_kernels/binarization.bin +3 -0
- openvino/vpu_custom_kernels/binarization.cl +67 -0
- openvino/vpu_custom_kernels/binary_convolution.bin +3 -0
- openvino/vpu_custom_kernels/binary_convolution.cl +95 -0
- openvino/vpu_custom_kernels/binary_convolution1x1.bin +3 -0
- openvino/vpu_custom_kernels/binary_convolution1x1.cl +117 -0
- openvino/vpu_custom_kernels/binary_convolution3x3.bin +3 -0
- openvino/vpu_custom_kernels/binary_convolution3x3.cl +278 -0
- openvino/vpu_custom_kernels/convolution1x1_chw.bin +3 -0
- openvino/vpu_custom_kernels/convolution1x1_chw.cl +114 -0
- openvino/vpu_custom_kernels/convolution1x1_hwc.bin +3 -0
- openvino/vpu_custom_kernels/convolution1x1_hwc.cl +126 -0
- openvino/vpu_custom_kernels/convolution3x3.bin +3 -0
- openvino/vpu_custom_kernels/convolution3x3.cl +158 -0
- openvino/vpu_custom_kernels/correlate.bin +3 -0
- openvino/vpu_custom_kernels/correlate.cl +453 -0
- openvino/vpu_custom_kernels/ctc.bin +3 -0
- openvino/vpu_custom_kernels/ctc.cl +94 -0
- openvino/vpu_custom_kernels/customLayerBindings.xml +507 -0
- openvino/vpu_custom_kernels/cvtf32f16.bin +3 -0
- openvino/vpu_custom_kernels/cvtf32f16.cl +17 -0
- openvino/vpu_custom_kernels/cvtu8f16.bin +3 -0
- openvino/vpu_custom_kernels/cvtu8f16.cl +48 -0
- openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin +3 -0
- openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl +65 -0
- openvino/vpu_custom_kernels/fakequantize.bin +3 -0
- openvino/vpu_custom_kernels/fakequantize.cl +111 -0
- openvino/vpu_custom_kernels/grn.bin +3 -0
.gitattributes
CHANGED
@@ -34,3 +34,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
facewrapper/libs/libttvfaceengine6.so filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
facewrapper/libs/libttvfaceengine6.so filter=lfs diff=lfs merge=lfs -text
|
37 |
+
openvino/libgna.so filter=lfs diff=lfs merge=lfs -text
|
38 |
+
openvino/libgna.so.2 filter=lfs diff=lfs merge=lfs -text
|
39 |
+
openvino/libgna.so.3.0.0.1455 filter=lfs diff=lfs merge=lfs -text
|
40 |
+
openvino/libopenvino_gapi_preproc.so filter=lfs diff=lfs merge=lfs -text
|
41 |
+
openvino/libopenvino_intel_cpu_plugin.so filter=lfs diff=lfs merge=lfs -text
|
42 |
+
openvino/libopenvino_intel_gna_plugin.so filter=lfs diff=lfs merge=lfs -text
|
43 |
+
openvino/libopenvino_intel_hddl_plugin.so filter=lfs diff=lfs merge=lfs -text
|
44 |
+
openvino/libopenvino_intel_myriad_plugin.so filter=lfs diff=lfs merge=lfs -text
|
45 |
+
openvino/libopenvino_onnx_frontend.so filter=lfs diff=lfs merge=lfs -text
|
46 |
+
openvino/libopenvino_tensorflow_fe.so filter=lfs diff=lfs merge=lfs -text
|
47 |
+
openvino/libopenvino.so filter=lfs diff=lfs merge=lfs -text
|
48 |
+
openvino/pcie-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
|
49 |
+
openvino/usb-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
|
openvino/cache.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
openvino/libgna.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
|
3 |
+
size 3120536
|
openvino/libgna.so.2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
|
3 |
+
size 3120536
|
openvino/libgna.so.3.0.0.1455
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
|
3 |
+
size 3120536
|
openvino/libopenvino.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46
|
3 |
+
size 13495416
|
openvino/libopenvino_auto_batch_plugin.so
ADDED
Binary file (391 kB). View file
|
|
openvino/libopenvino_auto_plugin.so
ADDED
Binary file (371 kB). View file
|
|
openvino/libopenvino_c.so
ADDED
Binary file (305 kB). View file
|
|
openvino/libopenvino_gapi_preproc.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0
|
3 |
+
size 1312920
|
openvino/libopenvino_hetero_plugin.so
ADDED
Binary file (367 kB). View file
|
|
openvino/libopenvino_intel_cpu_plugin.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18
|
3 |
+
size 33299880
|
openvino/libopenvino_intel_gna_plugin.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab
|
3 |
+
size 4067016
|
openvino/libopenvino_intel_hddl_plugin.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b
|
3 |
+
size 5894680
|
openvino/libopenvino_intel_myriad_plugin.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de
|
3 |
+
size 6120168
|
openvino/libopenvino_ir_frontend.so
ADDED
Binary file (343 kB). View file
|
|
openvino/libopenvino_onnx_frontend.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441
|
3 |
+
size 3781640
|
openvino/libopenvino_paddle_frontend.so
ADDED
Binary file (987 kB). View file
|
|
openvino/libopenvino_tensorflow_fe.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9
|
3 |
+
size 2723864
|
openvino/pcie-ma2x8x.mvcmd
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d
|
3 |
+
size 2099248
|
openvino/plugins.xml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<ie>
|
2 |
+
<plugins>
|
3 |
+
<plugin name="AUTO" location="libopenvino_auto_plugin.so">
|
4 |
+
<properties>
|
5 |
+
<property key="MULTI_WORK_MODE_AS_AUTO" value="YES"/>
|
6 |
+
</properties>
|
7 |
+
</plugin>
|
8 |
+
<plugin name="BATCH" location="libopenvino_auto_batch_plugin.so">
|
9 |
+
</plugin>
|
10 |
+
<plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
|
11 |
+
</plugin>
|
12 |
+
<plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
|
13 |
+
</plugin>
|
14 |
+
<plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
|
15 |
+
</plugin>
|
16 |
+
<plugin name="HETERO" location="libopenvino_hetero_plugin.so">
|
17 |
+
</plugin>
|
18 |
+
<plugin name="MULTI" location="libopenvino_auto_plugin.so">
|
19 |
+
</plugin>
|
20 |
+
<plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
|
21 |
+
</plugin>
|
22 |
+
<plugin name="HDDL" location="libopenvino_intel_hddl_plugin.so">
|
23 |
+
</plugin>
|
24 |
+
<plugin name="VPUX" location="libopenvino_intel_vpux_plugin.so">
|
25 |
+
</plugin>
|
26 |
+
</plugins>
|
27 |
+
</ie>
|
openvino/usb-ma2x8x.mvcmd
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e
|
3 |
+
size 2298632
|
openvino/vpu_custom_kernels/binarization.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac
|
3 |
+
size 19632
|
openvino/vpu_custom_kernels/binarization.cl
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void binarization(
|
9 |
+
const __global half *__restrict src_data,
|
10 |
+
const __global half *__restrict input_low_high,
|
11 |
+
const __global half *__restrict dst_data,
|
12 |
+
int switch_out,
|
13 |
+
int input_low_high_size,
|
14 |
+
int W,
|
15 |
+
int H)
|
16 |
+
{
|
17 |
+
__local half local_src[15 * 1024];
|
18 |
+
__local half local_dst[15 * 1024];
|
19 |
+
|
20 |
+
event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
|
21 |
+
wait_group_events(1, &e1);
|
22 |
+
|
23 |
+
int c = get_global_id(2);
|
24 |
+
int C = get_global_size(2);
|
25 |
+
|
26 |
+
half dst_low = switch_out ? 1.h : -1.h;
|
27 |
+
half dst_high = switch_out ? -1.h : 1.h;
|
28 |
+
|
29 |
+
half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
|
30 |
+
|
31 |
+
for (int h = 0; h < H; h++) {
|
32 |
+
|
33 |
+
__local const half *__restrict addr_src = local_src + h * W;
|
34 |
+
__local half *__restrict addr_dst = local_dst + h * W;
|
35 |
+
|
36 |
+
#if 1
|
37 |
+
for (int w = 0; w < W / 8; w++) {
|
38 |
+
|
39 |
+
half8 h_src_val8 = (*((__local half8 *)addr_src + w));
|
40 |
+
|
41 |
+
short8 cond1;
|
42 |
+
cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
|
43 |
+
cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
|
44 |
+
cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
|
45 |
+
cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
|
46 |
+
cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
|
47 |
+
cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
|
48 |
+
cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
|
49 |
+
cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
|
50 |
+
|
51 |
+
cond1 = ~(cond1 - (short8)1);
|
52 |
+
|
53 |
+
short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
|
54 |
+
|
55 |
+
*((__local half8 *)addr_dst + w) = as_half8(res);
|
56 |
+
}
|
57 |
+
#endif
|
58 |
+
for (int w = W & (~0x7); w < W; w++) {
|
59 |
+
addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
|
60 |
+
}
|
61 |
+
}
|
62 |
+
|
63 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
64 |
+
|
65 |
+
event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
|
66 |
+
wait_group_events(1, &e2);
|
67 |
+
}
|
openvino/vpu_custom_kernels/binary_convolution.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a
|
3 |
+
size 10764
|
openvino/vpu_custom_kernels/binary_convolution.cl
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
|
7 |
+
int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
|
8 |
+
|
9 |
+
__kernel void binary_convolution(
|
10 |
+
const __global half *restrict src_data,
|
11 |
+
const __global uchar *restrict weights_data,
|
12 |
+
__global half *restrict dst_data,
|
13 |
+
float pad_value,
|
14 |
+
|
15 |
+
int IW,
|
16 |
+
int IH,
|
17 |
+
int IC,
|
18 |
+
|
19 |
+
int DW,
|
20 |
+
int DH,
|
21 |
+
|
22 |
+
int GC,
|
23 |
+
|
24 |
+
int KW,
|
25 |
+
int KH,
|
26 |
+
|
27 |
+
int PW,
|
28 |
+
int PH,
|
29 |
+
|
30 |
+
int SW,
|
31 |
+
int SH)
|
32 |
+
{
|
33 |
+
int ipad_value = ((pad_value > 0.f) ? 1 : 0);
|
34 |
+
int c = get_global_id(2);
|
35 |
+
int y = get_global_id(1);
|
36 |
+
int x = get_global_id(0);
|
37 |
+
|
38 |
+
int OC = get_global_size(2);
|
39 |
+
int OH = get_global_size(1);
|
40 |
+
int OW = get_global_size(0);
|
41 |
+
|
42 |
+
int KD = 1;
|
43 |
+
int SD = 0;
|
44 |
+
int DD = 0;
|
45 |
+
int PD = 0;
|
46 |
+
int ID = 1;
|
47 |
+
int OD = 1;
|
48 |
+
|
49 |
+
int nbits = 8;
|
50 |
+
|
51 |
+
int g = c % GC;
|
52 |
+
int oc = c / GC;
|
53 |
+
int oh = y;
|
54 |
+
int ow = x;
|
55 |
+
|
56 |
+
for (int od = 0; od < OD; od++) {
|
57 |
+
int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;
|
58 |
+
|
59 |
+
int res = 0;
|
60 |
+
|
61 |
+
for (int ic = 0; ic < IC / GC; ic++) {
|
62 |
+
for (int kd = 0; kd < KD; kd++) {
|
63 |
+
for (int kh = 0; kh < KH; kh++) {
|
64 |
+
for (int kw = 0; kw < KW; kw++) {
|
65 |
+
int widx = g * OC / GC * IC / GC * KD * KH * KW
|
66 |
+
+ oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
|
67 |
+
+ kh * KW + kw;
|
68 |
+
|
69 |
+
int w = extract_weights(weights_data[widx / nbits], (widx % nbits));
|
70 |
+
|
71 |
+
int s;
|
72 |
+
|
73 |
+
int iw = ow * SW - PW + kw * DW;
|
74 |
+
int ih = oh * SH - PH + kh * DH;
|
75 |
+
int id = od * SD - PD + kd * DD;
|
76 |
+
|
77 |
+
if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
|
78 |
+
|| id >= (int)ID) {
|
79 |
+
s = ipad_value;
|
80 |
+
} else {
|
81 |
+
int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
|
82 |
+
+ ih * IW + iw;
|
83 |
+
|
84 |
+
s = ((src_data[iidx] > 0.f) ? 1 : 0);
|
85 |
+
}
|
86 |
+
|
87 |
+
res += s ^ w;
|
88 |
+
}
|
89 |
+
}
|
90 |
+
}
|
91 |
+
}
|
92 |
+
|
93 |
+
dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res);
|
94 |
+
}
|
95 |
+
}
|
openvino/vpu_custom_kernels/binary_convolution1x1.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945
|
3 |
+
size 76316
|
openvino/vpu_custom_kernels/binary_convolution1x1.cl
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
|
9 |
+
|
10 |
+
__kernel void binary_convolution(
|
11 |
+
const __global half *restrict src_data,
|
12 |
+
const __global uchar *restrict weights_data,
|
13 |
+
__global half *restrict dst_data,
|
14 |
+
float pad_value,
|
15 |
+
|
16 |
+
int IW,
|
17 |
+
int IH,
|
18 |
+
int IC,
|
19 |
+
|
20 |
+
int DW,
|
21 |
+
int DH,
|
22 |
+
|
23 |
+
int GC,
|
24 |
+
|
25 |
+
int KW,
|
26 |
+
int KH,
|
27 |
+
|
28 |
+
int PW,
|
29 |
+
int PH,
|
30 |
+
|
31 |
+
int SW,
|
32 |
+
int SH,
|
33 |
+
|
34 |
+
int OW)
|
35 |
+
{
|
36 |
+
__local half src_local[32 * 1024];
|
37 |
+
__local half dst_local[2 * 1024];
|
38 |
+
|
39 |
+
const int oh = get_group_id(0);
|
40 |
+
const int oc = get_group_id(1);
|
41 |
+
const int OH = get_global_size(0);
|
42 |
+
const int OC = get_global_size(1);
|
43 |
+
|
44 |
+
const int gc = oc / (OC / GC);
|
45 |
+
|
46 |
+
if (oh * SH >= 0 && oh * SH <= IH - 1) {
|
47 |
+
const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
|
48 |
+
|
49 |
+
event_t e1 = async_work_group_copy_2D2D(
|
50 |
+
src_local, // dst
|
51 |
+
src, // src
|
52 |
+
IW, // num_elements_per_line,
|
53 |
+
IC / GC, // num_lines,
|
54 |
+
IH * IW - IW, // src_line_stride,
|
55 |
+
0, // dst_line_stride,
|
56 |
+
0);
|
57 |
+
wait_group_events(1, &e1);
|
58 |
+
}
|
59 |
+
|
60 |
+
half pad_value_half = convert_half(pad_value);
|
61 |
+
|
62 |
+
//padding row
|
63 |
+
if (oh * SH > IH - 1) {
|
64 |
+
__local half *dst = src_local;
|
65 |
+
for (int c = 0; c < IC / GC; c++) {
|
66 |
+
#pragma unroll 8
|
67 |
+
for (int j = 0; j < IW; j++) {
|
68 |
+
dst[j] = pad_value_half;
|
69 |
+
}
|
70 |
+
dst += IW;
|
71 |
+
}
|
72 |
+
}
|
73 |
+
|
74 |
+
int OWS = SW * OW;
|
75 |
+
ushort8 in;
|
76 |
+
|
77 |
+
for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
|
78 |
+
ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
|
79 |
+
for (int ic = 0; ic < IC / GC; ++ic) {
|
80 |
+
__local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
|
81 |
+
int weight_pos = oc * IC / GC + ic;
|
82 |
+
ushort w =
|
83 |
+
extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
|
84 |
+
|
85 |
+
if ((ows8 * 8) <= IW - 1) {
|
86 |
+
in = *((__local ushort8 *)(src));
|
87 |
+
}
|
88 |
+
|
89 |
+
//padding column
|
90 |
+
if (ows8 * 8 + 7 > IW - 1) {
|
91 |
+
int boundary = (IW - 1) - ows8 * 8 + 1;
|
92 |
+
boundary = boundary < 0 ? 0 : boundary;
|
93 |
+
for (int offset = boundary; offset < 8; offset++) {
|
94 |
+
*((half *)(&in) + offset) = pad_value_half;
|
95 |
+
}
|
96 |
+
}
|
97 |
+
|
98 |
+
ushort8 w8 = (ushort8)(w);
|
99 |
+
|
100 |
+
ushort8 cond =
|
101 |
+
(((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
|
102 |
+
|
103 |
+
val += (cond ^ w8);
|
104 |
+
}
|
105 |
+
|
106 |
+
ushort8 val_shift = val << 1;
|
107 |
+
int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
|
108 |
+
for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
|
109 |
+
*(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
|
110 |
+
}
|
111 |
+
}
|
112 |
+
|
113 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
114 |
+
|
115 |
+
event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
|
116 |
+
wait_group_events(1, &e2);
|
117 |
+
}
|
openvino/vpu_custom_kernels/binary_convolution3x3.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb
|
3 |
+
size 104136
|
openvino/vpu_custom_kernels/binary_convolution3x3.cl
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
|
9 |
+
|
10 |
+
__kernel void binary_convolution(
|
11 |
+
const __global half *restrict src_data,
|
12 |
+
const __global uchar *restrict weights_data,
|
13 |
+
const __global half *restrict dst_data,
|
14 |
+
float pad_value,
|
15 |
+
|
16 |
+
int IW,
|
17 |
+
int IH,
|
18 |
+
int IC,
|
19 |
+
|
20 |
+
int DW,
|
21 |
+
int DH,
|
22 |
+
|
23 |
+
int GC,
|
24 |
+
|
25 |
+
int KW,
|
26 |
+
int KH,
|
27 |
+
|
28 |
+
int PW,
|
29 |
+
int PH,
|
30 |
+
|
31 |
+
int SW,
|
32 |
+
int SH,
|
33 |
+
|
34 |
+
int OW)
|
35 |
+
{
|
36 |
+
__local half src_local[32 * 1024];
|
37 |
+
__local half dst_local[2 * 1024];
|
38 |
+
|
39 |
+
const int oh = get_group_id(0);
|
40 |
+
const int oc = get_group_id(1);
|
41 |
+
const int OH = get_global_size(0);
|
42 |
+
const int OC = get_global_size(1);
|
43 |
+
|
44 |
+
const int gc = oc / (OC / GC);
|
45 |
+
|
46 |
+
if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
|
47 |
+
{
|
48 |
+
event_t e = async_work_group_copy_3D3D(
|
49 |
+
src_local, // dst
|
50 |
+
src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
|
51 |
+
IW, // num_elements_per_line
|
52 |
+
3, // num_lines
|
53 |
+
DH * IW - IW, // src_line_stride
|
54 |
+
0, // dst_line_stride
|
55 |
+
IC / GC, // num planes
|
56 |
+
IH * IW - 3 * DH * IW, // src plane stride
|
57 |
+
0, // dst plane stride
|
58 |
+
0);
|
59 |
+
wait_group_events(1, &e);
|
60 |
+
} else {
|
61 |
+
int ih = oh * SH - 1;
|
62 |
+
if (ih >= 0 && ih <= IH - 1) //dma for first row
|
63 |
+
{
|
64 |
+
event_t e = async_work_group_copy_2D2D(
|
65 |
+
src_local, // dst
|
66 |
+
src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
|
67 |
+
IW, // num_elements_per_line,
|
68 |
+
IC / GC, // num_lines,
|
69 |
+
IH * IW - IW, // src_line_stride,
|
70 |
+
2 * IW, // dst_line_stride,
|
71 |
+
0);
|
72 |
+
|
73 |
+
wait_group_events(1, &e);
|
74 |
+
}
|
75 |
+
ih = oh * SH - 1 + DH;
|
76 |
+
if (ih >= 0 && ih <= IH - 1) //dma for second row
|
77 |
+
{
|
78 |
+
event_t e = async_work_group_copy_2D2D(
|
79 |
+
src_local + IW, // dst
|
80 |
+
src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
|
81 |
+
IW, // num_elements_per_line,
|
82 |
+
IC / GC, // num_lines,
|
83 |
+
IH * IW - IW, // src_line_stride,
|
84 |
+
2 * IW, // dst_line_stride,
|
85 |
+
0);
|
86 |
+
wait_group_events(1, &e);
|
87 |
+
}
|
88 |
+
ih = oh * SH - 1 + 2 * DH;
|
89 |
+
if (ih >= 0 && ih <= IH - 1) //dma for third row
|
90 |
+
{
|
91 |
+
event_t e = async_work_group_copy_2D2D(
|
92 |
+
src_local + 2 * IW, // dst
|
93 |
+
src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
|
94 |
+
IW, // num_elements_per_line,
|
95 |
+
IC / GC, // num_lines,
|
96 |
+
IH * IW - IW, // src_line_stride,
|
97 |
+
2 * IW, // dst_line_stride,
|
98 |
+
0);
|
99 |
+
wait_group_events(1, &e);
|
100 |
+
}
|
101 |
+
}
|
102 |
+
|
103 |
+
half pad_value_half = convert_half(pad_value);
|
104 |
+
|
105 |
+
//padding row
|
106 |
+
if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
|
107 |
+
__local half *dst = src_local;
|
108 |
+
for (int c = 0; c < IC / GC; c++) {
|
109 |
+
#pragma unroll 8
|
110 |
+
for (int j = 0; j < IW; j++) {
|
111 |
+
dst[j] = pad_value_half;
|
112 |
+
}
|
113 |
+
dst += 3 * IW;
|
114 |
+
}
|
115 |
+
}
|
116 |
+
if (oh * SH + DH - 1 > IH - 1) {
|
117 |
+
__local half *dst = src_local + IW;
|
118 |
+
for (int c = 0; c < IC / GC; c++) {
|
119 |
+
#pragma unroll 8
|
120 |
+
for (int j = 0; j < IW; j++) {
|
121 |
+
dst[j] = pad_value_half;
|
122 |
+
}
|
123 |
+
dst += 3 * IW;
|
124 |
+
}
|
125 |
+
}
|
126 |
+
if (oh * SH + DH + DH - 1 > IH - 1) {
|
127 |
+
__local half *dst = src_local + 2 * IW;
|
128 |
+
for (int c = 0; c < IC / GC; c++) {
|
129 |
+
#pragma unroll 8
|
130 |
+
for (int j = 0; j < IW; j++) {
|
131 |
+
dst[j] = pad_value_half;
|
132 |
+
}
|
133 |
+
dst += 3 * IW;
|
134 |
+
}
|
135 |
+
}
|
136 |
+
|
137 |
+
int OWS = SW * OW;
|
138 |
+
|
139 |
+
ushort8 in00;
|
140 |
+
ushort8 in01;
|
141 |
+
ushort8 in02;
|
142 |
+
ushort8 in10;
|
143 |
+
ushort8 in11;
|
144 |
+
ushort8 in12;
|
145 |
+
ushort8 in20;
|
146 |
+
ushort8 in21;
|
147 |
+
ushort8 in22;
|
148 |
+
|
149 |
+
for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
|
150 |
+
ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
|
151 |
+
for (int ic = 0; ic < IC / GC; ++ic) {
|
152 |
+
__local half *src =
|
153 |
+
(__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
|
154 |
+
int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
|
155 |
+
ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
|
156 |
+
ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
|
157 |
+
ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
|
158 |
+
ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8));
|
159 |
+
ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8));
|
160 |
+
ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8));
|
161 |
+
ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8));
|
162 |
+
ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
|
163 |
+
ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));
|
164 |
+
|
165 |
+
if ((ows8 * 8) - 1 <= IW - 1) {
|
166 |
+
in00 = *((__local ushort8 *)(src - IW - DW));
|
167 |
+
in01 = *((__local ushort8 *)(src - IW));
|
168 |
+
in02 = *((__local ushort8 *)(src - IW + DW));
|
169 |
+
|
170 |
+
in10 = *((__local ushort8 *)(src - DW));
|
171 |
+
in11 = *((__local ushort8 *)(src));
|
172 |
+
in12 = *((__local ushort8 *)(src + DW));
|
173 |
+
|
174 |
+
in20 = *((__local ushort8 *)(src + IW - DW));
|
175 |
+
in21 = *((__local ushort8 *)(src + IW));
|
176 |
+
in22 = *((__local ushort8 *)(src + IW + DW));
|
177 |
+
}
|
178 |
+
|
179 |
+
//padding column
|
180 |
+
if (ows8 * 8 - 1 < 0) {
|
181 |
+
int boundary = 1 - ows8 * 8;
|
182 |
+
boundary = boundary > 8 ? 8 : boundary;
|
183 |
+
for (int offset = 0; offset < boundary; offset++) {
|
184 |
+
*((half *)(&in00) + offset) = pad_value_half;
|
185 |
+
*((half *)(&in10) + offset) = pad_value_half;
|
186 |
+
*((half *)(&in20) + offset) = pad_value_half;
|
187 |
+
}
|
188 |
+
}
|
189 |
+
if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
|
190 |
+
int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
|
191 |
+
boundary = boundary < 0 ? 0 : boundary;
|
192 |
+
for (int offset = boundary; offset < 8; offset++) {
|
193 |
+
*((half *)(&in02) + offset) = pad_value_half;
|
194 |
+
*((half *)(&in12) + offset) = pad_value_half;
|
195 |
+
*((half *)(&in22) + offset) = pad_value_half;
|
196 |
+
}
|
197 |
+
}
|
198 |
+
if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
|
199 |
+
int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
|
200 |
+
boundary = boundary < 0 ? 0 : boundary;
|
201 |
+
for (int offset = boundary; offset < 8; offset++) {
|
202 |
+
*((half *)(&in01) + offset) = pad_value_half;
|
203 |
+
*((half *)(&in11) + offset) = pad_value_half;
|
204 |
+
*((half *)(&in21) + offset) = pad_value_half;
|
205 |
+
}
|
206 |
+
}
|
207 |
+
if ((ows8 * 8 + 7) - 1 > IW - 1) {
|
208 |
+
int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
|
209 |
+
boundary = boundary < 0 ? 0 : boundary;
|
210 |
+
for (int offset = boundary; offset < 8; offset++) {
|
211 |
+
*((half *)(&in00) + offset) = pad_value_half;
|
212 |
+
*((half *)(&in10) + offset) = pad_value_half;
|
213 |
+
*((half *)(&in20) + offset) = pad_value_half;
|
214 |
+
}
|
215 |
+
}
|
216 |
+
|
217 |
+
ushort8 w00 = (ushort8)(w0);
|
218 |
+
ushort8 w01 = (ushort8)(w1);
|
219 |
+
ushort8 w02 = (ushort8)(w2);
|
220 |
+
ushort8 w10 = (ushort8)(w3);
|
221 |
+
ushort8 w11 = (ushort8)(w4);
|
222 |
+
ushort8 w12 = (ushort8)(w5);
|
223 |
+
ushort8 w20 = (ushort8)(w6);
|
224 |
+
ushort8 w21 = (ushort8)(w7);
|
225 |
+
ushort8 w22 = (ushort8)(w8);
|
226 |
+
|
227 |
+
ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
|
228 |
+
(ushort8)(1) :
|
229 |
+
(ushort8)(0);
|
230 |
+
ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
|
231 |
+
(ushort8)(1) :
|
232 |
+
(ushort8)(0);
|
233 |
+
ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
|
234 |
+
(ushort8)(1) :
|
235 |
+
(ushort8)(0);
|
236 |
+
ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
|
237 |
+
(ushort8)(1) :
|
238 |
+
(ushort8)(0);
|
239 |
+
ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
|
240 |
+
(ushort8)(1) :
|
241 |
+
(ushort8)(0);
|
242 |
+
ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
|
243 |
+
(ushort8)(1) :
|
244 |
+
(ushort8)(0);
|
245 |
+
ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
|
246 |
+
(ushort8)(1) :
|
247 |
+
(ushort8)(0);
|
248 |
+
ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
|
249 |
+
(ushort8)(1) :
|
250 |
+
(ushort8)(0);
|
251 |
+
ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
|
252 |
+
(ushort8)(1) :
|
253 |
+
(ushort8)(0);
|
254 |
+
|
255 |
+
val += (cond0 ^ w00);
|
256 |
+
val += (cond1 ^ w01);
|
257 |
+
val += (cond2 ^ w02);
|
258 |
+
val += (cond3 ^ w10);
|
259 |
+
val += (cond4 ^ w11);
|
260 |
+
val += (cond5 ^ w12);
|
261 |
+
val += (cond6 ^ w20);
|
262 |
+
val += (cond7 ^ w21);
|
263 |
+
val += (cond8 ^ w22);
|
264 |
+
}
|
265 |
+
|
266 |
+
ushort8 val_shift = val << 1;
|
267 |
+
int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
|
268 |
+
for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
|
269 |
+
*(dst_local + ow) =
|
270 |
+
(half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
|
271 |
+
}
|
272 |
+
}
|
273 |
+
|
274 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
275 |
+
|
276 |
+
event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
|
277 |
+
wait_group_events(1, &e2);
|
278 |
+
}
|
openvino/vpu_custom_kernels/convolution1x1_chw.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f
|
3 |
+
size 9220
|
openvino/vpu_custom_kernels/convolution1x1_chw.cl
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void Convolution1x1_NCHW(
|
9 |
+
const __global half *in,
|
10 |
+
const __global half *out,
|
11 |
+
const __global half *w,
|
12 |
+
int IW,
|
13 |
+
int IH,
|
14 |
+
int IC,
|
15 |
+
int OW,
|
16 |
+
int OH,
|
17 |
+
int OC)
|
18 |
+
{
|
19 |
+
__local half in_local[8 * 1024];
|
20 |
+
__local half out_local[8 * 1024];
|
21 |
+
|
22 |
+
event_t e1 = async_work_group_copy_2D2D(
|
23 |
+
in_local, // dst
|
24 |
+
in + get_group_id(0) * IW, // src
|
25 |
+
IW, // num_elements_per_line,
|
26 |
+
IC, // num_lines,
|
27 |
+
IW * IH - IW, // src_line_stride,
|
28 |
+
0, // dst_line_stride,
|
29 |
+
0);
|
30 |
+
wait_group_events(1, &e1);
|
31 |
+
|
32 |
+
int oh = get_global_id(0);
|
33 |
+
int oc = get_global_id(1);
|
34 |
+
|
35 |
+
int stride;
|
36 |
+
int write_output = 0;
|
37 |
+
__global half *src;
|
38 |
+
|
39 |
+
__global half8 *w8 = (__global half8 *)(&w[oc * IC]);
|
40 |
+
__global half *w1 = (__global half *)(&w[oc * IC]);
|
41 |
+
|
42 |
+
for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
|
43 |
+
uint iw = ow;
|
44 |
+
uint ih = oh;
|
45 |
+
|
46 |
+
half8 val8_0 = 0.0f;
|
47 |
+
|
48 |
+
__local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
|
49 |
+
__local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
|
50 |
+
__local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
|
51 |
+
__local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
|
52 |
+
__local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
|
53 |
+
__local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
|
54 |
+
__local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
|
55 |
+
__local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
|
56 |
+
|
57 |
+
for (uint ic = 0; ic < IC / 8; ic++) {
|
58 |
+
val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
|
59 |
+
val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
|
60 |
+
val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
|
61 |
+
val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
|
62 |
+
val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
|
63 |
+
val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
|
64 |
+
val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
|
65 |
+
val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
|
66 |
+
}
|
67 |
+
|
68 |
+
for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
|
69 |
+
val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
|
70 |
+
}
|
71 |
+
*((__local half8 *)&out_local[ow + 0]) = (val8_0);
|
72 |
+
}
|
73 |
+
|
74 |
+
uint iw = (OW & (~0x7));
|
75 |
+
uint ih = oh;
|
76 |
+
|
77 |
+
half8 val8_0 = 0.0f;
|
78 |
+
|
79 |
+
__local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
|
80 |
+
__local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
|
81 |
+
__local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
|
82 |
+
__local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
|
83 |
+
__local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
|
84 |
+
__local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
|
85 |
+
__local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
|
86 |
+
__local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
|
87 |
+
|
88 |
+
for (uint ic = 0; ic < IC / 8; ic++) {
|
89 |
+
val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
|
90 |
+
val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
|
91 |
+
val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
|
92 |
+
val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
|
93 |
+
val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
|
94 |
+
val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
|
95 |
+
val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
|
96 |
+
val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
|
97 |
+
}
|
98 |
+
|
99 |
+
for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
|
100 |
+
val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
|
101 |
+
}
|
102 |
+
for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
|
103 |
+
out_local[ow + 0] = (val8_0[ow % 8]);
|
104 |
+
}
|
105 |
+
|
106 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
107 |
+
|
108 |
+
event_t e2 = async_work_group_copy(
|
109 |
+
out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
|
110 |
+
out_local,
|
111 |
+
OW,
|
112 |
+
0);
|
113 |
+
wait_group_events(1, &e2);
|
114 |
+
}
|
openvino/vpu_custom_kernels/convolution1x1_hwc.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c
|
3 |
+
size 4304
|
openvino/vpu_custom_kernels/convolution1x1_hwc.cl
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void Convolution1x1_NHWC(
|
9 |
+
const __global half *in,
|
10 |
+
const __global half *out,
|
11 |
+
const __global half *w,
|
12 |
+
int IW,
|
13 |
+
int IH,
|
14 |
+
int IC,
|
15 |
+
int OW,
|
16 |
+
int OH,
|
17 |
+
int OC)
|
18 |
+
{
|
19 |
+
|
20 |
+
__local half in_local[8 * 1024];
|
21 |
+
__local half out_local[8 * 1024];
|
22 |
+
|
23 |
+
const int sizeAct = IW * IC;
|
24 |
+
|
25 |
+
event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
|
26 |
+
wait_group_events(1, &e1);
|
27 |
+
|
28 |
+
int oh = get_global_id(0);
|
29 |
+
int oc = get_global_id(1);
|
30 |
+
|
31 |
+
int stride;
|
32 |
+
int write_output = 0;
|
33 |
+
__global half *src;
|
34 |
+
|
35 |
+
__global half8 *w8 = (__global half8 *)(&w[oc * IC]);
|
36 |
+
__global half *w1 = (__global half *)(&w[oc * IC]);
|
37 |
+
|
38 |
+
for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
|
39 |
+
uint iw = ow;
|
40 |
+
uint ih = oh;
|
41 |
+
|
42 |
+
half8 val8_0 = 0.0f;
|
43 |
+
half8 val8_1 = 0.0f;
|
44 |
+
half8 val8_2 = 0.0f;
|
45 |
+
half8 val8_3 = 0.0f;
|
46 |
+
half8 val8_4 = 0.0f;
|
47 |
+
half8 val8_5 = 0.0f;
|
48 |
+
half8 val8_6 = 0.0f;
|
49 |
+
half8 val8_7 = 0.0f;
|
50 |
+
|
51 |
+
__local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
|
52 |
+
__local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
|
53 |
+
__local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
|
54 |
+
__local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
|
55 |
+
__local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
|
56 |
+
__local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
|
57 |
+
__local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
|
58 |
+
__local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
|
59 |
+
|
60 |
+
for (uint ic = 0; ic < IC / 8; ++ic) {
|
61 |
+
val8_0 += (in8_0[ic]) * (w8[ic]);
|
62 |
+
val8_1 += (in8_1[ic]) * (w8[ic]);
|
63 |
+
val8_2 += (in8_2[ic]) * (w8[ic]);
|
64 |
+
val8_3 += (in8_3[ic]) * (w8[ic]);
|
65 |
+
val8_4 += (in8_4[ic]) * (w8[ic]);
|
66 |
+
val8_5 += (in8_5[ic]) * (w8[ic]);
|
67 |
+
val8_6 += (in8_6[ic]) * (w8[ic]);
|
68 |
+
val8_7 += (in8_7[ic]) * (w8[ic]);
|
69 |
+
}
|
70 |
+
|
71 |
+
half val_0 = 0.0f;
|
72 |
+
half val_1 = 0.0f;
|
73 |
+
half val_2 = 0.0f;
|
74 |
+
half val_3 = 0.0f;
|
75 |
+
half val_4 = 0.0f;
|
76 |
+
half val_5 = 0.0f;
|
77 |
+
half val_6 = 0.0f;
|
78 |
+
half val_7 = 0.0f;
|
79 |
+
for (uint ic = IC & (~0x7); ic < IC; ++ic) {
|
80 |
+
val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic));
|
81 |
+
val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic));
|
82 |
+
val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic));
|
83 |
+
val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic));
|
84 |
+
val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic));
|
85 |
+
val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic));
|
86 |
+
val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic));
|
87 |
+
val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic));
|
88 |
+
}
|
89 |
+
out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
|
90 |
+
out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
|
91 |
+
out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
|
92 |
+
out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
|
93 |
+
out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
|
94 |
+
out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
|
95 |
+
out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
|
96 |
+
out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
|
97 |
+
}
|
98 |
+
for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
|
99 |
+
|
100 |
+
uint iw = ow;
|
101 |
+
uint ih = oh;
|
102 |
+
|
103 |
+
half8 val8 = 0.0f;
|
104 |
+
|
105 |
+
__local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
|
106 |
+
|
107 |
+
for (uint ic = 0; ic < IC / 8; ++ic) {
|
108 |
+
val8 += (in8[ic]) * (w8[ic]);
|
109 |
+
}
|
110 |
+
|
111 |
+
half val = 0.0f;
|
112 |
+
for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
|
113 |
+
val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic));
|
114 |
+
}
|
115 |
+
out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
|
116 |
+
}
|
117 |
+
|
118 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
119 |
+
|
120 |
+
event_t e2 = async_work_group_copy(
|
121 |
+
out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
|
122 |
+
out_local,
|
123 |
+
OW,
|
124 |
+
0);
|
125 |
+
wait_group_events(1, &e2);
|
126 |
+
}
|
openvino/vpu_custom_kernels/convolution3x3.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:021bb40840ff35506972e6f6a7dea1b5f40a8db0927aaa9a6c116b152e386851
|
3 |
+
size 5748
|
openvino/vpu_custom_kernels/convolution3x3.cl
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void Convolution3x3(
|
9 |
+
const __global half *in_param,
|
10 |
+
const __global half *out,
|
11 |
+
const __global half *w,
|
12 |
+
int IW,
|
13 |
+
int IH,
|
14 |
+
int IC,
|
15 |
+
int OW,
|
16 |
+
int OH,
|
17 |
+
int OC,
|
18 |
+
int KX,
|
19 |
+
int KY,
|
20 |
+
int stride_x,
|
21 |
+
int stride_y,
|
22 |
+
int pad_x,
|
23 |
+
int pad_y,
|
24 |
+
int dilation_x,
|
25 |
+
int dilation_y)
|
26 |
+
{
|
27 |
+
__local half in_local[8 * 1024];
|
28 |
+
__local half out_local[8 * 1024];
|
29 |
+
__local half w_local[8 * 1024];
|
30 |
+
|
31 |
+
const int sizePlane = IW * IH;
|
32 |
+
event_t e1 = async_work_group_copy_2D2D(
|
33 |
+
in_local, // dst
|
34 |
+
in_param + get_group_id(0) * stride_y * IW, // src
|
35 |
+
3 * IW, // num_elements_per_line,
|
36 |
+
IC, // num_lines,
|
37 |
+
IW * IH - 3 * IW, // src_line_stride,
|
38 |
+
0, // dst_line_stride,
|
39 |
+
0);
|
40 |
+
wait_group_events(1, &e1);
|
41 |
+
|
42 |
+
const int sizeWeight = IC * 3 * 3;
|
43 |
+
e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
|
44 |
+
wait_group_events(1, &e1);
|
45 |
+
|
46 |
+
int oh = get_global_id(0);
|
47 |
+
int oc = get_global_id(1);
|
48 |
+
|
49 |
+
__local half *in = (__local half *)in_local + 1;
|
50 |
+
|
51 |
+
int stride;
|
52 |
+
int write_output = 0;
|
53 |
+
__local half *src;
|
54 |
+
|
55 |
+
if ((stride_x == 1) && (stride_y == 1)) {
|
56 |
+
stride = OW / 8;
|
57 |
+
write_output = 1;
|
58 |
+
}
|
59 |
+
if ((stride_x == 2) && (stride_y == 2)) {
|
60 |
+
stride = OW / 4;
|
61 |
+
write_output = 2;
|
62 |
+
}
|
63 |
+
|
64 |
+
for (int ow = 0; ow < stride; ow++) {
|
65 |
+
float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
|
66 |
+
for (int ic = 0; ic < IC; ++ic) {
|
67 |
+
src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
|
68 |
+
__local half *k = (__local half *)(w_local + ic * 3 * 3);
|
69 |
+
|
70 |
+
half8 aux_in00 = *((__local half8 *)src - 1);
|
71 |
+
half8 aux_in01 = *((__local half8 *)src + 0);
|
72 |
+
half8 aux_in02 = *((__local half8 *)src + 1);
|
73 |
+
half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
|
74 |
+
half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
|
75 |
+
half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
|
76 |
+
half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
|
77 |
+
half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
|
78 |
+
half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);
|
79 |
+
|
80 |
+
short8 in00 = *((short8 *)&aux_in00);
|
81 |
+
short8 in01 = *((short8 *)&aux_in01);
|
82 |
+
short8 in02 = *((short8 *)&aux_in02);
|
83 |
+
short8 in10 = *((short8 *)&aux_in10);
|
84 |
+
short8 in11 = *((short8 *)&aux_in11);
|
85 |
+
short8 in12 = *((short8 *)&aux_in12);
|
86 |
+
short8 in20 = *((short8 *)&aux_in20);
|
87 |
+
short8 in21 = *((short8 *)&aux_in21);
|
88 |
+
short8 in22 = *((short8 *)&aux_in22);
|
89 |
+
|
90 |
+
short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
|
91 |
+
short8 aux_aux01 = in01;
|
92 |
+
short8 aux_aux02 = __builtin_shave_cmu_alignvec_rri_short8(in01, in02, 2);
|
93 |
+
short8 aux_aux10 = __builtin_shave_cmu_alignvec_rri_short8(in10, in11, 14);
|
94 |
+
short8 aux_aux11 = in11;
|
95 |
+
short8 aux_aux12 = __builtin_shave_cmu_alignvec_rri_short8(in11, in12, 2);
|
96 |
+
short8 aux_aux20 = __builtin_shave_cmu_alignvec_rri_short8(in20, in21, 14);
|
97 |
+
short8 aux_aux21 = in21;
|
98 |
+
short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);
|
99 |
+
|
100 |
+
half8 aux00 = *((half8 *)&aux_aux00);
|
101 |
+
half8 aux01 = *((half8 *)&aux_aux01);
|
102 |
+
half8 aux02 = *((half8 *)&aux_aux02);
|
103 |
+
half8 aux10 = *((half8 *)&aux_aux10);
|
104 |
+
half8 aux11 = *((half8 *)&aux_aux11);
|
105 |
+
half8 aux12 = *((half8 *)&aux_aux12);
|
106 |
+
half8 aux20 = *((half8 *)&aux_aux20);
|
107 |
+
half8 aux21 = *((half8 *)&aux_aux21);
|
108 |
+
half8 aux22 = *((half8 *)&aux_aux22);
|
109 |
+
|
110 |
+
half8 w00 = (half8)(*(k + 0));
|
111 |
+
half8 w01 = (half8)(*(k + 1));
|
112 |
+
half8 w02 = (half8)(*(k + 2));
|
113 |
+
half8 w10 = (half8)(*(k + 3));
|
114 |
+
half8 w11 = (half8)(*(k + 4));
|
115 |
+
half8 w12 = (half8)(*(k + 5));
|
116 |
+
half8 w20 = (half8)(*(k + 6));
|
117 |
+
half8 w21 = (half8)(*(k + 7));
|
118 |
+
half8 w22 = (half8)(*(k + 8));
|
119 |
+
|
120 |
+
val += convert_float8(aux00) * convert_float8(w00);
|
121 |
+
val += convert_float8(aux01) * convert_float8(w01);
|
122 |
+
val += convert_float8(aux02) * convert_float8(w02);
|
123 |
+
val += convert_float8(aux10) * convert_float8(w10);
|
124 |
+
val += convert_float8(aux11) * convert_float8(w11);
|
125 |
+
val += convert_float8(aux12) * convert_float8(w12);
|
126 |
+
val += convert_float8(aux20) * convert_float8(w20);
|
127 |
+
val += convert_float8(aux21) * convert_float8(w21);
|
128 |
+
val += convert_float8(aux22) * convert_float8(w22);
|
129 |
+
}
|
130 |
+
if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246);
|
131 |
+
if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val);
|
132 |
+
}
|
133 |
+
|
134 |
+
for (int ow = OW & ~(0x7); ow < OW; ow++) {
|
135 |
+
float val = 0.0f;
|
136 |
+
for (int ic = 0; ic < IC; ++ic) {
|
137 |
+
for (int ky = 0; ky < 3; ++ky) {
|
138 |
+
for (int kx = 0; kx < 3; ++kx) {
|
139 |
+
int iw = ow * stride_x - pad_x + kx * dilation_x;
|
140 |
+
int ih = oh * stride_y - pad_y + ky * dilation_y;
|
141 |
+
|
142 |
+
val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
|
143 |
+
* convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
|
144 |
+
}
|
145 |
+
}
|
146 |
+
}
|
147 |
+
out_local[ow] = convert_half(val);
|
148 |
+
}
|
149 |
+
|
150 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
151 |
+
|
152 |
+
event_t e2 = async_work_group_copy(
|
153 |
+
out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
|
154 |
+
out_local,
|
155 |
+
OW,
|
156 |
+
0);
|
157 |
+
wait_group_events(1, &e2);
|
158 |
+
}
|
openvino/vpu_custom_kernels/correlate.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e2b24b1b5bfd1786128682ee814230653b4b63aad5b472feec9c6f4a4c833e2f
|
3 |
+
size 14336
|
openvino/vpu_custom_kernels/correlate.cl
ADDED
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
|
7 |
+
#define MAX_OPENCL_BUFF_SIZE 64 * 1024
|
8 |
+
|
9 |
+
#define USE_DMA 1
|
10 |
+
|
11 |
+
#if defined(USE_DMA)
|
12 |
+
void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride)
|
13 |
+
{
|
14 |
+
item_dma_event_t copyEvent =
|
15 |
+
WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
|
16 |
+
WaitWorkItemDmaEvents(1, ©Event);
|
17 |
+
}
|
18 |
+
|
19 |
+
void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride)
|
20 |
+
{
|
21 |
+
item_dma_event_t copyEvent =
|
22 |
+
WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
|
23 |
+
WaitWorkItemDmaEvents(1, ©Event);
|
24 |
+
}
|
25 |
+
#endif
|
26 |
+
|
27 |
+
void memzero(void *ptr, size_t num)
|
28 |
+
{
|
29 |
+
float4 *line0_ = (float4 *)ptr;
|
30 |
+
#pragma unroll 16
|
31 |
+
for (int i = 0; i < num / 16; i++) {
|
32 |
+
line0_[i] = (float4){0.f, 0.f, 0.f, 0.f};
|
33 |
+
}
|
34 |
+
uchar *ptr_ = (uchar *)ptr;
|
35 |
+
for (int i = num / 16 * 16; i < num; i++) {
|
36 |
+
ptr_[i] = 0;
|
37 |
+
}
|
38 |
+
}
|
39 |
+
|
40 |
+
void __attribute__((noinline)) crosscorrh(
|
41 |
+
__private const half *restrict line0,
|
42 |
+
__private const half *restrict line1,
|
43 |
+
__private half *restrict dline,
|
44 |
+
int topwidth,
|
45 |
+
int max_displacement,
|
46 |
+
int neighborhood_grid_radius,
|
47 |
+
int kernel_size,
|
48 |
+
int padding,
|
49 |
+
int bottomwidth,
|
50 |
+
int stride1,
|
51 |
+
int stride2,
|
52 |
+
int max_channels,
|
53 |
+
int cur_subchannels)
|
54 |
+
{
|
55 |
+
if (max_channels == 64) {
|
56 |
+
for (int i = 0; i < kernel_size; i++) {
|
57 |
+
int x1 = max_displacement - padding + i;
|
58 |
+
int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1;
|
59 |
+
x1 += offset1 * stride1;
|
60 |
+
|
61 |
+
for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) {
|
62 |
+
int x2 = x1 - neighborhood_grid_radius * stride2;
|
63 |
+
int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2;
|
64 |
+
x2 += offset2 * stride2;
|
65 |
+
|
66 |
+
for (int top_channel_x = offset2 - neighborhood_grid_radius;
|
67 |
+
top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth;
|
68 |
+
top_channel_x++, x2 += stride2) {
|
69 |
+
half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
|
70 |
+
|
71 |
+
half8 *src0 = (half8 *)(line0 + x1 * max_channels);
|
72 |
+
half8 *src1 = (half8 *)(line1 + x2 * max_channels);
|
73 |
+
|
74 |
+
#pragma unroll 8
|
75 |
+
for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]);
|
76 |
+
|
77 |
+
half sum = __builtin_shave_sau_sumx_f16_r(sum4);
|
78 |
+
dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum);
|
79 |
+
}
|
80 |
+
}
|
81 |
+
}
|
82 |
+
} else {
|
83 |
+
int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;
|
84 |
+
|
85 |
+
for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) {
|
86 |
+
for (int i = 0; i < kernel_size; i++) {
|
87 |
+
int x1 = blockIdx_x * stride1 + max_displacement + i - padding;
|
88 |
+
|
89 |
+
if ((x1 >= 0) && (x1 < bottomwidth)) {
|
90 |
+
int o_min = -neighborhood_grid_radius * stride2;
|
91 |
+
int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2;
|
92 |
+
if ((o_min) < (-x1)) {
|
93 |
+
o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
|
94 |
+
}
|
95 |
+
if ((o_max) >= (bottomwidth + stride2 - x1)) {
|
96 |
+
o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
|
97 |
+
}
|
98 |
+
|
99 |
+
int o = o_min;
|
100 |
+
for (; o <= o_max - 4 * stride2; o += 4 * stride2) {
|
101 |
+
half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
|
102 |
+
half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels);
|
103 |
+
half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels);
|
104 |
+
half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels);
|
105 |
+
half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels);
|
106 |
+
|
107 |
+
int c = 0;
|
108 |
+
|
109 |
+
half8 sum40 = 0;
|
110 |
+
half8 sum41 = 0;
|
111 |
+
half8 sum42 = 0;
|
112 |
+
half8 sum43 = 0;
|
113 |
+
|
114 |
+
for (; c <= cur_subchannels / 8 - 4; c += 4) {
|
115 |
+
sum40 += bottom0[c + 0] * bottom1_0[c + 0];
|
116 |
+
sum40 += bottom0[c + 1] * bottom1_0[c + 1];
|
117 |
+
sum40 += bottom0[c + 2] * bottom1_0[c + 2];
|
118 |
+
sum40 += bottom0[c + 3] * bottom1_0[c + 3];
|
119 |
+
|
120 |
+
sum41 += bottom0[c + 0] * bottom1_1[c + 0];
|
121 |
+
sum41 += bottom0[c + 1] * bottom1_1[c + 1];
|
122 |
+
sum41 += bottom0[c + 2] * bottom1_1[c + 2];
|
123 |
+
sum41 += bottom0[c + 3] * bottom1_1[c + 3];
|
124 |
+
|
125 |
+
sum42 += bottom0[c + 0] * bottom1_2[c + 0];
|
126 |
+
sum42 += bottom0[c + 1] * bottom1_2[c + 1];
|
127 |
+
sum42 += bottom0[c + 2] * bottom1_2[c + 2];
|
128 |
+
sum42 += bottom0[c + 3] * bottom1_2[c + 3];
|
129 |
+
|
130 |
+
sum43 += bottom0[c + 0] * bottom1_3[c + 0];
|
131 |
+
sum43 += bottom0[c + 1] * bottom1_3[c + 1];
|
132 |
+
sum43 += bottom0[c + 2] * bottom1_3[c + 2];
|
133 |
+
sum43 += bottom0[c + 3] * bottom1_3[c + 3];
|
134 |
+
}
|
135 |
+
|
136 |
+
for (; c < cur_subchannels / 8; c++) {
|
137 |
+
sum40 += bottom0[c] * bottom1_0[c];
|
138 |
+
sum41 += bottom0[c] * bottom1_1[c];
|
139 |
+
sum42 += bottom0[c] * bottom1_2[c];
|
140 |
+
sum43 += bottom0[c] * bottom1_3[c];
|
141 |
+
}
|
142 |
+
|
143 |
+
half sum0 = __builtin_shave_sau_sumx_f16_r(sum40);
|
144 |
+
half sum1 = __builtin_shave_sau_sumx_f16_r(sum41);
|
145 |
+
half sum2 = __builtin_shave_sau_sumx_f16_r(sum42);
|
146 |
+
half sum3 = __builtin_shave_sau_sumx_f16_r(sum43);
|
147 |
+
|
148 |
+
for (c = c * 8; c < cur_subchannels; c++) {
|
149 |
+
sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c];
|
150 |
+
sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c];
|
151 |
+
sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c];
|
152 |
+
sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c];
|
153 |
+
}
|
154 |
+
|
155 |
+
dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] +=
|
156 |
+
sum0;
|
157 |
+
dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] +=
|
158 |
+
sum1;
|
159 |
+
dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] +=
|
160 |
+
sum2;
|
161 |
+
dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] +=
|
162 |
+
sum3;
|
163 |
+
}
|
164 |
+
|
165 |
+
for (; o < o_max; o += 1 * stride2) {
|
166 |
+
half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
|
167 |
+
half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels);
|
168 |
+
|
169 |
+
int c = 0;
|
170 |
+
|
171 |
+
half8 sum4 = 0;
|
172 |
+
for (; c <= cur_subchannels / 8 - 4; c += 4) {
|
173 |
+
sum4 += bottom0[c + 0] * bottom1[c + 0];
|
174 |
+
sum4 += bottom0[c + 1] * bottom1[c + 1];
|
175 |
+
sum4 += bottom0[c + 2] * bottom1[c + 2];
|
176 |
+
sum4 += bottom0[c + 3] * bottom1[c + 3];
|
177 |
+
}
|
178 |
+
for (; c < cur_subchannels / 8; c++) {
|
179 |
+
sum4 += bottom0[c] * bottom1[c];
|
180 |
+
}
|
181 |
+
|
182 |
+
half sum = __builtin_shave_sau_sumx_f16_r(sum4);
|
183 |
+
|
184 |
+
for (c = c * 8; c < cur_subchannels; c++) {
|
185 |
+
sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
|
186 |
+
}
|
187 |
+
|
188 |
+
dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum;
|
189 |
+
}
|
190 |
+
}
|
191 |
+
}
|
192 |
+
}
|
193 |
+
}
|
194 |
+
}
|
195 |
+
|
196 |
+
__kernel void correlate2_half(
|
197 |
+
__global const half *restrict bottom0,
|
198 |
+
__global const half *restrict bottom1,
|
199 |
+
__global half *restrict top,
|
200 |
+
int topwidth,
|
201 |
+
int topheight,
|
202 |
+
int bottomwidth,
|
203 |
+
int bottomheight,
|
204 |
+
int bottomchannels,
|
205 |
+
int max_displacement,
|
206 |
+
int padding,
|
207 |
+
int neighborhood_grid_radius,
|
208 |
+
int neighborhood_grid_width,
|
209 |
+
int kernel_size,
|
210 |
+
int stride1,
|
211 |
+
int stride2)
|
212 |
+
{
|
213 |
+
int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
|
214 |
+
if (max_channels > 64) max_channels = 64;
|
215 |
+
int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
|
216 |
+
int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count;
|
217 |
+
if (subchannels < max_channels) subchannels = max_channels;
|
218 |
+
|
219 |
+
const int sumelems = kernel_size * kernel_size * bottomchannels;
|
220 |
+
|
221 |
+
__private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];
|
222 |
+
|
223 |
+
__private half *line0 = cmx;
|
224 |
+
__private half *line1 = line0 + bottomwidth * subchannels;
|
225 |
+
__private half *dline = line1 + bottomwidth * subchannels;
|
226 |
+
|
227 |
+
int blockIdx_y = get_global_id(0);
|
228 |
+
|
229 |
+
#if defined(USE_DMA)
|
230 |
+
__private half *dmabuf = dline + topwidth * neighborhood_grid_width;
|
231 |
+
#endif
|
232 |
+
|
233 |
+
int y1 = blockIdx_y * stride1 + max_displacement;
|
234 |
+
|
235 |
+
for (int j = 0; j < kernel_size; j++) {
|
236 |
+
for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) {
|
237 |
+
// configure channel batching
|
238 |
+
int startchannel = bottomchannel;
|
239 |
+
int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels;
|
240 |
+
int deltachannels = endchannel - startchannel;
|
241 |
+
|
242 |
+
// load line form blob 0 with repackaging
|
243 |
+
if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) {
|
244 |
+
#if defined(USE_DMA)
|
245 |
+
__global const half *curr =
|
246 |
+
bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth;
|
247 |
+
dmacpyLineSrcStrideStart(
|
248 |
+
curr,
|
249 |
+
dmabuf,
|
250 |
+
bottomwidth * deltachannels * sizeof(half),
|
251 |
+
bottomwidth * sizeof(half),
|
252 |
+
bottomwidth * bottomheight * sizeof(half));
|
253 |
+
|
254 |
+
for (int ch = 0; ch < deltachannels; ch++) {
|
255 |
+
for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
|
256 |
+
half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
|
257 |
+
line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
|
258 |
+
line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
|
259 |
+
line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
|
260 |
+
line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
|
261 |
+
|
262 |
+
line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
|
263 |
+
line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
|
264 |
+
line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
|
265 |
+
line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
|
266 |
+
}
|
267 |
+
|
268 |
+
for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
|
269 |
+
line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
|
270 |
+
}
|
271 |
+
}
|
272 |
+
|
273 |
+
if (deltachannels < subchannels)
|
274 |
+
for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
|
275 |
+
memzero(
|
276 |
+
line0 + blockIdx_x * max_channels + deltachannels,
|
277 |
+
(subchannels - deltachannels) * sizeof(half));
|
278 |
+
#else
|
279 |
+
for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
|
280 |
+
for (int ch = 0; ch < deltachannels; ch++)
|
281 |
+
line0[blockIdx_x * max_channels + ch] = bottom0
|
282 |
+
[(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth
|
283 |
+
+ blockIdx_x];
|
284 |
+
|
285 |
+
if (deltachannels < subchannels)
|
286 |
+
memzero(
|
287 |
+
line0 + blockIdx_x * max_channels + deltachannels,
|
288 |
+
(subchannels - deltachannels) * sizeof(half));
|
289 |
+
}
|
290 |
+
#endif
|
291 |
+
} else
|
292 |
+
memzero(line0, max_channels * bottomwidth * sizeof(half));
|
293 |
+
|
294 |
+
for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
|
295 |
+
int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;
|
296 |
+
|
297 |
+
if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) {
|
298 |
+
#if defined(USE_DMA)
|
299 |
+
__global const half *curr =
|
300 |
+
bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth;
|
301 |
+
dmacpyLineSrcStrideStart(
|
302 |
+
curr,
|
303 |
+
dmabuf,
|
304 |
+
bottomwidth * deltachannels * sizeof(half),
|
305 |
+
bottomwidth * sizeof(half),
|
306 |
+
bottomwidth * bottomheight * sizeof(half));
|
307 |
+
|
308 |
+
for (int ch = 0; ch < deltachannels; ch++) {
|
309 |
+
for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
|
310 |
+
half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
|
311 |
+
line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
|
312 |
+
line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
|
313 |
+
line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
|
314 |
+
line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
|
315 |
+
|
316 |
+
line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
|
317 |
+
line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
|
318 |
+
line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
|
319 |
+
line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
|
320 |
+
}
|
321 |
+
|
322 |
+
for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
|
323 |
+
line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
|
324 |
+
}
|
325 |
+
}
|
326 |
+
#else
|
327 |
+
for (int ch = 0; ch < deltachannels; ch++) {
|
328 |
+
for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
|
329 |
+
half8 val = ((
|
330 |
+
__global half8
|
331 |
+
*)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth))
|
332 |
+
[blockIdx_x];
|
333 |
+
line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
|
334 |
+
line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
|
335 |
+
line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
|
336 |
+
line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
|
337 |
+
|
338 |
+
line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
|
339 |
+
line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
|
340 |
+
line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
|
341 |
+
line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
|
342 |
+
}
|
343 |
+
for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
|
344 |
+
half val =
|
345 |
+
(bottom1 + (ch + startchannel) * bottomheight * bottomwidth
|
346 |
+
+ (y2 + j - padding) * bottomwidth)[blockIdx_x];
|
347 |
+
line1[(blockIdx_x)*max_channels + ch] = val;
|
348 |
+
}
|
349 |
+
}
|
350 |
+
#endif
|
351 |
+
for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
|
352 |
+
if (deltachannels < subchannels)
|
353 |
+
memzero(
|
354 |
+
line1 + blockIdx_x * max_channels + deltachannels,
|
355 |
+
(subchannels - deltachannels) * sizeof(half));
|
356 |
+
}
|
357 |
+
} else
|
358 |
+
memzero(line1, max_channels * bottomwidth * sizeof(half));
|
359 |
+
|
360 |
+
if (j == 0 && startchannel == 0) {
|
361 |
+
memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
|
362 |
+
} else {
|
363 |
+
#if defined(USE_DMA)
|
364 |
+
dmacpyLineSrcStrideStart(
|
365 |
+
top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
|
366 |
+
dline,
|
367 |
+
topwidth * neighborhood_grid_width * sizeof(half),
|
368 |
+
topwidth * sizeof(half),
|
369 |
+
topwidth * topheight * sizeof(half));
|
370 |
+
#else
|
371 |
+
for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
|
372 |
+
for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
|
373 |
+
half8 val = ((
|
374 |
+
__global half8
|
375 |
+
*)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
|
376 |
+
[blockIdx_x];
|
377 |
+
((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val;
|
378 |
+
}
|
379 |
+
for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
|
380 |
+
dline[top_channel_x * topwidth + blockIdx_x] =
|
381 |
+
top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
|
382 |
+
+ blockIdx_y * topwidth + blockIdx_x];
|
383 |
+
}
|
384 |
+
}
|
385 |
+
#endif
|
386 |
+
}
|
387 |
+
|
388 |
+
if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0
|
389 |
+
&& y2 + j - padding < bottomheight) {
|
390 |
+
crosscorrh(
|
391 |
+
line0,
|
392 |
+
line1,
|
393 |
+
dline,
|
394 |
+
topwidth,
|
395 |
+
max_displacement,
|
396 |
+
neighborhood_grid_radius,
|
397 |
+
kernel_size,
|
398 |
+
padding,
|
399 |
+
bottomwidth,
|
400 |
+
stride1,
|
401 |
+
stride2,
|
402 |
+
max_channels,
|
403 |
+
subchannels);
|
404 |
+
}
|
405 |
+
|
406 |
+
if (j == kernel_size - 1 && endchannel == bottomchannels) {
|
407 |
+
half8 scale = (half8){
|
408 |
+
(half)sumelems,
|
409 |
+
(half)sumelems,
|
410 |
+
(half)sumelems,
|
411 |
+
(half)sumelems,
|
412 |
+
(half)sumelems,
|
413 |
+
(half)sumelems,
|
414 |
+
(half)sumelems,
|
415 |
+
(half)sumelems};
|
416 |
+
for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
|
417 |
+
for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
|
418 |
+
((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] =
|
419 |
+
((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale;
|
420 |
+
}
|
421 |
+
for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
|
422 |
+
dline[top_channel_x * topwidth + blockIdx_x] =
|
423 |
+
dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems;
|
424 |
+
}
|
425 |
+
}
|
426 |
+
}
|
427 |
+
|
428 |
+
#if defined(USE_DMA)
|
429 |
+
dmacpyLineDstStrideStart(
|
430 |
+
dline,
|
431 |
+
top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
|
432 |
+
topwidth * neighborhood_grid_width * sizeof(half),
|
433 |
+
topwidth * sizeof(half),
|
434 |
+
topwidth * topheight * sizeof(half));
|
435 |
+
#else
|
436 |
+
for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
|
437 |
+
for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
|
438 |
+
((__global half8
|
439 |
+
*)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
|
440 |
+
[blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x]
|
441 |
+
+ (half8){0, 0, 0, 0, 0, 0, 0, 0};
|
442 |
+
}
|
443 |
+
for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
|
444 |
+
top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
|
445 |
+
+ blockIdx_y * topwidth + blockIdx_x] =
|
446 |
+
dline[top_channel_x * topwidth + blockIdx_x] + (half)0;
|
447 |
+
}
|
448 |
+
}
|
449 |
+
#endif
|
450 |
+
}
|
451 |
+
}
|
452 |
+
}
|
453 |
+
}
|
openvino/vpu_custom_kernels/ctc.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:292de0fbb8dc6ead6970576d1b9a26a323fc9febfceb92c3af6b84496d523def
|
3 |
+
size 10196
|
openvino/vpu_custom_kernels/ctc.cl
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__global half *find(__global const half *begin, __global const half *end, half value)
|
9 |
+
{
|
10 |
+
while (begin != end) {
|
11 |
+
if (*begin == value) {
|
12 |
+
return begin;
|
13 |
+
}
|
14 |
+
++begin;
|
15 |
+
}
|
16 |
+
return end;
|
17 |
+
}
|
18 |
+
|
19 |
+
__kernel void CTCDecoder(
|
20 |
+
__global half *restrict probabilities,
|
21 |
+
__global half *restrict sequence_indicators,
|
22 |
+
__global half *restrict output,
|
23 |
+
int width,
|
24 |
+
int height,
|
25 |
+
int channels)
|
26 |
+
{
|
27 |
+
__local half local_src[88 * 1 * 77];
|
28 |
+
__local half local_dst[88 * 1];
|
29 |
+
|
30 |
+
event_t e1 = async_work_group_copy_2D2D(
|
31 |
+
local_src, // dst
|
32 |
+
probabilities, // src
|
33 |
+
width, // num_elements_per_line,
|
34 |
+
height * channels, // num_lines,
|
35 |
+
width * (height - 1), // src_line_stride,
|
36 |
+
width * (height - 1), // dst_line_stride,
|
37 |
+
0);
|
38 |
+
|
39 |
+
wait_group_events(1, &e1);
|
40 |
+
|
41 |
+
const int T = channels; // Time
|
42 |
+
const int B = height; // Batches
|
43 |
+
const int C = width; // Chars
|
44 |
+
|
45 |
+
#pragma unroll 4
|
46 |
+
for (int i = 0; i < B * T; i++) {
|
47 |
+
local_dst[i] = -1.h;
|
48 |
+
}
|
49 |
+
|
50 |
+
int output_index = 0;
|
51 |
+
|
52 |
+
for (int b = 0; b < B; ++b) {
|
53 |
+
__global const half *restrict seq_ind = sequence_indicators + b * T;
|
54 |
+
const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
|
55 |
+
const int time = min(seq_len, T);
|
56 |
+
|
57 |
+
int prev_class_idx = -1;
|
58 |
+
|
59 |
+
#pragma unroll 4
|
60 |
+
for (int t = 0; t < time; ++t) {
|
61 |
+
__local const half *restrict probs = local_src + b * C + t * C * B;
|
62 |
+
|
63 |
+
int max_class_idx = 0;
|
64 |
+
half max_prob = probs[0];
|
65 |
+
for (int c = 1; c < C; ++c) {
|
66 |
+
const half prob = probs[c];
|
67 |
+
if (prob > max_prob) {
|
68 |
+
max_class_idx = c;
|
69 |
+
max_prob = prob;
|
70 |
+
}
|
71 |
+
}
|
72 |
+
|
73 |
+
if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
|
74 |
+
local_dst[b * T + output_index] = (half)max_class_idx;
|
75 |
+
output_index++;
|
76 |
+
}
|
77 |
+
|
78 |
+
prev_class_idx = max_class_idx;
|
79 |
+
}
|
80 |
+
}
|
81 |
+
|
82 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
83 |
+
|
84 |
+
event_t e2 = async_work_group_copy_2D2D(
|
85 |
+
output, // dst
|
86 |
+
local_dst, // src
|
87 |
+
channels, // num_elements_per_line,
|
88 |
+
height, // num_lines,
|
89 |
+
0, // src_line_stride,
|
90 |
+
0, // dst_line_stride,
|
91 |
+
0);
|
92 |
+
|
93 |
+
wait_group_events(1, &e2);
|
94 |
+
}
|
openvino/vpu_custom_kernels/customLayerBindings.xml
ADDED
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<CustomLayer name="ReorgYolo" type="MVCL" version="1">
|
2 |
+
<Kernel entry="reorg_hwc_naive">
|
3 |
+
<Source filename="reorg_hwc_naive.bin"/>
|
4 |
+
<Parameters>
|
5 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
|
6 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
|
7 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
8 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
9 |
+
<Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
|
10 |
+
<Scalar arg-name="stride" type="int" source="stride"/>
|
11 |
+
</Parameters>
|
12 |
+
<WorkSizes dim="input,0" global="F,1,1" local="stride*stride,1,1"/>
|
13 |
+
</Kernel>
|
14 |
+
</CustomLayer>
|
15 |
+
|
16 |
+
<CustomLayer name="ReorgYolo" type="MVCL" version="1">
|
17 |
+
<Kernel entry="reorg_chw">
|
18 |
+
<Source filename="reorg_chw.bin"/>
|
19 |
+
<Parameters>
|
20 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
21 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
|
22 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
23 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
24 |
+
<Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
|
25 |
+
<Scalar arg-name="stride" type="int" source="stride"/>
|
26 |
+
</Parameters>
|
27 |
+
<WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="stride,stride,1"/>
|
28 |
+
</Kernel>
|
29 |
+
</CustomLayer>
|
30 |
+
|
31 |
+
<CustomLayer name="RegionYolo" type="MVCL" version="1">
|
32 |
+
<Where do_softmax="1"/>
|
33 |
+
<Kernel entry="region_chw">
|
34 |
+
<Source filename="region_chw.bin"/>
|
35 |
+
<Parameters>
|
36 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
37 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
38 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
39 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
40 |
+
<Scalar arg-name="classes" type="int" source="classes"/>
|
41 |
+
<Scalar arg-name="coords" type="int" source="coords"/>
|
42 |
+
<Scalar arg-name="num" type="int" source="num"/>
|
43 |
+
<Scalar arg-name="maskSize" type="int" source="3"/>
|
44 |
+
<Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
|
45 |
+
</Parameters>
|
46 |
+
<WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
|
47 |
+
</Kernel>
|
48 |
+
</CustomLayer>
|
49 |
+
|
50 |
+
<CustomLayer name="RegionYolo" type="MVCL" version="1">-->
|
51 |
+
<Where do_softmax="0" mask="0,1,2"/>
|
52 |
+
<Kernel entry="region_chw">
|
53 |
+
<Source filename="region_chw.bin"/>
|
54 |
+
<Parameters>
|
55 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
56 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
57 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
58 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
59 |
+
<Scalar arg-name="classes" type="int" source="classes"/>
|
60 |
+
<Scalar arg-name="coords" type="int" source="coords"/>
|
61 |
+
<Scalar arg-name="num" type="int" source="num"/>
|
62 |
+
<Scalar arg-name="maskSize" type="int" source="3"/>
|
63 |
+
<Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
|
64 |
+
</Parameters>
|
65 |
+
<WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
|
66 |
+
</Kernel>
|
67 |
+
</CustomLayer>
|
68 |
+
|
69 |
+
<CustomLayer name="RegionYolo" type="MVCL" version="1">
|
70 |
+
<Where do_softmax="1"/>
|
71 |
+
<Kernel entry="region_hwc">
|
72 |
+
<Source filename="region_hwc.bin"/>
|
73 |
+
<Parameters>
|
74 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
|
75 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
|
76 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
77 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
78 |
+
<Scalar arg-name="classes" type="int" source="classes"/>
|
79 |
+
<Scalar arg-name="coords" type="int" source="coords"/>
|
80 |
+
<Scalar arg-name="num" type="int" source="num"/>
|
81 |
+
<Scalar arg-name="maskSize" type="int" source="3"/>
|
82 |
+
<Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
|
83 |
+
</Parameters>
|
84 |
+
<WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
|
85 |
+
</Kernel>
|
86 |
+
</CustomLayer>
|
87 |
+
|
88 |
+
<CustomLayer name="RegionYolo" type="MVCL" version="1">
|
89 |
+
<Where do_softmax="0" mask="0,1,2"/>
|
90 |
+
<Kernel entry="region_hwc">
|
91 |
+
<Source filename="region_hwc.bin"/>
|
92 |
+
<Parameters>
|
93 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
|
94 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
|
95 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
96 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
97 |
+
<Scalar arg-name="classes" type="int" source="classes"/>
|
98 |
+
<Scalar arg-name="coords" type="int" source="coords"/>
|
99 |
+
<Scalar arg-name="num" type="int" source="num"/>
|
100 |
+
<Scalar arg-name="maskSize" type="int" source="3"/>
|
101 |
+
<Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
|
102 |
+
</Parameters>
|
103 |
+
<WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
|
104 |
+
</Kernel>
|
105 |
+
</CustomLayer>
|
106 |
+
|
107 |
+
<!-- Pixel-wise kernel binding, local work group config is per line in the input tensor -->
|
108 |
+
<CustomLayer name="GRN" type="MVCL" version="1">
|
109 |
+
<Kernel entry="grn">
|
110 |
+
<Source filename="grn.bin"/>
|
111 |
+
<Parameters>
|
112 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
113 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
114 |
+
<Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
|
115 |
+
<Scalar arg-name="bias" type="float" source="bias"/>
|
116 |
+
</Parameters>
|
117 |
+
<WorkSizes dim="input,0" global="X,Y,1" local="X,1,1"/>
|
118 |
+
</Kernel>
|
119 |
+
</CustomLayer>
|
120 |
+
|
121 |
+
<!-- Two stage layer binding, first kernel computes mean and variance, the second one normalizes input tensor-->
|
122 |
+
<CustomLayer name="MVN" type="MVCL" version="1">
|
123 |
+
<Kernel entry="reduction_mean" stage="0">
|
124 |
+
<Source filename="mvn_reduction.bin"/>
|
125 |
+
<Parameters>
|
126 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
127 |
+
<Tensor arg-name="mean" type="output_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
|
128 |
+
<Tensor arg-name="variance" type="output_buffer" port-index="1" dim="output,0" size="Y*F*4"/>
|
129 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
130 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
131 |
+
<Scalar arg-name="across_channels" type="int" source="across_channels"/>
|
132 |
+
</Parameters>
|
133 |
+
<WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
|
134 |
+
</Kernel>
|
135 |
+
<Kernel entry="mvn_scale" stage="1">
|
136 |
+
<Source filename="mvn_scale.bin"/>
|
137 |
+
<Parameters>
|
138 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
139 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
|
140 |
+
<Tensor arg-name="mean_part" type="input_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
|
141 |
+
<Tensor arg-name="power_mean" type="input_buffer" port-index="1" dim="output,0" size="Y*F*4"/>
|
142 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
143 |
+
<Scalar arg-name="H1" type="int" port-index="0" source="I.Y"/>
|
144 |
+
<Scalar arg-name="across_channels" type="int" source="across_channels"/>
|
145 |
+
<Scalar arg-name="normalize_variance" type="int" source="normalize_variance"/>
|
146 |
+
<Scalar arg-name="nparts" type="int" port-index="0" source="I.Y"/>
|
147 |
+
</Parameters>
|
148 |
+
<WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
|
149 |
+
</Kernel>
|
150 |
+
</CustomLayer>
|
151 |
+
|
152 |
+
<!-- Single work group kernel for not embarrassingly-parallel use-case -->
|
153 |
+
<CustomLayer name="CTCGreedyDecoder" type="MVCL" version="1" max-shaves="1">
|
154 |
+
<Kernel entry="CTCDecoder">
|
155 |
+
<Source filename="ctc.bin"/>
|
156 |
+
<Parameters>
|
157 |
+
<Tensor arg-name="probabilities" type="input" port-index="0" format="FYX"/>
|
158 |
+
<Tensor arg-name="sequence_indicators" type="input" port-index="1" format="BF"/>
|
159 |
+
<Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
|
160 |
+
<Scalar arg-name="width" type="int" port-index="0" source="I.X"/>
|
161 |
+
<Scalar arg-name="height" type="int" port-index="0" source="I.Y"/>
|
162 |
+
<Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
|
163 |
+
</Parameters>
|
164 |
+
<WorkSizes dim="output,0" global="1,1,1" local="1,1,1"/>
|
165 |
+
</Kernel>
|
166 |
+
</CustomLayer>
|
167 |
+
|
168 |
+
<CustomLayer name="ShuffleChannel" type="MVCL" version="1">
|
169 |
+
<!-- artificially added where closure for testing reasons, kernel itself supports arbitrary grouping -->
|
170 |
+
<!-- <Where group="2"/> -->
|
171 |
+
<Kernel entry="ShuffleChannel">
|
172 |
+
<Source filename="shuffle_channels.bin"/>
|
173 |
+
<Parameters>
|
174 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
175 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
176 |
+
<Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
|
177 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
178 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
179 |
+
<Scalar arg-name="G" type="int" source="group"/>
|
180 |
+
</Parameters>
|
181 |
+
<WorkSizes dim="input,0" global="F,1,1" local="1,1,1"/>
|
182 |
+
</Kernel>
|
183 |
+
</CustomLayer>
|
184 |
+
|
185 |
+
<!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
|
186 |
+
<CustomLayer name="FakeQuantize" type="MVCL" version="1">
|
187 |
+
<Kernel entry="quantize">
|
188 |
+
<Source filename="fakequantize.bin"/>
|
189 |
+
<Parameters>
|
190 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
191 |
+
<Tensor arg-name="input_low" type="input" port-index="1" format="ANY"/>
|
192 |
+
<Tensor arg-name="input_high" type="input" port-index="2" format="ANY"/>
|
193 |
+
<Tensor arg-name="output_low" type="input" port-index="3" format="ANY"/>
|
194 |
+
<Tensor arg-name="output_high" type="input" port-index="4" format="ANY"/>
|
195 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
196 |
+
<Scalar arg-name="levels" type="int" source="levels"/>
|
197 |
+
<Scalar arg-name="input_low_size" type="int" port-index="1" source="I.F"/>
|
198 |
+
<Scalar arg-name="input_high_size" type="int" port-index="2" source="I.F"/>
|
199 |
+
<Scalar arg-name="output_low_size" type="int" port-index="3" source="I.F"/>
|
200 |
+
<Scalar arg-name="output_high_size" type="int" port-index="4" source="I.F"/>
|
201 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
202 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
203 |
+
</Parameters>
|
204 |
+
<WorkSizes dim="input,0" global="1,Y,F" local="1,Y,1"/>
|
205 |
+
</Kernel>
|
206 |
+
</CustomLayer>
|
207 |
+
|
208 |
+
<!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
|
209 |
+
<CustomLayer name="FakeQuantizeBin" type="MVCL" version="1">
|
210 |
+
<Where levels="2"/>
|
211 |
+
<Kernel entry="binarization">
|
212 |
+
<Source filename="binarization.bin"/>
|
213 |
+
<Parameters>
|
214 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
215 |
+
<Tensor arg-name="input_low_high" type="input" port-index="1" format="BFYX"/>
|
216 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
217 |
+
<Scalar arg-name="switch_out" type="int" source="switch_out"/>
|
218 |
+
<Scalar arg-name="input_low_high_size" type="int" source="input_low_size"/>
|
219 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
220 |
+
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
221 |
+
<Tensor arg-name="input_high" type="input" port-index="2" format="BFYX"/>
|
222 |
+
<Tensor arg-name="output_low" type="input" port-index="3" format="BFYX"/>
|
223 |
+
<Tensor arg-name="output_high" type="input" port-index="4" format="BFYX"/>
|
224 |
+
<Scalar arg-name="input_high_size" type="int" source="input_high_size"/>
|
225 |
+
<Scalar arg-name="output_low_size" type="int" source="output_low_size"/>
|
226 |
+
<Scalar arg-name="output_high_size" type="int" source="output_high_size"/>
|
227 |
+
<Data arg-name="src_local" type="local_data" dim="input,0" size="X*Y*2"/>
|
228 |
+
<Data arg-name="dst_local" type="local_data" dim="input,0" size="X*Y*2"/>
|
229 |
+
</Parameters>
|
230 |
+
<WorkSizes dim="input,0" global="1,1,F" local="1,1,1"/>
|
231 |
+
</Kernel>
|
232 |
+
</CustomLayer>
|
233 |
+
|
234 |
+
<CustomLayer name="BinaryConvolution" type="MVCL" version="1">
|
235 |
+
<Where kernel="3,3"/>
|
236 |
+
<Kernel entry="binary_convolution">
|
237 |
+
<Source filename="binary_convolution3x3.bin"/>
|
238 |
+
<Parameters>
|
239 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
240 |
+
<Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
|
241 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
242 |
+
<Scalar arg-name="pad_value" type="float" source="pad_value"/>
|
243 |
+
<Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
|
244 |
+
<Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
|
245 |
+
<Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
|
246 |
+
<Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
|
247 |
+
<Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
|
248 |
+
<Scalar arg-name="GC" type="int" source="group"/>
|
249 |
+
<Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
|
250 |
+
<Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
|
251 |
+
<Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
|
252 |
+
<Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
|
253 |
+
<Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
|
254 |
+
<Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
|
255 |
+
<Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
|
256 |
+
</Parameters>
|
257 |
+
<WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
|
258 |
+
</Kernel>
|
259 |
+
</CustomLayer>
|
260 |
+
|
261 |
+
<CustomLayer name="BinaryConvolution" type="MVCL" version="1">
|
262 |
+
<Where kernel="1,1"/>
|
263 |
+
<Kernel entry="binary_convolution">
|
264 |
+
<Source filename="binary_convolution1x1.bin"/>
|
265 |
+
<Parameters>
|
266 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
267 |
+
<Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
|
268 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
269 |
+
<Scalar arg-name="pad_value" type="float" source="pad_value"/>
|
270 |
+
<Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
|
271 |
+
<Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
|
272 |
+
<Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
|
273 |
+
<Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
|
274 |
+
<Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
|
275 |
+
<Scalar arg-name="GC" type="int" source="group"/>
|
276 |
+
<Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
|
277 |
+
<Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
|
278 |
+
<Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
|
279 |
+
<Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
|
280 |
+
<Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
|
281 |
+
<Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
|
282 |
+
<Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
|
283 |
+
</Parameters>
|
284 |
+
<WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
|
285 |
+
</Kernel>
|
286 |
+
</CustomLayer>
|
287 |
+
|
288 |
+
<!-- Reference version of generic quantize binary convolution -->
|
289 |
+
<!-- An example of a kernel binding that uses data blob from IR -->
|
290 |
+
<CustomLayer name="BinaryConvolution" type="MVCL" version="1">
|
291 |
+
<Kernel entry="binary_convolution">
|
292 |
+
<Source filename="binary_convolution.bin"/>
|
293 |
+
<Parameters>
|
294 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
295 |
+
<Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
|
296 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
297 |
+
<Scalar arg-name="pad_value" type="float" source="pad_value"/>
|
298 |
+
<Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
|
299 |
+
<Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
|
300 |
+
<Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
|
301 |
+
<Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
|
302 |
+
<Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
|
303 |
+
<Scalar arg-name="GC" type="int" source="group"/>
|
304 |
+
<Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
|
305 |
+
<Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
|
306 |
+
<Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
|
307 |
+
<Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
|
308 |
+
<Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
|
309 |
+
<Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
|
310 |
+
</Parameters>
|
311 |
+
<WorkSizes dim="output,0" global="X,Y,F" local="1,1,1"/>
|
312 |
+
</Kernel>
|
313 |
+
</CustomLayer>
|
314 |
+
|
315 |
+
<CustomLayer name="Resample" type="MVCL" version="1">
|
316 |
+
<Where antialias="0"/>
|
317 |
+
<Kernel entry="resample_nearest">
|
318 |
+
<Source filename="resample_noAA.bin"/>
|
319 |
+
<Parameters>
|
320 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
321 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
|
322 |
+
<Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
|
323 |
+
<Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
|
324 |
+
<Scalar arg-name="factor" type="float" source="factor"/>
|
325 |
+
<Scalar arg-name="ow" type="int" port-index="0" source="O.X"/>
|
326 |
+
<Scalar arg-name="oh" type="int" port-index="0" source="O.Y"/>
|
327 |
+
<Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
|
328 |
+
</Parameters>
|
329 |
+
<WorkSizes global="1,Y,1" local="1,1,1" dim="output,0"/>
|
330 |
+
</Kernel>
|
331 |
+
</CustomLayer>
|
332 |
+
|
333 |
+
<CustomLayer name="Resample" type="MVCL" version="1">
|
334 |
+
<Where antialias="1"/>
|
335 |
+
<Kernel entry="resample_with_antialias">
|
336 |
+
<Source filename="resample_AA.bin"/>
|
337 |
+
<Parameters>
|
338 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
339 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
|
340 |
+
<Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
|
341 |
+
<Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
|
342 |
+
<Scalar arg-name="factor" type="float" source="factor"/>
|
343 |
+
<Scalar arg-name="ow" type="int" port-index="0" source="O.X"/>
|
344 |
+
<Scalar arg-name="oh" type="int" port-index="0" source="O.Y"/>
|
345 |
+
<Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
|
346 |
+
</Parameters>
|
347 |
+
<WorkSizes global="1,round(Y*factor),F" local="1,1,F" dim="input,0"/>
|
348 |
+
</Kernel>
|
349 |
+
</CustomLayer>
|
350 |
+
|
351 |
+
<CustomLayer name="Convolution" type="MVCL" version="1">
|
352 |
+
<Where kernel="1,1" dilation="1,1"/>
|
353 |
+
<Kernel entry="Convolution1x1_NCHW">
|
354 |
+
<Source filename="convolution1x1_chw.bin"/>
|
355 |
+
<Parameters>
|
356 |
+
<Tensor arg-name="in" type="input" port-index="0" format="BFYX"/>
|
357 |
+
<Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
|
358 |
+
<Data arg-name="w" type="data" source="weights" format="ANY"/>
|
359 |
+
<Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
|
360 |
+
<Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
|
361 |
+
<Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
|
362 |
+
<Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
|
363 |
+
<Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
|
364 |
+
<Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
|
365 |
+
|
366 |
+
<Scalar arg-name="stride-x" type="int" port-index="0" source="stride"/>
|
367 |
+
<Scalar arg-name="stride-y" type="int" port-index="1" source="stride"/>
|
368 |
+
<Scalar arg-name="pad-x" type="int" port-index="0" source="pads_begin"/>
|
369 |
+
<Scalar arg-name="pad-y" type="int" port-index="1" source="pads_begin"/>
|
370 |
+
<Scalar arg-name="kernel-x" type="int" port-index="0" source="kernel"/>
|
371 |
+
<Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
|
372 |
+
<Scalar arg-name="output" type="int" port-index="0" source="output"/>
|
373 |
+
<Scalar arg-name="group" type="int" port-index="0" source="group"/>
|
374 |
+
</Parameters>
|
375 |
+
<WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
|
376 |
+
</Kernel>
|
377 |
+
</CustomLayer>
|
378 |
+
|
379 |
+
<CustomLayer name="Convolution" type="MVCL" version="1">
|
380 |
+
<Where kernel="1,1" dilation="1,1"/>
|
381 |
+
<Kernel entry="Convolution1x1_NHWC">
|
382 |
+
<Source filename="convolution1x1_hwc.bin"/>
|
383 |
+
<Parameters>
|
384 |
+
<Tensor arg-name="in" type="input" port-index="0" format="BYXF"/>
|
385 |
+
<Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
|
386 |
+
<Data arg-name="w" type="data" source="weights" format="ANY"/>
|
387 |
+
<Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
|
388 |
+
<Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
|
389 |
+
<Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
|
390 |
+
<Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
|
391 |
+
<Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
|
392 |
+
<Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
|
393 |
+
|
394 |
+
<Scalar arg-name="stride-x" type="int" port-index="0" source="stride"/>
|
395 |
+
<Scalar arg-name="stride-y" type="int" port-index="1" source="stride"/>
|
396 |
+
<Scalar arg-name="pad-x" type="int" port-index="0" source="pads_begin"/>
|
397 |
+
<Scalar arg-name="pad-y" type="int" port-index="1" source="pads_begin"/>
|
398 |
+
<Scalar arg-name="kernel-x" type="int" port-index="0" source="kernel"/>
|
399 |
+
<Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
|
400 |
+
<Scalar arg-name="output" type="int" port-index="0" source="output"/>
|
401 |
+
<Scalar arg-name="group" type="int" port-index="0" source="group"/>
|
402 |
+
</Parameters>
|
403 |
+
<WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
|
404 |
+
</Kernel>
|
405 |
+
</CustomLayer>
|
406 |
+
|
407 |
+
<CustomLayer name="Convolution" type="MVCL" version="1">
|
408 |
+
<Where kernel="3,3" dilation="1,1"/>
|
409 |
+
<Kernel entry="Convolution3x3">
|
410 |
+
<Source filename="convolution3x3.bin"/>
|
411 |
+
<Parameters>
|
412 |
+
<Tensor arg-name="in_param" type="input" port-index="0" format="BFYX"/>
|
413 |
+
<Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
|
414 |
+
<Data arg-name="w" type="data" source="weights" format="BFYX"/>
|
415 |
+
<Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
|
416 |
+
<Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
|
417 |
+
<Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
|
418 |
+
<Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
|
419 |
+
<Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
|
420 |
+
<Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
|
421 |
+
<Scalar arg-name="KX" type="int" port-index="0" source="kernel"/>
|
422 |
+
<Scalar arg-name="KY" type="int" port-index="1" source="kernel"/>
|
423 |
+
<Scalar arg-name="stride_x" type="int" port-index="0" source="stride"/>
|
424 |
+
<Scalar arg-name="stride_y" type="int" port-index="1" source="stride"/>
|
425 |
+
<Scalar arg-name="pad_x" type="int" port-index="0" source="pads_begin"/>
|
426 |
+
<Scalar arg-name="pad_y" type="int" port-index="1" source="pads_begin"/>
|
427 |
+
<Scalar arg-name="dilation_x" type="int" port-index="0" source="dilation"/>
|
428 |
+
<Scalar arg-name="dilation_y" type="int" port-index="1" source="dilation"/>
|
429 |
+
<Scalar arg-name="output" type="int" port-index="0" source="output"/>
|
430 |
+
|
431 |
+
<Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*3*2"/>
|
432 |
+
<Data arg-name="out_local" type="local_data" dim="output,0" size="X*F*2"/>
|
433 |
+
<Data arg-name="w_local" type="local_data" dim="input,0" size="3*3*F*2"/>
|
434 |
+
</Parameters>
|
435 |
+
<WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
|
436 |
+
</Kernel>
|
437 |
+
</CustomLayer>
|
438 |
+
|
439 |
+
<CustomLayer name="ExperimentalDetectronPriorGridGenerator" type="MVCL" version="1">
|
440 |
+
<Kernel entry="experimental_detectron_prior_grid_generator">
|
441 |
+
<Source filename="detectron_prior_grid_gen.bin"/>
|
442 |
+
<Parameters>
|
443 |
+
<Tensor arg-name="input_priors" type="input" port-index="0" format="BFYX"/>
|
444 |
+
<Tensor arg-name="input_feature_map" type="input" port-index="1" format="BFYX"/>
|
445 |
+
<Tensor arg-name="input_rois" type="input" port-index="2" format="BFYX"/>
|
446 |
+
<Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
|
447 |
+
<Scalar arg-name="grid_h" type="int" port-index="1" source="I.Y"/>
|
448 |
+
<Scalar arg-name="grid_w" type="int" port-index="1" source="I.X"/>
|
449 |
+
<Scalar arg-name="stride_h" type="float" source="stride_h"/>
|
450 |
+
<Scalar arg-name="stride_w" type="float" source="stride_w"/>
|
451 |
+
<Scalar arg-name="num_priors" type="int" port-index="0" source="I.Y"/>
|
452 |
+
<Scalar arg-name="num_anchors_per_prior" type="int" port-index="0" source="I.X"/>
|
453 |
+
</Parameters>
|
454 |
+
<WorkSizes dim="input,1" global="((X+31)/32)*32,Y,1" local="32,1,1"/>
|
455 |
+
</Kernel>
|
456 |
+
</CustomLayer>
|
457 |
+
|
458 |
+
<CustomLayer name="Convert" type="MVCL" version="1">
|
459 |
+
<Kernel entry="cvtu8f16">
|
460 |
+
<Source filename="cvtu8f16.bin"/>
|
461 |
+
<Parameters>
|
462 |
+
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
463 |
+
<Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
|
464 |
+
<Scalar arg-name="scale" type="float" source="scale"/>
|
465 |
+
<Scalar arg-name="bias" type="float" source="bias"/>
|
466 |
+
</Parameters>
|
467 |
+
<WorkSizes dim="input,0" global="X,Y,F" local="X,1,1"/>
|
468 |
+
</Kernel>
|
469 |
+
</CustomLayer>
|
470 |
+
|
471 |
+
<CustomLayer name="Correlate" type="MVCL" version="1">
|
472 |
+
<Kernel entry="correlate2_half">
|
473 |
+
<Source filename="correlate.bin"/>
|
474 |
+
<Parameters>
|
475 |
+
<Tensor arg-name="bottom0" type="input" port-index="0" format="BFYX"/>
|
476 |
+
<Tensor arg-name="bottom1" type="input" port-index="1" format="BFYX"/>
|
477 |
+
<Tensor arg-name="top" type="output" port-index="0" format="BFYX"/>
|
478 |
+
<Scalar arg-name="topwidth" type="int" source="top_width"/>
|
479 |
+
<Scalar arg-name="topheight" type="int" source="top_height"/>
|
480 |
+
<Scalar arg-name="bottomwidth" type="int" port-index="0" source="I.X"/>
|
481 |
+
<Scalar arg-name="bottomheight" type="int" port-index="0" source="I.Y"/>
|
482 |
+
<Scalar arg-name="bottomchannels" type="int" port-index="0" source="I.F"/>
|
483 |
+
<Scalar arg-name="max_displacement" type="int" source="displacement"/>
|
484 |
+
<Scalar arg-name="padding" type="int" source="pad"/>
|
485 |
+
<Scalar arg-name="neighborhood_grid_radius" type="int" source="neighborhood_grid_radius"/>
|
486 |
+
<Scalar arg-name="neighborhood_grid_width" type="int" source="neighborhood_grid_width"/>
|
487 |
+
<Scalar arg-name="kernel_size" type="int" source="kernel_size"/>
|
488 |
+
<Scalar arg-name="stride1" type="int" port-index="0" source="stride"/>
|
489 |
+
<Scalar arg-name="stride2" type="int" port-index="1" source="stride"/>
|
490 |
+
</Parameters>
|
491 |
+
<WorkSizes dim="input,0" global="top_height,1,1" local="1,1,1"/>
|
492 |
+
</Kernel>
|
493 |
+
</CustomLayer>
|
494 |
+
|
495 |
+
<CustomLayer name="SpatialTransform" type="MVCL" version="1">
|
496 |
+
<Kernel entry="ocl_st">
|
497 |
+
<Source filename="st.bin"/>
|
498 |
+
<Parameters>
|
499 |
+
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
500 |
+
<Tensor arg-name="theta" type="input" port-index="1" format="ANY"/>
|
501 |
+
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
502 |
+
<Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
|
503 |
+
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
504 |
+
</Parameters>
|
505 |
+
<WorkSizes dim="input,0" global="(X+511)/512,Y,1" local="1,1,1"/>
|
506 |
+
</Kernel>
|
507 |
+
</CustomLayer>
|
openvino/vpu_custom_kernels/cvtf32f16.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:703ef56f84299e76d36b3ba5a632ae3d5e3ecd54761dcfe0006ca69ddce4bc6d
|
3 |
+
size 2664
|
openvino/vpu_custom_kernels/cvtf32f16.cl
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
|
7 |
+
__kernel void cvtf32f16(const __global float* restrict inImage,
|
8 |
+
__global half* restrict outImage,
|
9 |
+
float scale,
|
10 |
+
float bais)
|
11 |
+
{
|
12 |
+
int idx = get_global_id(0)
|
13 |
+
+ get_global_id(1) * get_global_size(0)
|
14 |
+
+ get_global_id(2) * get_global_size(0) * get_global_size(1);
|
15 |
+
|
16 |
+
outImage[idx] = convert_half(inImage[idx]*scale+bais);
|
17 |
+
}
|
openvino/vpu_custom_kernels/cvtu8f16.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:46c943e08f37cedac77f727f55835637d4878edcc20aaa24f16ed5888d13bd43
|
3 |
+
size 4588
|
openvino/vpu_custom_kernels/cvtu8f16.cl
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
|
9 |
+
{
|
10 |
+
__local uchar local_src[8 * 1024];
|
11 |
+
__local half local_dst[8 * 1024];
|
12 |
+
|
13 |
+
event_t e1 = async_work_group_copy_3D3D(
|
14 |
+
local_src, // dst
|
15 |
+
src + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
|
16 |
+
+ get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // src
|
17 |
+
get_local_size(0), // num_elements_per_line
|
18 |
+
get_local_size(0) * get_local_size(1) / (get_local_size(0)), // num_lines
|
19 |
+
get_global_size(0) - get_local_size(0), // src_line_stride
|
20 |
+
0, // dst_line_stride
|
21 |
+
get_local_size(2), // num planes
|
22 |
+
get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src plane stride
|
23 |
+
0, // dst plane stride
|
24 |
+
0);
|
25 |
+
wait_group_events(1, &e1);
|
26 |
+
|
27 |
+
size_t idx = get_local_id(0)
|
28 |
+
+ get_local_id(1) * get_local_size(0)
|
29 |
+
+ get_local_id(2) * get_local_size(0) * get_local_size(1);
|
30 |
+
|
31 |
+
local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;
|
32 |
+
|
33 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
34 |
+
|
35 |
+
event_t e2 = async_work_group_copy_3D3D(
|
36 |
+
dst + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
|
37 |
+
+ get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // dst
|
38 |
+
local_dst, // src
|
39 |
+
get_local_size(0), // num_elements_per_line
|
40 |
+
get_local_size(1), // num_lines
|
41 |
+
0, // src_line_stride
|
42 |
+
get_global_size(0) - get_local_size(0), // dst_line_stride
|
43 |
+
get_local_size(2), // num_planes
|
44 |
+
0, // src_plane_stride
|
45 |
+
get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
|
46 |
+
0);
|
47 |
+
wait_group_events(1, &e2);
|
48 |
+
}
|
openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c80d556d23f1c959fa10c00ff1cd9c3ae10aba607b37c7a0620d903fc7cedd8
|
3 |
+
size 6972
|
openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void experimental_detectron_prior_grid_generator(
|
9 |
+
__global const half *restrict input_priors,
|
10 |
+
__global const half *restrict input_feature_map,
|
11 |
+
__global const half *restrict input_rois,
|
12 |
+
__global half *restrict output,
|
13 |
+
int grid_h,
|
14 |
+
int grid_w,
|
15 |
+
float stride_h,
|
16 |
+
float stride_w,
|
17 |
+
int num_priors,
|
18 |
+
int num_anchors_per_prior)
|
19 |
+
{
|
20 |
+
__local half local_input_priors[8 * 1024];
|
21 |
+
__local half local_output[8 * 1024];
|
22 |
+
|
23 |
+
event_t e1 = async_work_group_copy(
|
24 |
+
local_input_priors,
|
25 |
+
input_priors,
|
26 |
+
num_anchors_per_prior * num_priors,
|
27 |
+
0);
|
28 |
+
wait_group_events(1, &e1);
|
29 |
+
|
30 |
+
int width_start = get_group_id(0) * get_local_size(0);
|
31 |
+
int width_end = min(width_start + get_local_size(0), (unsigned)grid_w);
|
32 |
+
int width = width_end - width_start;
|
33 |
+
|
34 |
+
int h = get_group_id(1);
|
35 |
+
int w_idx = get_group_id(0) * get_local_size(0);
|
36 |
+
for (int w = 0; w < width; ++w) {
|
37 |
+
#pragma unroll 4
|
38 |
+
for (int p = 0; p < num_priors; ++p) {
|
39 |
+
local_output[(w * num_priors + p) * num_anchors_per_prior + 0] =
|
40 |
+
local_input_priors[4 * p + 0]
|
41 |
+
+ convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
|
42 |
+
local_output[(w * num_priors + p) * num_anchors_per_prior + 1] =
|
43 |
+
local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5);
|
44 |
+
local_output[(w * num_priors + p) * num_anchors_per_prior + 2] =
|
45 |
+
local_input_priors[4 * p + 2]
|
46 |
+
+ convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
|
47 |
+
local_output[(w * num_priors + p) * num_anchors_per_prior + 3] =
|
48 |
+
local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5);
|
49 |
+
}
|
50 |
+
}
|
51 |
+
|
52 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
53 |
+
|
54 |
+
event_t e2 = async_work_group_copy_2D2D(
|
55 |
+
output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
|
56 |
+
+ get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
|
57 |
+
* num_priors, // dst
|
58 |
+
local_output, // src
|
59 |
+
width * num_anchors_per_prior * num_priors, // num_elements_per_line
|
60 |
+
1, // num_lines
|
61 |
+
(grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride
|
62 |
+
(grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride
|
63 |
+
0);
|
64 |
+
wait_group_events(1, &e2);
|
65 |
+
}
|
openvino/vpu_custom_kernels/fakequantize.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d17659bbf12a172849085003a055bfb4b91d3bb5bdc7f820395820eaa90b46ef
|
3 |
+
size 15688
|
openvino/vpu_custom_kernels/fakequantize.cl
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (C) 2018-2022 Intel Corporation
|
2 |
+
// SPDX-License-Identifier: Apache-2.0
|
3 |
+
//
|
4 |
+
|
5 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
6 |
+
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
|
7 |
+
|
8 |
+
__kernel void quantize(
|
9 |
+
__global const half *restrict src_data,
|
10 |
+
__global const half *restrict input_low,
|
11 |
+
__global const half *restrict input_high,
|
12 |
+
__global const half *restrict output_low,
|
13 |
+
__global const half *restrict output_high,
|
14 |
+
__global half *restrict dst_data,
|
15 |
+
int levels,
|
16 |
+
int input_low_size,
|
17 |
+
int input_high_size,
|
18 |
+
int output_low_size,
|
19 |
+
int output_high_size,
|
20 |
+
int W,
|
21 |
+
int H)
|
22 |
+
{
|
23 |
+
__local half local_src[15 * 1024];
|
24 |
+
__local half local_dst[15 * 1024];
|
25 |
+
|
26 |
+
event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
|
27 |
+
wait_group_events(1, &e1);
|
28 |
+
|
29 |
+
int c = get_group_id(2);
|
30 |
+
|
31 |
+
half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]);
|
32 |
+
half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]);
|
33 |
+
half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]);
|
34 |
+
half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);
|
35 |
+
|
36 |
+
half const1 = (half)(
|
37 |
+
!(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
|
38 |
+
half const2 =
|
39 |
+
(half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));
|
40 |
+
|
41 |
+
__local const half *restrict src = local_src + W * get_local_id(1);
|
42 |
+
__local half *restrict dst = local_dst + W * get_local_id(1);
|
43 |
+
|
44 |
+
for (int w = 0; w < W / 8; w++) {
|
45 |
+
half8 val = *((__local half8 *)src + w);
|
46 |
+
half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;
|
47 |
+
|
48 |
+
aux = (half8){
|
49 |
+
(half)(short)(aux.s0),
|
50 |
+
(half)(short)(aux.s1),
|
51 |
+
(half)(short)(aux.s2),
|
52 |
+
(half)(short)(aux.s3),
|
53 |
+
(half)(short)(aux.s4),
|
54 |
+
(half)(short)(aux.s5),
|
55 |
+
(half)(short)(aux.s6),
|
56 |
+
(half)(short)(aux.s7)};
|
57 |
+
|
58 |
+
aux = aux * (half8)const2 + (half8)h_olow;
|
59 |
+
|
60 |
+
short8 a;
|
61 |
+
short8 b;
|
62 |
+
a.s0 = (val.s0 <= h_ilow);
|
63 |
+
a.s1 = (val.s1 <= h_ilow);
|
64 |
+
a.s2 = (val.s2 <= h_ilow);
|
65 |
+
a.s3 = (val.s3 <= h_ilow);
|
66 |
+
a.s4 = (val.s4 <= h_ilow);
|
67 |
+
a.s5 = (val.s5 <= h_ilow);
|
68 |
+
a.s6 = (val.s6 <= h_ilow);
|
69 |
+
a.s7 = (val.s7 <= h_ilow);
|
70 |
+
|
71 |
+
b.s0 = (val.s0 > h_ihigh);
|
72 |
+
b.s1 = (val.s1 > h_ihigh);
|
73 |
+
b.s2 = (val.s2 > h_ihigh);
|
74 |
+
b.s3 = (val.s3 > h_ihigh);
|
75 |
+
b.s4 = (val.s4 > h_ihigh);
|
76 |
+
b.s5 = (val.s5 > h_ihigh);
|
77 |
+
b.s6 = (val.s6 > h_ihigh);
|
78 |
+
b.s7 = (val.s7 > h_ihigh);
|
79 |
+
|
80 |
+
a = ~(a - (short8)1);
|
81 |
+
b = ~(b - (short8)1);
|
82 |
+
|
83 |
+
short8 c1 = (~a & b);
|
84 |
+
short8 c2 = (~a & ~b);
|
85 |
+
|
86 |
+
short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux));
|
87 |
+
|
88 |
+
*((__local half8 *)dst + w) = as_half8(res);
|
89 |
+
}
|
90 |
+
|
91 |
+
for (int w = W & (~0x7); w < W; w++) {
|
92 |
+
half val = src[w];
|
93 |
+
short a = val <= h_ilow;
|
94 |
+
a = ~(a - 1);
|
95 |
+
short b = val > h_ihigh;
|
96 |
+
b = ~(b - 1);
|
97 |
+
|
98 |
+
short c1 = (~a & b);
|
99 |
+
short c2 = (~a & ~b);
|
100 |
+
|
101 |
+
short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh))
|
102 |
+
| (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow)));
|
103 |
+
|
104 |
+
dst[w] = as_half(res);
|
105 |
+
}
|
106 |
+
|
107 |
+
barrier(CLK_LOCAL_MEM_FENCE);
|
108 |
+
|
109 |
+
event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
|
110 |
+
wait_group_events(1, &e2);
|
111 |
+
}
|
openvino/vpu_custom_kernels/grn.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e3dbe5173ca93f39fecaf29f820e1704bcb485affc1a09554e4c86f8de46214
|
3 |
+
size 7972
|