// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Maps an output coordinate to an input index.
//
// The centre of output sample `ox` (ox + 0.5) is divided by the scale
// factor `f` and the quotient is truncated toward zero by the int cast.
//
//   ox  output coordinate (callers pass integer row indices)
//   f   scale factor; must be non-zero
//
// Declared `static inline`, like the other helpers in this file, so that
// every translation unit carries its own definition.
//
// NOTE(review): only the truncating variant is kept. A second
// `return ROUND(...)` followed it but could never execute; presumably the
// two were once selected by a preprocessor conditional - confirm against
// the upstream file.
static inline int out_to_in(float ox, float f)
{
    return (int)((ox + 0.5f) / f);
}
// Triangle (tent) filter weight for a scalar distance: 1 - |x|.
//
// The result is 1 at x == 0, 0 at |x| == 1, and becomes negative for
// |x| > 1; no clamping to zero is applied here.
static inline float triangleCoeff(float x)
{
    return 1.0f - fabs(x);
}
// Four-lane version of the triangle (tent) filter weight: 1 - |x| applied
// component-wise to a float4.
//
// As with the scalar variant, lanes with |x| > 1 yield negative weights;
// no clamping to zero is applied here.
static inline float4 triangleCoeff4(float4 x)
{
    return 1.0f - fabs(x);
}
// Resamples planar half-precision data with a triangle-weighted average.
//
// Each work-group stages the input rows it needs in local memory, every
// work-item then produces one full output row (all `ow` columns) into a
// local output tile, and the tile is finally copied back to `dst`.
//
//   src       input, addressed as plane * ih * iw + row * iw + column
//   dst       output, addressed as plane * get_global_size(1) * ow + row * ow + column
//   iw, ih    input width and height
//   factor    scale factor (output coordinate = input coordinate * factor)
//   ow        output width
//   oh        not read by the kernel body; the output row count is taken
//             from get_global_size(1)
//   channels  not read by the kernel body; planes are enumerated through
//             work-item dimension 2
//
// Work-item dimension 1 enumerates output rows, dimension 2 enumerates planes.
//
// NOTE(review): `ROUND` and `async_work_group_copy_2D2D` are not defined in
// this block and must be provided by the surrounding file / toolchain.
// NOTE(review): nothing here checks that the staged tiles fit the fixed
// local buffers (iy_size * iw * get_local_size(2) elements into local_src,
// get_local_size(1) * ow * get_local_size(2) elements into local_dst);
// the host-side work-group configuration is assumed to guarantee it.
// NOTE(review): the row loop visits every staged row with unclamped
// triangle weights, so results are presumably only as intended when each
// work-group covers a single output row - confirm against the host setup.
__kernel void resample_with_antialias(
    __global const half *restrict src,
    __global half *restrict dst,
    int iw,
    int ih,
    float factor,
    int ow,
    int oh,
    int channels)
{
    __local half local_src[20 * 1024];
    __local half local_dst[8 * 1024];

    // Filter radius in input samples: fixed at 2 when factor > 1, otherwise
    // ceil(1 / factor).
    const int r = (factor > 1.0f) ? 2 : (int)ceil(1.0f / factor);

    // Output rows handled by this work-group and the (clamped) span of input
    // rows that can contribute to them.
    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
    const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1);
    const int iy_size = iy_last - iy_first + 1;

    // Stage rows [iy_first, iy_last] of every plane owned by this work-group.
    // One "line" of the copy is the iy_size * iw block of a single plane;
    // the lines are packed back to back in local_src.
    event_t e1 = async_work_group_copy_2D2D(
        local_src,                                                           // dst
        src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src
        iy_size * iw,                                                        // num_elements_per_line
        get_local_size(2),                                                   // num_lines
        (ih - iy_size) * iw,                                                 // src_line_stride
        0,                                                                   // dst_line_stride
        0);
    wait_group_events(1, &e1);

    // Vertical source position of this work-item's output row, expressed
    // relative to the first staged row.
    const int oy = get_global_id(1);
    const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;

    // Base of this work-item's plane inside the staged tile. Rows below are
    // indexed relative to iy_first (to match iy_f), so no per-row offset is
    // applied here.
    __local half const *restrict start_src = local_src + iw * iy_size * get_local_id(2);

    // Start of this work-item's output row inside the local output tile.
    __local half *restrict start_dst =
        local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);

    for (int ox = 0; ox < ow; ox++) {
        const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
        const int ix_i = ROUND(ix_f);

        // Lane-wise partial sums; the four lanes are folded together after
        // the row loop.
        float4 v_sum = 0.f;
        float4 v_wsum = 0.f;

        for (int y = 0; y < iy_size; y++) {
            float dy = iy_f - y;

            // Horizontal window [ix_i - r, ix_i + r], clamped to the image.
            int x = max(ix_i - r, 0);
            int end_x = min(ix_i + r, iw - 1);

            float4 dx;
            for (int i = 0; i < 4; i++) {
                dx[i] = ix_f - x - i;
            }

            // Four columns at a time while a full group of four fits.
            for (; x < end_x - 3; x += 4, dx -= 4.0f) {
                float4 w =
                    factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy);
                float4 src_vec = {
                    start_src[y * iw + x + 0],
                    start_src[y * iw + x + 1],
                    start_src[y * iw + x + 2],
                    start_src[y * iw + x + 3]};

                v_sum += w * src_vec;
                v_wsum += w;
            }

            // Remaining columns, accumulated into lane 0.
            for (; x <= end_x; x++) {
                float dx = ix_f - x;
                float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy);

                v_sum[0] += w * start_src[y * iw + x];
                v_wsum[0] += w;
            }
        }

        v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];

        // start_dst already addresses this work-item's row, so only the
        // column index is added. A zero weight sum yields 0 instead of a
        // division by zero.
        start_dst[ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
    }

    // All work-items must have finished writing local_dst before the group
    // copies it out.
    barrier(CLK_LOCAL_MEM_FENCE);

    // One "line" per plane: get_local_size(1) output rows of width ow.
    event_t e2 = async_work_group_copy_2D2D(
        dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow
            + get_group_id(1) * get_local_size(1) * ow, // dst
        local_dst,                                      // src
        get_local_size(1) * ow,                         // num_elements_per_line
        get_local_size(2),                              // num_lines
        0,                                              // src_line_stride
        (get_global_size(1) - get_local_size(1)) * ow,  // dst_line_stride
        0);
    wait_group_events(1, &e2);
}