#define TYPE_NAME  "vec4"
#define TYPE_SIZE  (TYPE_ELEMS*4)
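/* Weights for TYPE_ELEMS (4) research-window offsets are computed at a
 * time, packed as a vec4; TYPE_SIZE is that element's size in bytes. */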
GLSLF(4, s1    = imageLoad(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);

GLSLF(4, s2[0] = imageLoad(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[1] = imageLoad(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[2] = imageLoad(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[3] = imageLoad(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);

GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
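/* Horizontal pass: each invocation walks its nb_rows rows, keeping a
 * running prefix sum of the squared differences along x and writing it
 * into the integral buffer. */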
GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
GLSLC(1, barrier(); );
GLSLC(2, #pragma unroll(1) );
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLC(3, prefix_sum = DTYPE(0); );
GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
insert_first(shd, 0, "r", 0, plane, comp);
GLSLC(4, s2 = dst.v[pos.x]; );
GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
GLSLC(4, prefix_sum += s2; );
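/* Vertical pass: the same prefix summation along y; the per-column
 * accumulators live in psum[] since one invocation now handles nb_rows
 * adjacent columns. */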
GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
GLSLC(1, #pragma unroll(1) );
GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
GLSLC(2, psum[r] = DTYPE(0); );
GLSLC(1, barrier(); );
GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
GLSLC(3, offset = int_stride * uint64_t(pos.y); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(3, #pragma unroll(1) );
GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
insert_first(shd, 0, "r", 1, plane, comp);
GLSLC(4, s2 = dst.v[pos.x + r]; );
GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
GLSLC(4, psum[r] += s2; );
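/* Weights pass: with the integral image built, each pixel fetches the
 * four patch corners and converts the patch distance into a weight and
 * a weighted pixel sum. */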
                                int t, int dst_comp, int plane, int comp)
GLSLF(1, p = patch_size[%i]; ,dst_comp);
GLSLC(1, barrier(); );
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
GLSLC(3, a = DTYPE(0); );
GLSLC(3, b = DTYPE(0); );
GLSLC(3, c = DTYPE(0); );
GLSLC(3, d = DTYPE(0); );
GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
GLSLF(3, src[0] = imageLoad(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
GLSLF(3, src[1] = imageLoad(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
GLSLF(3, src[2] = imageLoad(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
GLSLF(3, src[3] = imageLoad(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
GLSLC(3, if (lt == false) { );
GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4, a = dst.v[pos.x - p]; );
GLSLC(4, c = dst.v[pos.x + p]; );
GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4, b = dst.v[pos.x - p]; );
GLSLC(4, d = dst.v[pos.x + p]; );
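/* Summed-area-table lookup: with a = I(x-p, y-p), b = I(x-p, y+p),
 * c = I(x+p, y-p) and d = I(x+p, y+p), the total squared difference
 * over the (2p)x(2p) patch is d + a - b - c, evaluated for all four
 * offsets at once since each integral entry is a DTYPE vector. */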
GLSLC(3, patch_diff = d + a - b - c; );
GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
GLSLC(3, sum = dot(w, src*255); );
GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
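/* Two accumulation variants are emitted: atomicAdd() for when multiple
 * dispatches (t > 1) may write the same weights/sums entries
 * concurrently, plain += otherwise. */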
typedef struct HorizontalPushData {
    uint32_t ws_stride[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint32_t xyoffs_start;
} HorizontalPushData;
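/* CPU-side mirror of the std430 push-constant block emitted for the
 * weights shader below; the (partially elided) members must match it
 * field for field. */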
void *spv_opaque = NULL;
uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
int wg_size, wg_rows;

if (max_wg > max_dim) {
} else if (max_wg < max_dim) {
    while (wg_size*wg_rows < max_dim)
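/* If the largest plane dimension exceeds the device's workgroup size
 * limit, each invocation takes on extra rows (wg_rows) until
 * wg_size*wg_rows spans the whole dimension. */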
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
                   "GL_EXT_buffer_reference2" }, 2,

GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );

GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
GLSLC(1, DTYPE v[]; );

GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1, uvec4 ws_stride; );
GLSLC(1, ivec4 patch_size; );
GLSLC(1, vec4 strength; );
GLSLC(1, DataBuffer integral_base; );
GLSLC(1, uint64_t integral_size; );
GLSLC(1, uint64_t int_stride; );
GLSLC(1, uint xyoffs_start; );

VK_SHADER_STAGE_COMPUTE_BIT);
.type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,

.name        = "weights_buffer_0",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_0[];",

.name        = "sums_buffer_0",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_0[];",

.name        = "weights_buffer_1",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_1[];",

.name        = "sums_buffer_1",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_1[];",

.name        = "weights_buffer_2",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_2[];",

.name        = "sums_buffer_2",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_2[];",

.name        = "weights_buffer_3",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_3[];",

.name        = "sums_buffer_3",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_3[];",

.name        = "xyoffsets_buffer",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "ivec2 xyoffsets[];",
GLSLC(1, float s1; );
GLSLC(1, DTYPE s2; );
GLSLC(1, DTYPE prefix_sum; );
GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
GLSLC(1, DataBuffer integral_data; );

GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );

GLSLC(1, DTYPE patch_diff; );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
for (int i = 0; i < desc->nb_components; i++) {

RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
typedef struct DenoisePushData {
    uint32_t ws_stride[4];

void *spv_opaque = NULL;
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
                   "GL_EXT_buffer_reference2" }, 2,

GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1, uvec4 ws_stride; );

VK_SHADER_STAGE_COMPUTE_BIT);
.type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,

.name        = "output_img",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali   = "writeonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,

.name        = "weights_buffer_0",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_0[];",

.name        = "sums_buffer_0",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_0[];",

.name        = "weights_buffer_1",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_1[];",

.name        = "sums_buffer_1",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_1[];",

.name        = "weights_buffer_2",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_2[];",

.name        = "sums_buffer_2",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_2[];",

.name        = "weights_buffer_3",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_3[];",

.name        = "sums_buffer_3",
.type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali   = "readonly",
.stages      = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_3[];",
GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
GLSLC(1, size = imageSize(output_img[plane]); );
GLSLC(1, src = imageLoad(input_img[plane], pos); );
for (int c = 0; c < desc->nb_components; c++) {
GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
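/* NLMeans normalization: the centre pixel contributes with weight 1,
 * hence (sum + src*255) / (1.0 + w_sum), scaled back to the [0,1]
 * range. */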
GLSLC(1, imageStore(output_img[plane], pos, r); );

RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
int xcnt = 0, ycnt = 0;
int offsets_dispatched = 0, nb_dispatches = 0;

if (!(s->opts.r & 1)) {

if (!(s->opts.p & 1)) {
for (int i = 0; i < 4; i++) {
    double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
    int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);

    str = 255.0*255.0 / str;
    s->strength[i] = str;

    s->patch[i] = ps / 2;
s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));

for (int x = -rad; x <= rad; x++) {
    for (int y = -rad; y <= rad; y++) {
        s->xoffsets[xcnt++] = x;
        s->yoffsets[ycnt++] = y;
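/* (2*rad + 1)^2 - 1 offsets: every research-window position except
 * (0,0), the pixel itself; the elided condition above presumably skips
 * the centre. */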
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));

for (int i = 0; i < 2*s->nb_offsets; i += 2) {
    offsets_buf[i + 0] = s->xoffsets[i >> 1];
    offsets_buf[i + 1] = s->yoffsets[i >> 1];
645 "disabling dispatch parallelism\n");
649 spv = ff_vk_spirv_init();
RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights,
                          spv, s->vkctx.output_width, s->vkctx.output_height,

RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise,

&s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
VK_FORMAT_UNDEFINED));
int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);

} while (offsets_dispatched < s->nb_offsets);

       s->nb_offsets, nb_dispatches);
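/* denoise_pass: normalize the accumulated sums by the weights and write
 * the result to the output image. */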
VkBufferMemoryBarrier2 buf_bar[8];

DenoisePushData pd = {
    { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },

VK_SHADER_STAGE_COMPUTE_BIT,
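/* Barrier: the weights pass's storage writes must be visible before
 * this shader reads the weights/sums buffer. */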
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

ws_vk->stage  = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;

vk->CmdDispatch(exec->buf,
int plane_heights[4];
int offsets_dispatched = 0;

VkDeviceSize weights_offs[4];
VkDeviceSize sums_offs[4];
uint32_t ws_stride[4];
size_t ws_total_size = 0;

VkImageMemoryBarrier2 img_bar[8];
VkBufferMemoryBarrier2 buf_bar[8];
int_stride = s->shd_weights.lg_size[0]*s->pl_weights_rows*TYPE_SIZE;
int_size   = s->shd_weights.lg_size[0]*s->pl_weights_rows*int_stride;
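/* The integral image is a square sized to the dispatch grid:
 * lg_size[0]*pl_weights_rows entries per side, TYPE_SIZE bytes each;
 * int_stride is one row of it. */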
for (int i = 0; i < desc->nb_components; i++) {
    plane_widths[i]  = FFALIGN(plane_widths[i],  s->shd_denoise.lg_size[0]);
    plane_heights[i] = FFALIGN(plane_heights[i], s->shd_denoise.lg_size[1]);

    ws_stride[i] = plane_widths[i];
    ws_size[i]   = ws_stride[i] * plane_heights[i] * sizeof(float);
    ws_total_size += ws_size[i];
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
s->opts.t * int_size,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
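/* The integral buffer holds one int_size slice per parallel dispatch
 * (s->opts.t); the weights/sums buffer packs all per-plane weights
 * arrays first, then all sums arrays, at the offsets computed below. */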
sums_offs[0] = ws_total_size;
for (int i = 1; i < desc->nb_components; i++) {
    weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1];
    sums_offs[i]    = sums_offs[i - 1] + ws_size[i - 1];
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));

VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_READ_BIT,
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);

VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_WRITE_BIT,
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,

buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pImageMemoryBarriers = img_bar,
    .imageMemoryBarrierCount = nb_img_bar,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

ws_vk->stage  = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
integral_vk->stage  = buf_bar[1].dstStageMask;
integral_vk->access = buf_bar[1].dstAccessMask;
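/* Clear the weights/sums buffer so this frame's accumulation starts
 * from zero. */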
vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

ws_vk->stage  = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;

VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
for (int i = 0; i < desc->nb_components; i++) {
        ws_vk, weights_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));
        ws_vk, sums_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));

VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
for (int i = 0; i < desc->nb_components; i++) {
        ws_vk, weights_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));
        ws_vk, sums_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));
HorizontalPushData pd = {
    { plane_widths[0],  plane_widths[1],  plane_widths[2],  plane_widths[3]  },
    { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
    { ws_stride[0],     ws_stride[1],     ws_stride[2],     ws_stride[3]     },
    { s->patch[0],      s->patch[1],      s->patch[2],      s->patch[3]      },
    { s->strength[0],   s->strength[1],   s->strength[2],   s->strength[3],  },
    (uint64_t)int_stride,
VK_SHADER_STAGE_COMPUTE_BIT,

if (offsets_dispatched) {
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

/* Only one barrier is pushed in this block, so its index is 0, not 1. */
integral_vk->stage  = buf_bar[0].dstStageMask;
integral_vk->access = buf_bar[0].dstAccessMask;
wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);

vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);

} while (offsets_dispatched < s->nb_offsets);
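/* All research offsets have been accumulated; normalize and write out. */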
RET(denoise_pass(s, exec, ws_vk, ws_stride));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {

static const AVFilterPad nlmeans_vulkan_inputs[] = {
    .filter_frame = &nlmeans_vulkan_filter_frame,

static const AVFilterPad nlmeans_vulkan_outputs[] = {
.p.name       = "nlmeans_vulkan",
.p.priv_class = &nlmeans_vulkan_class,
.uninit       = &nlmeans_vulkan_uninit,