beamformer_core.c (67624B)
1 /* See LICENSE for license details. */ 2 /* TODO(rnp): 3 * [ ]: backtrace dumping on SIGSEGV 4 * [ ]: bug? HERCULES might be broken, we may need to to chunk on transmits instead of channels 5 * [ ]: refactor: do_compute should build its own "command graph" which tracks 6 * dependencies better. It is very important that unnecessary barriers are 7 * not placed between compute stages which requires knowledge of the entire 8 * graph. 9 * [ ]: refactor: replace UploadRF with just the scratch_rf_size variable, 10 * use below to spin wait in library 11 * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64) 12 * for power efficient low latency waiting 13 * [ ]: BeamformWorkQueue -> BeamformerWorkQueue 14 * [ ]: refactor: work queue needs a cleanup, we should only have a single one 15 * - that queue isn't really considered hot so a lock is probably fine 16 * [ ]: bug: reinit cuda on hot-reload 17 */ 18 19 #include "compiler.h" 20 21 #if defined(BEAMFORMER_DEBUG) && !defined(BEAMFORMER_EXPORT) && OS_WINDOWS 22 #define BEAMFORMER_EXPORT __declspec(dllexport) 23 #endif 24 25 #include "beamformer_internal.h" 26 27 global f32 dt_for_frame; 28 29 typedef struct BeamformerComputeGraphNode BeamformerComputeGraphNode; 30 struct BeamformerComputeGraphNode { 31 // NOTE(rnp): will be BeamformerShaderKind_Count for root node 32 BeamformerShaderKind kind; 33 34 // NOTE(rnp): when any of input or output stride is assigned it is assumed that 35 // the shader requires a fixed layout for input, output, or both. When two adjacent 36 // nodes require incompatible layouts the second pass over the graph will insert 37 // Reshape shaders in between. 38 BeamformerDataKind input_data_kind; 39 iv3 input_stride; 40 41 BeamformerDataKind output_data_kind; 42 iv3 output_stride; 43 44 i32 user_pipeline_index; 45 46 BeamformerComputeGraphNode *prev; 47 BeamformerComputeGraphNode *next; 48 }; 49 50 typedef struct { 51 BeamformerComputeGraphNode *first; 52 BeamformerComputeGraphNode *last; 53 u64 count; 54 } BeamformerComputeGraph; 55 56 read_only global u32 beamformer_compute_array_parameter_sizes[] = { 57 #define X(k, type, elements) sizeof(type) * elements, 58 BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST 59 #undef X 60 }; 61 62 read_only global u32 beamformer_compute_array_parameter_offsets[] = { 63 #define X(k, ...) offsetof(BeamformerComputeArrayParameters, k), 64 BEAMFORMER_COMPUTE_ARRAY_PARAMETERS_LIST 65 #undef X 66 }; 67 68 function void 69 beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block) 70 { 71 assert(block < countof(cc->compute_plans)); 72 BeamformerComputePlan *cp = cc->compute_plans[block]; 73 if (cp) { 74 vk_buffer_release(&cp->array_parameters); 75 for (u32 i = 0; i < countof(cp->filters); i++) 76 vk_buffer_release(&cp->filters[i].buffer); 77 cc->compute_plans[block] = 0; 78 SLLPushFreelist(cp, cc->compute_plan_freelist); 79 } 80 } 81 82 function BeamformerComputePlan * 83 beamformer_compute_plan_for_block(BeamformerComputeContext *cc, u32 block, Arena *arena) 84 { 85 assert(block < countof(cc->compute_plans)); 86 BeamformerComputePlan *result = cc->compute_plans[block]; 87 if (!result) { 88 result = SLLPopFreelist(cc->compute_plan_freelist); 89 if (!result) result = push_struct_no_zero(arena, BeamformerComputePlan); 90 zero_struct(result); 91 cc->compute_plans[block] = result; 92 93 result->ui_voxel_transform = m4_identity(); 94 95 Stream label = arena_stream(*arena); 96 stream_append_s8(&label, s8("ComputeParameterArray[")); 97 stream_append_u64(&label, block); 98 stream_append_s8(&label, s8("]")); 99 stream_append_byte(&label, 0); 100 101 GPUBufferAllocateInfo allocate_info = { 102 .size = sizeof(BeamformerComputeArrayParameters), 103 .flags = VulkanUsageFlag_HostReadWrite, 104 .label = stream_to_str8(&label), 105 }; 106 vk_buffer_allocate(&result->array_parameters, &allocate_info); 107 assert((result->array_parameters.gpu_pointer & 63) == 0); 108 } 109 return result; 110 } 111 112 function void 113 beamformer_filter_update(BeamformerFilter *f, BeamformerFilterParameters fp, u32 block, u32 slot, Arena arena) 114 { 115 Stream sb = arena_stream(arena); 116 stream_append_s8s(&sb, 117 beamformer_filter_kind_strings[fp.kind % countof(beamformer_filter_kind_strings)], 118 s8("Filter[")); 119 stream_append_u64(&sb, block); 120 stream_append_s8(&sb, s8("][")); 121 stream_append_u64(&sb, slot); 122 stream_append_byte(&sb, ']'); 123 s8 label = arena_stream_commit(&arena, &sb); 124 125 void *filter = 0; 126 switch (fp.kind) { 127 case BeamformerFilterKind_Kaiser:{ 128 /* TODO(rnp): this should also support complex */ 129 /* TODO(rnp): implement this as an IFIR filter instead to reduce computation */ 130 filter = kaiser_low_pass_filter(&arena, fp.kaiser.cutoff_frequency, fp.sampling_frequency, 131 fp.kaiser.beta, (i32)fp.kaiser.length); 132 f->length = (i32)fp.kaiser.length; 133 f->time_delay = (f32)f->length / 2.0f / fp.sampling_frequency; 134 }break; 135 case BeamformerFilterKind_MatchedChirp:{ 136 typeof(fp.matched_chirp) *mc = &fp.matched_chirp; 137 f32 fs = fp.sampling_frequency; 138 f->length = (i32)(mc->duration * fs); 139 if (fp.complex) { 140 filter = baseband_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1, 0.5f); 141 f->time_delay = complex_filter_first_moment(filter, f->length, fs); 142 } else { 143 filter = rf_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1); 144 f->time_delay = real_filter_first_moment(filter, f->length, fs); 145 } 146 }break; 147 InvalidDefaultCase; 148 } 149 150 f->parameters = fp; 151 152 u32 byte_size = f->length * (i32)sizeof(f32) * (fp.complex? 2 : 1); 153 if (f->buffer.size < byte_size) { 154 GPUBufferAllocateInfo allocate_info = { 155 .size = byte_size, 156 .flags = VulkanUsageFlag_HostReadWrite, 157 .label = str8_from_s8(label), 158 }; 159 vk_buffer_allocate(&f->buffer, &allocate_info); 160 } 161 vk_buffer_range_upload(&f->buffer, filter, 0, byte_size, 0); 162 } 163 164 function iv3 165 das_valid_points(iv3 points) 166 { 167 iv3 result; 168 result.x = Max(points.x, 1); 169 result.y = Max(points.y, 1); 170 result.z = Max(points.z, 1); 171 return result; 172 } 173 174 function void 175 update_hadamard(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena) 176 { 177 f16 *hadamard = make_hadamard_transpose(&arena, order, row_major); 178 if (hadamard) { 179 u64 offset = offsetof(BeamformerComputeArrayParameters, Hadamard); 180 u64 size = sizeof(*((BeamformerComputeArrayParameters *)0)->Hadamard) * order * order; 181 vk_buffer_range_upload(&cp->array_parameters, hadamard, offset, size, 0); 182 cp->hadamard_order = order; 183 } 184 } 185 186 function u64 187 beamformer_frame_byte_size(iv3 points, BeamformerDataKind kind) 188 { 189 u64 result = points.x * points.y * points.z * beamformer_data_kind_byte_size[kind]; 190 result = round_up_to(result, 64); 191 return result; 192 } 193 194 function BeamformerFrame * 195 beamformer_frame_next(BeamformerComputeContext *cc, iv3 output_points, b32 complex, u64 reserved_size) 196 { 197 BeamformerFrameBacklog *bl = &cc->backlog; 198 199 BeamformerDataKind kind = complex ? BeamformerDataKind_Float32Complex : BeamformerDataKind_Float32; 200 u64 frame_size = beamformer_frame_byte_size(output_points, kind); 201 202 // TODO(rnp): handle this somewhat gracefully (even it produces garbled output) 203 assert(frame_size + reserved_size <= (u64)bl->buffer->size); 204 205 if (bl->next_offset > (u64)bl->buffer->size - frame_size - reserved_size) 206 bl->next_offset = 0; 207 208 u64 id = bl->counter++; 209 210 BeamformerFrame *result = bl->frames + (id % countof(bl->frames)); 211 atomic_store_u64(&result->timeline_valid_value, -1ULL); 212 result->id = id & U32_MAX; 213 result->buffer_offset = bl->next_offset; 214 result->points = output_points; 215 result->data_kind = kind; 216 217 bl->next_offset += frame_size; 218 219 return result; 220 } 221 222 function void 223 push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info) 224 { 225 u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer); 226 t->buffer[index] = info; 227 } 228 229 function uv3 230 layout_for_output(iv3 points) 231 { 232 uv3 result = {{1, 1, 1}}; 233 234 b32 has_x = points.x > 1; 235 b32 has_y = points.y > 1; 236 b32 has_z = points.z > 1; 237 238 u32 subgroup_size = vk_gpu_info()->subgroup_size; 239 u32 grid_3d_z_size = Max(1, subgroup_size / (4 * 4)); 240 u32 grid_2d_y_size = Max(1, subgroup_size / 8); 241 242 switch (iv3_dimension(points)) { 243 case 1:{ 244 if (has_x) result.x = subgroup_size; 245 if (has_y) result.y = subgroup_size; 246 if (has_z) result.z = subgroup_size; 247 }break; 248 249 case 2:{ 250 if (has_x && has_y) {result.x = 8; result.y = grid_2d_y_size;} 251 if (has_x && has_z) {result.x = 8; result.z = grid_2d_y_size;} 252 if (has_y && has_z) {result.y = 8; result.z = grid_2d_y_size;} 253 }break; 254 255 case 3:{result = (uv3){{4, 4, grid_3d_z_size}};}break; 256 257 InvalidDefaultCase; 258 } 259 260 return result; 261 } 262 263 function uv3 264 dispatch_for_output(uv3 layout, iv3 points) 265 { 266 uv3 result; 267 result.x = (u32)ceil_f32((f32)points.x / layout.x); 268 result.y = (u32)ceil_f32((f32)points.y / layout.y); 269 result.z = (u32)ceil_f32((f32)points.z / layout.z); 270 return result; 271 } 272 273 function b32 274 compute_plan_push_shader(BeamformerComputePlan *p, BeamformerComputeGraphNode *node, BeamformerShaderParameters *sp) 275 { 276 b32 result = 0; 277 if (p->pipeline.shader_count < countof(p->pipeline.shaders)) { 278 u32 index = p->pipeline.shader_count++; 279 p->pipeline.shaders[index] = node->kind; 280 zero_struct(p->shader_descriptors + index); 281 p->pipeline.parameters[index] = sp ? *sp : (BeamformerShaderParameters){0}; 282 283 p->shader_descriptors[index].input_data_kind = node->input_data_kind; 284 p->shader_descriptors[index].output_data_kind = node->output_data_kind; 285 286 result = 1; 287 } 288 return result; 289 } 290 291 function BeamformerComputeGraphNode * 292 push_compute_graph_node(BeamformerComputeGraph *graph, BeamformerShaderKind kind, Arena *arena) 293 { 294 BeamformerComputeGraphNode *result = push_struct(arena, BeamformerComputeGraphNode); 295 if (graph) { 296 DLLInsertLast(0, graph->first, graph->last, result, next, prev); 297 graph->count++; 298 } 299 result->kind = kind; 300 result->user_pipeline_index = -1; 301 // NOTE(rnp): initially don't care data kind 302 result->input_data_kind = BeamformerDataKind_Count; 303 result->output_data_kind = BeamformerDataKind_Count; 304 return result; 305 } 306 307 function void 308 plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb, Arena scratch) 309 { 310 b32 run_hilbert = 0; 311 b32 demodulate = 0; 312 313 for (u32 i = 0; i < pb->pipeline.shader_count; i++) { 314 switch (pb->pipeline.shaders[i]) { 315 case BeamformerShaderKind_Hilbert:{run_hilbert = 1;}break; 316 case BeamformerShaderKind_Demodulate:{demodulate = 1;}break; 317 default:{}break; 318 } 319 } 320 321 if (demodulate) run_hilbert = 0; 322 323 f32 sampling_frequency = pb->parameters.sampling_frequency; 324 u32 input_sample_count = pb->parameters.sample_count; 325 u32 acquisition_count = pb->parameters.acquisition_count; 326 u32 decimation_rate = Max(pb->parameters.decimation_rate, 1); 327 328 cp->raw_channel_byte_stride = pb->parameters.sample_count * pb->parameters.acquisition_count 329 * beamformer_data_kind_byte_size[pb->pipeline.data_kind]; 330 331 BeamformerDataKind input_data_kind = pb->pipeline.data_kind; 332 if (demodulate) { 333 switch (input_data_kind) { 334 case BeamformerDataKind_Int16:{ input_data_kind = BeamformerDataKind_Int16Complex; }break; 335 case BeamformerDataKind_Float16:{input_data_kind = BeamformerDataKind_Float16Complex;}break; 336 case BeamformerDataKind_Float32:{input_data_kind = BeamformerDataKind_Float32Complex;}break; 337 default:{}break; 338 } 339 input_sample_count /= (2 * decimation_rate); 340 sampling_frequency /= (2 * decimation_rate); 341 } 342 343 cp->iq_pipeline = beamformer_data_kind_complex[input_data_kind] || run_hilbert; 344 345 BeamformerDataKind das_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex 346 : BeamformerDataKind_Float32; 347 348 cp->channel_count = pb->parameters.channel_count; 349 u32 chunk_channel_count = Min(cp->channel_count, BeamformerChunkChannelCount); 350 351 cp->rf_size = input_sample_count * pb->parameters.acquisition_count * chunk_channel_count 352 * beamformer_data_kind_byte_size[das_data_kind]; 353 354 read_only local_persist BeamformerDataKind data_kind_to_element_kind[] = { 355 [BeamformerDataKind_Int16] = BeamformerDataKind_Float16, 356 [BeamformerDataKind_Float16] = BeamformerDataKind_Float16, 357 [BeamformerDataKind_Float32] = BeamformerDataKind_Float32, 358 [BeamformerDataKind_Int16Complex] = BeamformerDataKind_Float16, 359 [BeamformerDataKind_Float16Complex] = BeamformerDataKind_Float16, 360 [BeamformerDataKind_Float32Complex] = BeamformerDataKind_Float32, 361 }; 362 363 ////////////////////////////////////// 364 // NOTE(rnp): First Pass: build initial graph and insert hard layout constraints 365 BeamformerComputeGraph graph = {0}; 366 BeamformerComputeGraphNode *root_node = push_compute_graph_node(&graph, BeamformerShaderKind_Count, &scratch); 367 root_node->input_data_kind = input_data_kind; 368 root_node->input_stride.x = 1; // Sample Stride 369 root_node->input_stride.y = pb->parameters.sample_count * acquisition_count; // Channel Stride 370 root_node->input_stride.z = pb->parameters.sample_count; // Receive Event Stride 371 root_node->output_data_kind = input_data_kind; 372 root_node->output_stride.x = 1; // Sample Stride 373 root_node->output_stride.y = pb->parameters.sample_count * acquisition_count; // Channel Stride 374 root_node->output_stride.z = pb->parameters.sample_count; // Receive Event Stride 375 376 for EachIndex(pb->pipeline.shader_count, it) { 377 // NOTE(rnp): skip unnecessary shaders 378 switch (pb->pipeline.shaders[it]) { 379 case BeamformerShaderKind_Hilbert:{if (!run_hilbert) continue;}break; 380 381 case BeamformerShaderKind_Decode:{ 382 if (pb->parameters.decode_mode == BeamformerDecodeMode_None) 383 continue; 384 }break; 385 386 case BeamformerShaderKind_Sum: 387 case BeamformerShaderKind_MinMax: 388 { 389 // NOTE(rnp): currently unsupported 390 continue; 391 }break; 392 393 default:{}break; 394 } 395 396 BeamformerComputeGraphNode *node = push_compute_graph_node(&graph, pb->pipeline.shaders[it], &scratch); 397 node->user_pipeline_index = (i32)it; 398 switch (pb->pipeline.shaders[it]) { 399 case BeamformerShaderKind_Decode:{ 400 b32 low_precision = beamformer_data_kind_element_size[input_data_kind] < 4; 401 b32 use_coop_matrix = vk_gpu_info()->cooperative_matrix && 402 low_precision && 403 (acquisition_count % 16 == 0) && 404 (chunk_channel_count % 16 == 0); 405 406 // NOTE(rnp): fixed input layout required for reasonable performance 407 if (low_precision && beamformer_data_kind_complex[input_data_kind]) 408 node->input_data_kind = BeamformerDataKind_Float16Complex; 409 node->input_stride.x = chunk_channel_count * acquisition_count; 410 node->input_stride.y = acquisition_count; 411 node->input_stride.z = 1; 412 413 if (use_coop_matrix) { 414 node->input_data_kind = BeamformerDataKind_Float16; 415 node->output_data_kind = data_kind_to_element_kind[das_data_kind]; 416 node->output_stride = node->input_stride; 417 } 418 }break; 419 420 case BeamformerShaderKind_DAS:{ 421 node->input_data_kind = das_data_kind; 422 node->input_stride.x = 1; // Sample Stride 423 node->input_stride.y = input_sample_count * acquisition_count; // Channel Stride 424 node->input_stride.z = input_sample_count; // Receive Event Stride 425 node->output_stride.x = 1; 426 node->output_stride.y = cp->output_points.x; 427 node->output_stride.z = cp->output_points.x * cp->output_points.y; 428 node->output_data_kind = cp->iq_pipeline ? BeamformerDataKind_Float32Complex 429 : BeamformerDataKind_Float32; 430 431 // NOTE(rnp): insert implicit CoherencyWeighting node 432 if (pb->parameters.coherency_weighting) 433 node = push_compute_graph_node(&graph, BeamformerShaderKind_CoherencyWeighting, &scratch); 434 }break; 435 436 default:{}break; 437 } 438 } 439 440 ////////////////////////////////////// 441 // NOTE(rnp): Second Pass: resolve layout constraints 442 for (BeamformerComputeGraphNode *node = root_node->next; node; node = node->next) { 443 b32 needs_reshape = 0; 444 445 // NOTE(rnp): data strides 446 { 447 b32 input_dont_care = bv3_any(iv3_equal(node->input_stride, (iv3){0})); 448 b32 prev_output_dont_care = bv3_any(iv3_equal(node->prev->output_stride, (iv3){0})); 449 450 if (prev_output_dont_care && !input_dont_care) 451 node->prev->output_stride = node->input_stride; 452 453 if (!prev_output_dont_care && input_dont_care) 454 node->input_stride = node->prev->output_stride; 455 456 if (prev_output_dont_care && input_dont_care) 457 node->input_stride = node->prev->output_stride = node->prev->input_stride; 458 459 needs_reshape |= !bv3_all(iv3_equal(node->input_stride, node->prev->output_stride)); 460 } 461 462 // NOTE(rnp): data kinds 463 { 464 b32 input_dont_care = node->input_data_kind == BeamformerDataKind_Count; 465 b32 prev_output_dont_care = node->prev->output_data_kind == BeamformerDataKind_Count; 466 467 if (prev_output_dont_care && !input_dont_care) 468 node->prev->output_data_kind = node->input_data_kind; 469 470 if (!prev_output_dont_care && input_dont_care) 471 node->input_data_kind = node->prev->output_data_kind; 472 473 if (prev_output_dont_care && input_dont_care) 474 node->input_data_kind = node->prev->output_data_kind = node->prev->input_data_kind; 475 476 needs_reshape |= node->input_data_kind != node->prev->output_data_kind; 477 } 478 479 // NOTE(rnp): insert reshape if needed 480 if (needs_reshape) { 481 BeamformerComputeGraphNode *new = push_compute_graph_node(0, BeamformerShaderKind_Reshape, &scratch); 482 BeamformerComputeGraphNode *last = node->prev; 483 DLLInsertLast(0, node, last, new, next, prev); 484 graph.count++; 485 new->input_data_kind = new->prev->output_data_kind; 486 new->input_stride = new->prev->output_stride; 487 new->output_data_kind = new->next->input_data_kind; 488 new->output_stride = new->next->input_stride; 489 } 490 } 491 492 f32 time_offset = pb->parameters.time_offset; 493 u32 subgroup_size = vk_gpu_info()->subgroup_size; 494 495 cp->first_image_shader_index = 0; 496 cp->pipeline.shader_count = 0; 497 498 for (BeamformerComputeGraphNode *node = root_node->next; node; node = node->next) { 499 assert(node->prev->output_data_kind == node->input_data_kind); 500 assert(bv3_all(iv3_equal(node->prev->output_stride, node->input_stride))); 501 502 BeamformerShaderParameters *sp = 0; 503 if (node->user_pipeline_index >= 0) 504 sp = pb->pipeline.parameters + node->user_pipeline_index; 505 506 if (compute_plan_push_shader(cp, node, sp)) { 507 BeamformerShaderDescriptor *sd = cp->shader_descriptors + cp->pipeline.shader_count - 1; 508 509 switch (node->kind) { 510 case BeamformerShaderKind_Decode:{ 511 BeamformerDecodeBakeParameters *db = &sd->bake.Decode; 512 513 u32 decode_sample_count = input_sample_count; 514 db->decode_mode = pb->parameters.decode_mode; 515 db->transmit_count = pb->parameters.acquisition_count; 516 db->chunk_channel_count = chunk_channel_count; 517 518 // NOTE(rnp): ignored when using coop matrices 519 db->output_sample_stride = node->output_stride.x; 520 db->output_channel_stride = node->output_stride.y; 521 db->output_transmit_stride = node->output_stride.z; 522 523 db->to_process = 1; 524 525 b32 use_coop_matrix = vk_gpu_info()->cooperative_matrix && 526 node->input_data_kind == BeamformerDataKind_Float16 && 527 (db->transmit_count % 16 == 0) && 528 (chunk_channel_count % 16 == 0); 529 if (use_coop_matrix) { 530 // TODO(rnp): shared memory for larger sizes 531 sd->layout = (uv3){{subgroup_size, 1, 1}}; 532 533 if (demodulate) 534 decode_sample_count *= 2; 535 536 db->cooperative_matrix = 1; 537 db->cooperative_matrix_m = 16; 538 db->cooperative_matrix_n = 16; 539 db->cooperative_matrix_k = 16; 540 541 sd->dispatch.x = db->transmit_count / db->cooperative_matrix_n; 542 sd->dispatch.y = chunk_channel_count / db->cooperative_matrix_m; 543 sd->dispatch.z = decode_sample_count; 544 } else if (db->transmit_count > 40) { 545 db->use_shared_memory = 1; 546 547 if (db->transmit_count == 48) 548 db->to_process = db->transmit_count / 16; 549 550 b32 use_16x = db->transmit_count == 48 || db->transmit_count == 80 || 551 db->transmit_count == 96 || db->transmit_count == 160; 552 sd->layout.x = use_16x ? 16 : 32; 553 sd->layout.y = 4; 554 sd->layout.z = 1; 555 556 sd->dispatch.x = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.x / (f32)db->to_process); 557 sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y); 558 sd->dispatch.z = (u32)ceil_f32((f32)decode_sample_count / (f32)sd->layout.z); 559 } else { 560 /* NOTE(rnp): register caching. using more threads will cause the compiler to do 561 * contortions to avoid spilling registers. using less gives higher performance */ 562 sd->layout = (uv3){{subgroup_size / 2, 1, 1}}; 563 564 sd->dispatch.x = (u32)ceil_f32((f32)decode_sample_count / (f32)sd->layout.x); 565 sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y); 566 sd->dispatch.z = 1; 567 } 568 }break; 569 570 case BeamformerShaderKind_Demodulate: 571 case BeamformerShaderKind_Filter: 572 { 573 b32 demod = node->kind == BeamformerShaderKind_Demodulate; 574 BeamformerFilter *f = cp->filters + sp->filter_slot; 575 576 time_offset += f->time_delay; 577 578 BeamformerFilterBakeParameters *fb = &sd->bake.Filter; 579 fb->filter_length = (u32)f->length; 580 fb->demodulate = demod; 581 fb->complex_filter = f->parameters.complex; 582 583 fb->sample_count = input_sample_count; 584 fb->decimation_rate = demod ? decimation_rate : 1; 585 586 b32 deinterleave = beamformer_data_kind_complex[node->input_data_kind] && 587 !beamformer_data_kind_complex[node->output_data_kind]; 588 if (deinterleave) 589 fb->batch_sample_count = chunk_channel_count * input_sample_count * pb->parameters.acquisition_count; 590 591 fb->output_sample_stride = node->output_stride.x; 592 fb->output_channel_stride = node->output_stride.y; 593 fb->output_transmit_stride = node->output_stride.z; 594 595 fb->input_sample_stride = node->input_stride.x; 596 fb->input_channel_stride = node->input_stride.y; 597 fb->input_transmit_stride = node->input_stride.z; 598 599 /* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating 600 * between sampling the I portion and the Q portion of an IQ signal. Therefore there 601 * is an implicit decimation factor of 2 which must always be included. All code here 602 * assumes that the signal was sampled in such a way that supports this operation. 603 * To recover IQ[n] from the sampled data (RF[n]) we do the following: 604 * I[n] = RF[n] 605 * Q[n] = RF[n + 1] 606 * IQ[n] = I[n] - j*Q[n] 607 */ 608 if (demod) { 609 fb->demodulation_frequency = pb->parameters.demodulation_frequency; 610 fb->sampling_frequency = pb->parameters.sampling_frequency / 2; 611 } 612 613 sd->layout = (uv3){{subgroup_size, 1, 1}}; 614 sd->dispatch.x = (u32)ceil_f32((f32)input_sample_count / (f32)sd->layout.x); 615 sd->dispatch.y = (u32)ceil_f32((f32)chunk_channel_count / (f32)sd->layout.y); 616 sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); 617 }break; 618 619 case BeamformerShaderKind_DAS:{ 620 cp->first_image_shader_index = cp->pipeline.shader_count; 621 622 BeamformerDASBakeParameters *db = &sd->bake.DAS; 623 db->sampling_frequency = sampling_frequency; 624 db->demodulation_frequency = pb->parameters.demodulation_frequency; 625 db->speed_of_sound = pb->parameters.speed_of_sound; 626 db->time_offset = time_offset; 627 db->f_number = pb->parameters.f_number; 628 db->acquisition_kind = pb->parameters.acquisition_kind; 629 db->sample_count = input_sample_count; 630 db->channel_count = pb->parameters.channel_count; 631 db->acquisition_count = pb->parameters.acquisition_count; 632 db->chunk_channel_count = chunk_channel_count; 633 db->interpolation_mode = pb->parameters.interpolation_mode; 634 db->transmit_angle = pb->parameters.focal_vector.E[0]; 635 db->focus_depth = pb->parameters.focal_vector.E[1]; 636 db->transmit_receive_orientation = pb->parameters.transmit_receive_orientation; 637 638 // NOTE(rnp): old gcc will miscompile an assignment 639 mem_copy(cp->xdc_transform.E, pb->parameters.xdc_transform.E, sizeof(cp->xdc_transform)); 640 641 cp->voxel_transform = m4_mul(cp->ui_voxel_transform, pb->parameters.das_voxel_transform); 642 cp->xdc_element_pitch = pb->parameters.xdc_element_pitch; 643 644 u32 id = pb->parameters.acquisition_kind; 645 if (id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_FORCES) 646 cp->voxel_transform = m4_mul(cp->xdc_transform, cp->voxel_transform); 647 648 db->sparse = id == BeamformerAcquisitionKind_UFORCES || id == BeamformerAcquisitionKind_UHERCULES; 649 db->single_focus = pb->parameters.single_focus; 650 db->single_orientation = pb->parameters.single_orientation; 651 db->coherency_weighting = pb->parameters.coherency_weighting; 652 653 sd->layout = layout_for_output(cp->output_points); 654 sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); 655 }break; 656 657 case BeamformerShaderKind_CoherencyWeighting:{ 658 sd->layout = layout_for_output(cp->output_points); 659 sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); 660 }break; 661 662 case BeamformerShaderKind_Reshape:{ 663 BeamformerReshapeBakeParameters *rb = &sd->bake.Reshape; 664 rb->deinterleave = beamformer_data_kind_complex[node->input_data_kind] && 665 !beamformer_data_kind_complex[node->output_data_kind]; 666 rb->interleave = !beamformer_data_kind_complex[node->input_data_kind] && 667 beamformer_data_kind_complex[node->output_data_kind]; 668 assert(rb->interleave == 0 || (rb->interleave != rb->deinterleave)); 669 670 rb->input_stride_x = node->input_stride.x; 671 rb->input_stride_y = node->input_stride.y; 672 rb->input_stride_z = node->input_stride.z; 673 rb->output_stride_x = node->output_stride.x; 674 rb->output_stride_y = node->output_stride.y; 675 rb->output_stride_z = node->output_stride.z; 676 677 // NOTE(rnp): order doesn't really matter here but it must match the dispatch layout 678 rb->size_x = input_sample_count; 679 rb->size_y = chunk_channel_count; 680 rb->size_z = acquisition_count; 681 682 sd->layout.x = 1; 683 sd->layout.z = Min(subgroup_size, rb->size_z); 684 sd->layout.y = subgroup_size / sd->layout.z; 685 686 sd->dispatch.x = (u32)(ceil_f32((f32)rb->size_x / sd->layout.x)); 687 sd->dispatch.y = (u32)(ceil_f32((f32)rb->size_y / sd->layout.y)); 688 sd->dispatch.z = (u32)(ceil_f32((f32)rb->size_z / sd->layout.z)); 689 }break; 690 691 default:{}break; 692 693 #if 0 694 case BeamformerShaderKind_Sum:{ 695 sd->bake.data_kind = BeamformerDataKind_Float32; 696 if (cp->iq_pipeline) 697 sd->bake.data_kind = BeamformerDataKind_Float32Complex; 698 699 sd->layout = layout_for_output(cp->output_points); 700 sd->dispatch = dispatch_for_output(sd->layout, cp->output_points); 701 702 commit = 1; 703 }break; 704 #endif 705 706 } 707 } 708 } 709 710 cp->pipeline.data_kind = input_data_kind; 711 712 if (cp->first_image_shader_index == 0) 713 cp->first_image_shader_index = cp->pipeline.shader_count; 714 } 715 716 function void 717 stream_append_shader_header(Stream *s, i32 reloadable_index, BeamformerShaderDescriptor *sd, uv3 layout) 718 { 719 stream_append_s8s(s, s8("#version 460 core\n\n" 720 "#extension GL_EXT_buffer_reference : require\n" 721 "#extension GL_EXT_shader_16bit_storage : require\n" 722 "#extension GL_EXT_shader_explicit_arithmetic_types : require\n\n" 723 "#define f32 float32_t\n" 724 "#define f16 float16_t\n" 725 "#define s32 int32_t\n" 726 "#define u64 uint64_t\n" 727 "#define u32 uint32_t\n" 728 "#define s16 int16_t\n" 729 "#define u16 uint16_t\n" 730 "#define s32vec2 i32vec2\n" 731 "#define s16vec2 i16vec2\n" 732 "\n")); 733 734 i32 header_vector_length = beamformer_shader_header_vector_lengths[reloadable_index]; 735 i32 *header_vector = beamformer_shader_header_vectors[reloadable_index]; 736 for (i32 index = 0; index < header_vector_length; index++) 737 stream_append_s8(s, beamformer_shader_global_header_strings[header_vector[index]]); 738 739 if (layout.x != 0) { 740 stream_append_s8(s, s8("layout(local_size_x = ")); 741 stream_append_u64(s, layout.x); 742 stream_append_s8(s, s8(", local_size_y = ")); 743 stream_append_u64(s, layout.y); 744 stream_append_s8(s, s8(", local_size_z = ")); 745 stream_append_u64(s, layout.z); 746 stream_append_s8(s, s8(") in;\n\n")); 747 } 748 749 { 750 u32 max_length = 0; 751 for EachElement(beamformer_data_kind_s8, it) 752 max_length = Max(max_length, (u32)beamformer_data_kind_s8[it].len); 753 754 for EachElement(beamformer_data_kind_s8, it) { 755 stream_append_s8s(s, s8("#define DataKind_"), beamformer_data_kind_s8[it]); 756 stream_pad(s, ' ', max_length - beamformer_data_kind_s8[it].len + 1); 757 stream_append_u64(s, it); 758 stream_append_byte(s, '\n'); 759 } 760 stream_append_byte(s, '\n'); 761 } 762 763 if (sd) { 764 BeamformerDataKind data_kinds[] = {sd->input_data_kind, sd->output_data_kind}; 765 s8 line_prefixes[] = {s8_comp("Input"), s8_comp("Output")}; 766 for EachElement(data_kinds, it) { 767 if (data_kinds[it] != BeamformerDataKind_Count) { 768 stream_append_s8s(s, s8("#define "), line_prefixes[it], s8("DataType "), 769 beamformer_data_kind_glsl_type[data_kinds[it]], 770 s8("\n#define "), line_prefixes[it], s8("DataKind DataKind_"), 771 beamformer_data_kind_s8[data_kinds[it]], 772 s8("\n#define "), line_prefixes[it], s8("DataKindByteSize ")); 773 stream_append_u64(s, beamformer_data_kind_byte_size[data_kinds[it]]); 774 stream_append_byte(s, '\n'); 775 } 776 } 777 stream_append_byte(s, '\n'); 778 779 u32 *parameters = (u32 *)&sd->bake; 780 s8 *names = beamformer_shader_bake_parameter_names[reloadable_index]; 781 u32 float_bits = beamformer_shader_bake_parameter_float_bits[reloadable_index]; 782 i32 count = beamformer_shader_bake_parameter_counts[reloadable_index]; 783 784 for (i32 index = 0; index < count; index++) { 785 stream_append_s8s(s, s8("#define "), names[index], 786 (float_bits & (1 << index))? s8(" uintBitsToFloat") : s8(" "), s8("(0x")); 787 stream_append_hex_u64(s, parameters[index]); 788 stream_append_s8(s, s8(")\n")); 789 } 790 } 791 792 if (!renderdoc_attached()) 793 stream_append_s8(s, s8("\n\n#line 1\n")); 794 } 795 796 function void 797 beamformer_reload_pipeline(VulkanHandle *pipeline, BeamformerShaderReloadInfo *sris, u32 count, Arena arena) 798 { 799 assume(count <= 2); 800 s8 paths[2]; 801 VulkanPipelineCreateInfo infos[2]; 802 803 if (!BakeShaders) { 804 for (u32 i = 0; i < count; i++) 805 paths[i] = push_s8_from_parts(&arena, os_path_separator(), s8("shaders"), sris[i].filename_or_data); 806 } 807 808 u32 push_constants_size = 0; 809 for (u32 i = 0; i < count; i++) { 810 Stream shader_stream = arena_stream(arena); 811 i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[sris[i].shader]; 812 if (i == 0) push_constants_size = beamformer_shader_push_constant_sizes[reloadable_index]; 813 else assert(push_constants_size == beamformer_shader_push_constant_sizes[reloadable_index]); 814 815 stream_append_shader_header(&shader_stream, reloadable_index, sris[i].shader_descriptor, sris[i].layout); 816 817 if (BakeShaders) { 818 stream_append_s8(&shader_stream, sris[i].filename_or_data); 819 } else { 820 shader_stream.widx += os_read_entire_file((c8 *)paths[i].data, 821 shader_stream.data + shader_stream.widx, 822 shader_stream.cap - shader_stream.widx); 823 } 824 825 infos[i].kind = sris[i].shader_kind; 826 infos[i].text = arena_stream_commit_zero(&arena, &shader_stream); 827 infos[i].name = beamformer_shader_names[sris[i].shader]; 828 829 //s8 line = s8("---------------\n"); 830 //s8 nl = s8("\n"); 831 //os_console_log(line.data, line.len); 832 //os_console_log(infos[i].name.data, infos[i].name.len); 833 //os_console_log(nl.data, nl.len); 834 //os_console_log(line.data, line.len); 835 //os_console_log(infos[i].text.data, infos[i].text.len); 836 //os_console_log(line.data, line.len); 837 } 838 839 vk_pipeline_release(*pipeline); 840 *pipeline = vk_pipeline(infos, count, push_constants_size); 841 } 842 843 function void 844 beamformer_reload_render_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, Arena arena) 845 { 846 i32 index = beamformer_shader_reloadable_index_by_shader[shader]; 847 BeamformerShaderReloadInfo infos[2] = { 848 { 849 .shader = shader, 850 .shader_kind = beamformer_shader_primitive_is_vertex[index] ? VulkanShaderKind_Vertex : VulkanShaderKind_Mesh, 851 .filename_or_data = BakeShaders ? beamformer_shader_data[index][0] 852 : beamformer_reloadable_shader_files[index][0], 853 }, 854 { 855 .shader = shader, 856 .shader_kind = VulkanShaderKind_Fragment, 857 .filename_or_data = BakeShaders ? beamformer_shader_data[index][1] 858 : beamformer_reloadable_shader_files[index][1], 859 }, 860 }; 861 beamformer_reload_pipeline(pipeline, infos, countof(infos), arena); 862 } 863 864 function void 865 beamformer_reload_compute_pipeline(VulkanHandle *pipeline, BeamformerShaderKind shader, 866 BeamformerShaderDescriptor *shader_descriptor, Arena arena) 867 { 868 i32 index = beamformer_shader_reloadable_index_by_shader[shader]; 869 uv3 layout = shader_descriptor ? shader_descriptor->layout : (uv3){{vk_gpu_info()->subgroup_size, 1, 1}}; 870 BeamformerShaderReloadInfo info = { 871 .shader = shader, 872 .shader_kind = VulkanShaderKind_Compute, 873 .shader_descriptor = shader_descriptor, 874 .filename_or_data = BakeShaders ? beamformer_shader_data[index][0] 875 : beamformer_reloadable_shader_files[index][0], 876 .layout = layout, 877 }; 878 beamformer_reload_pipeline(pipeline, &info, 1, arena); 879 } 880 881 function void 882 beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena) 883 { 884 BeamformerParameterBlock *pb = beamformer_parameter_block_lock(ctx->shared_memory, block, -1); 885 for EachBit(pb->region_update_flags, region) { 886 switch (region) { 887 case BeamformerParameterRegionFlag_NotifyUI:{ 888 atomic_store_u32(&ctx->ui_dirty_parameter_blocks, 1u << block); 889 }break; 890 891 case BeamformerParameterRegionFlag_ComputePipeline: 892 case BeamformerParameterRegionFlag_Parameters: 893 { 894 cp->output_points = das_valid_points(pb->parameters.output_points.xyz); 895 cp->average_frames = pb->parameters.output_points.E[3]; 896 897 plan_compute_pipeline(cp, pb, arena); 898 899 /* NOTE(rnp): these are both handled by plan_compute_pipeline() */ 900 u32 mask = 1 << BeamformerParameterBlockRegion_ComputePipeline | 901 1 << BeamformerParameterBlockRegion_Parameters; 902 pb->region_update_flags &= ~mask; 903 904 for (u32 shader_slot = 0; shader_slot < cp->pipeline.shader_count; shader_slot++) { 905 u128 hash = u128_hash_from_data(cp->shader_descriptors + shader_slot, sizeof(BeamformerShaderDescriptor)); 906 if (!u128_equal(hash, cp->shader_hashes[shader_slot])) 907 cp->dirty_programs |= 1 << shader_slot; 908 cp->shader_hashes[shader_slot] = hash; 909 } 910 911 cp->acquisition_count = pb->parameters.acquisition_count; 912 cp->acquisition_kind = pb->parameters.acquisition_kind; 913 914 i64 buffer_size = PING_PONG_BUFFER_SLOTS * round_up_to(cp->rf_size, 64); 915 if (ctx->compute_context.ping_pong_buffer.size < buffer_size) { 916 b32 cuda = cuda_supported(); 917 GPUBufferAllocateInfo allocate_info = { 918 .size = buffer_size, 919 .export = cuda ? &ctx->compute_context.ping_pong_export_handle : 0, 920 .label = str8("PingPongBuffer"), 921 }; 922 vk_buffer_allocate(&ctx->compute_context.ping_pong_buffer, &allocate_info); 923 924 BeamformerShaderResourceInfo shader_resource_infos[] = { 925 { 926 .kind = BeamformerShaderResourceKind_Buffer, 927 .handle = ctx->compute_context.ping_pong_buffer.handle, 928 .slot = BeamformerShaderBufferSlot_PingPong, 929 }, 930 }; 931 vk_bind_shader_resources(shader_resource_infos, countof(shader_resource_infos)); 932 933 // TODO(rnp): figure out how to share with CUDA 934 // IMPORTANT: on linux the handle is returned to os and should be cleared after import 935 // see usage of glImportMemoryFdEXT and surrounding code in ui.c for examples 936 if (cuda) { 937 } 938 } 939 940 if (cp->hadamard_order != (i32)cp->acquisition_count) 941 update_hadamard(cp, (i32)cp->acquisition_count, vk_gpu_info()->cooperative_matrix, arena); 942 }break; 943 944 case BeamformerParameterBlockRegion_ChannelMapping:{ 945 cuda_set_channel_mapping(pb->channel_mapping); 946 }break; 947 case BeamformerParameterRegionFlag_TransmitReceiveOrientations:{ 948 GPUBuffer *b = &cp->array_parameters; 949 u32 kind = BeamformerComputeArrayParameterKind_TransmitReceiveOrientations; 950 u64 offset = beamformer_compute_array_parameter_offsets[kind]; 951 u64 size = beamformer_compute_array_parameter_sizes[kind]; 952 { 953 Arena scratch = arena; 954 u16 *u16s = push_array(&scratch, u16, countof(pb->transmit_receive_orientations)); 955 for (u32 i = 0; i < countof(pb->transmit_receive_orientations); i++) 956 u16s[i] = pb->transmit_receive_orientations[i]; 957 958 vk_buffer_range_upload(b, u16s, offset, size, 0); 959 } 960 }break; 961 case BeamformerParameterRegionFlag_FocalVectors: 962 case BeamformerParameterRegionFlag_SparseElements: 963 { 964 u32 kind = BeamformerComputeArrayParameterKind_Count; 965 switch (region) { 966 case BeamformerParameterBlockRegion_FocalVectors:{ 967 kind = BeamformerComputeArrayParameterKind_FocalVectors; 968 }break; 969 case BeamformerParameterBlockRegion_SparseElements:{ 970 kind = BeamformerComputeArrayParameterKind_SparseElements; 971 }break; 972 InvalidDefaultCase; 973 } 974 975 if (kind != BeamformerComputeArrayParameterKind_Count) { 976 GPUBuffer *b = &cp->array_parameters; 977 u64 offset = beamformer_compute_array_parameter_offsets[kind]; 978 u64 size = beamformer_compute_array_parameter_sizes[kind]; 979 vk_buffer_range_upload(b, (u8 *)pb + BeamformerParameterBlockRegionOffsets[region], offset, size, 0); 980 } 981 }break; 982 } 983 } 984 beamformer_parameter_block_unlock(ctx->shared_memory, block); 985 } 986 987 function void 988 do_compute_shader(BeamformerCtx *ctx, VulkanHandle cmd, BeamformerComputePlan *cp, BeamformerFrame *frame, 989 u32 shader_slot, u32 channel_offset, u64 rf_pointer, Arena arena) 990 { 991 BeamformerComputeContext *cc = &ctx->compute_context; 992 993 u32 output_index = !cc->ping_pong_input_index; 994 u32 input_index = cc->ping_pong_input_index; 995 u32 das_output_index = PING_PONG_BUFFER_SLOTS - 1; 996 997 u64 pp_size = cc->ping_pong_buffer.size / PING_PONG_BUFFER_SLOTS; 998 u64 pp_input_pointer = cc->ping_pong_buffer.gpu_pointer + input_index * pp_size; 999 u64 pp_output_pointer = cc->ping_pong_buffer.gpu_pointer + output_index * pp_size; 1000 u64 pp_das_pointer = cc->ping_pong_buffer.gpu_pointer + das_output_index * pp_size; 1001 1002 u32 das_index = cp->first_image_shader_index - 1; 1003 1004 uv3 dispatch = cp->shader_descriptors[shader_slot].dispatch; 1005 1006 vk_command_bind_pipeline(cmd, cp->vulkan_pipelines[shader_slot]); 1007 1008 switch (cp->pipeline.shaders[shader_slot]) { 1009 1010 case BeamformerShaderKind_Decode:{ 1011 BeamformerDecodePushConstants pc = { 1012 .hadamard_buffer = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, Hadamard), 1013 .rf_buffer = pp_input_pointer, 1014 }; 1015 1016 if ((shader_slot + 1) == das_index) pc.output_buffer = pp_das_pointer; 1017 else pc.output_buffer = pp_output_pointer; 1018 1019 GPUMemoryBarrierInfo memory_barriers[]= { 1020 // NOTE(rnp): first pass or last stage output 1021 { 1022 .gpu_buffer = &cc->ping_pong_buffer, 1023 .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, 1024 .size = pp_size, 1025 }, 1026 // NOTE(rnp): output for DAS 1027 { 1028 .gpu_buffer = &cc->ping_pong_buffer, 1029 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1030 .size = pp_size, 1031 }, 1032 }; 1033 1034 u32 barrier_count = 1; 1035 if (shader_slot + 1 == das_index) 1036 barrier_count++; 1037 1038 vk_command_buffer_memory_barriers(cmd, memory_barriers, barrier_count); 1039 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1040 vk_command_dispatch_compute(cmd, dispatch); 1041 1042 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1043 }break; 1044 1045 case BeamformerShaderKind_Hilbert:{ 1046 cuda_hilbert(input_index, output_index); 1047 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1048 }break; 1049 1050 case BeamformerShaderKind_Filter: 1051 case BeamformerShaderKind_Demodulate: 1052 { 1053 BeamformerDataKind output_data_kind = cp->shader_descriptors[shader_slot].output_data_kind; 1054 1055 u64 element_size = beamformer_data_kind_byte_size[output_data_kind]; 1056 u32 filter_slot = cp->pipeline.parameters[shader_slot].filter_slot; 1057 BeamformerFilterPushConstants pc = { 1058 .filter_coefficients = cp->filters[filter_slot].buffer.gpu_pointer, 1059 .input_data = shader_slot == 0 ? rf_pointer : pp_input_pointer, 1060 .output_element_offset = output_index * pp_size / element_size, 1061 }; 1062 1063 if ((shader_slot + 1) == das_index) 1064 pc.output_element_offset = das_output_index * pp_size / element_size; 1065 1066 GPUMemoryBarrierInfo memory_barriers[] = { 1067 // NOTE(rnp): last stage output 1068 { 1069 .gpu_buffer = &cc->ping_pong_buffer, 1070 .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, 1071 .size = pp_size, 1072 }, 1073 // NOTE(rnp): output for DAS 1074 { 1075 .gpu_buffer = &cc->ping_pong_buffer, 1076 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1077 .size = pp_size, 1078 }, 1079 }; 1080 GPUMemoryBarrierInfo *barriers = memory_barriers; 1081 1082 u32 barrier_count = 2; 1083 if (shader_slot == 0) { 1084 barriers++; 1085 barrier_count--; 1086 } 1087 1088 if ((shader_slot + 1) != das_index) 1089 barrier_count--; 1090 1091 if (barrier_count) 1092 vk_command_buffer_memory_barriers(cmd, barriers, barrier_count); 1093 1094 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1095 vk_command_dispatch_compute(cmd, dispatch); 1096 1097 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1098 }break; 1099 1100 case BeamformerShaderKind_DAS:{ 1101 local_persist u32 das_cycle_t = 0; 1102 1103 GPUBuffer *b = cc->backlog.buffer; 1104 1105 u64 frame_size = beamformer_frame_byte_size(frame->points, frame->data_kind); 1106 u64 iframe_size = frame_size / beamformer_data_kind_element_count[frame->data_kind]; 1107 u64 element_size = beamformer_data_kind_byte_size[cp->shader_descriptors[shader_slot].input_data_kind]; 1108 1109 BeamformerDASPushConstants pc = { 1110 .xdc_element_pitch = cp->xdc_element_pitch, 1111 .rf_element_offset = das_output_index * pp_size / element_size, 1112 .output_frame = b->gpu_pointer + frame->buffer_offset, 1113 .incoherent_frame = b->gpu_pointer + b->size - iframe_size, 1114 .output_size_x = cp->output_points.x, 1115 .output_size_y = cp->output_points.y, 1116 .output_size_z = cp->output_points.z, 1117 .cycle_t = das_cycle_t++, 1118 .channel_offset = channel_offset, 1119 .array_parameters = cp->array_parameters.gpu_pointer + offsetof(BeamformerComputeArrayParameters, FocalVectors), 1120 }; 1121 mem_copy(pc.voxel_transform.E, cp->voxel_transform.E, sizeof(pc.voxel_transform)); 1122 mem_copy(pc.xdc_transform.E, cp->xdc_transform.E, sizeof(pc.xdc_transform)); 1123 1124 b32 coherent = cp->shader_descriptors[shader_slot].bake.DAS.coherency_weighting; 1125 1126 GPUMemoryBarrierInfo memory_barriers[] = { 1127 // NOTE(rnp): last stage data output barrier 1128 { 1129 .gpu_buffer = &cc->ping_pong_buffer, 1130 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1131 .size = pp_size, 1132 }, 1133 // NOTE(rnp): output clearing pipeline barriers or last DAS pipeline write barriers 1134 { 1135 .gpu_buffer = b, 1136 .offset = frame->buffer_offset, 1137 .size = frame_size, 1138 }, 1139 { 1140 .gpu_buffer = b, 1141 .offset = pc.incoherent_frame - b->gpu_pointer, 1142 .size = iframe_size, 1143 }, 1144 }; 1145 1146 u32 barrier_count = countof(memory_barriers); 1147 if (!coherent) barrier_count--; 1148 1149 vk_command_buffer_memory_barriers(cmd, memory_barriers, barrier_count); 1150 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1151 vk_command_dispatch_compute(cmd, dispatch); 1152 }break; 1153 1154 case BeamformerShaderKind_CoherencyWeighting:{ 1155 GPUBuffer *b = cc->backlog.buffer; 1156 1157 u64 frame_size = beamformer_frame_byte_size(frame->points, frame->data_kind); 1158 u64 iframe_size = frame_size / beamformer_data_kind_element_count[frame->data_kind]; 1159 1160 BeamformerCoherencyWeightingPushConstants pc = { 1161 .left_side_buffer = b->gpu_pointer + frame->buffer_offset, 1162 .right_side_buffer = b->gpu_pointer + b->size - iframe_size, 1163 .scale = 1.0f, 1164 .output_size_x = cp->output_points.x, 1165 .output_size_y = cp->output_points.y, 1166 .output_size_z = cp->output_points.z, 1167 }; 1168 1169 GPUMemoryBarrierInfo memory_barriers[] = { 1170 { 1171 .gpu_buffer = b, 1172 .offset = frame->buffer_offset, 1173 .size = frame_size, 1174 }, 1175 { 1176 .gpu_buffer = b, 1177 .offset = pc.right_side_buffer - b->gpu_pointer, 1178 .size = iframe_size, 1179 }, 1180 }; 1181 1182 vk_command_buffer_memory_barriers(cmd, memory_barriers, countof(memory_barriers)); 1183 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1184 vk_command_dispatch_compute(cmd, dispatch); 1185 }break; 1186 1187 case BeamformerShaderKind_Reshape:{ 1188 BeamformerDataKind input_data_kind = cp->shader_descriptors[shader_slot].input_data_kind; 1189 BeamformerReshapeBakeParameters *rb = &cp->shader_descriptors[shader_slot].bake.Reshape; 1190 u64 input_pointer = shader_slot == 0 ? rf_pointer : pp_input_pointer; 1191 BeamformerReshapePushConstants pc = { 1192 .left_input_buffer = input_pointer, 1193 .right_input_buffer = input_pointer + rb->size_x * rb->size_y * rb->size_z 1194 * beamformer_data_kind_byte_size[input_data_kind], 1195 }; 1196 1197 if ((shader_slot + 1) == das_index) pc.output_buffer = pp_das_pointer; 1198 else pc.output_buffer = pp_output_pointer; 1199 1200 GPUMemoryBarrierInfo memory_barriers[]= { 1201 // NOTE(rnp): first pass or last stage output 1202 { 1203 .gpu_buffer = &cc->ping_pong_buffer, 1204 .offset = pp_input_pointer - cc->ping_pong_buffer.gpu_pointer, 1205 .size = pp_size, 1206 }, 1207 // NOTE(rnp): output for DAS 1208 { 1209 .gpu_buffer = &cc->ping_pong_buffer, 1210 .offset = pp_das_pointer - cc->ping_pong_buffer.gpu_pointer, 1211 .size = pp_size, 1212 }, 1213 }; 1214 1215 u32 barrier_count = 1; 1216 if (shader_slot + 1 == das_index) 1217 barrier_count++; 1218 1219 vk_command_buffer_memory_barriers(cmd, memory_barriers, barrier_count); 1220 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1221 vk_command_dispatch_compute(cmd, dispatch); 1222 1223 cc->ping_pong_input_index = !cc->ping_pong_input_index; 1224 }break; 1225 1226 // NOTE(rnp): invalid stages should be filtered in planning phase 1227 InvalidDefaultCase; 1228 } 1229 1230 #if 0 1231 switch (shader) { 1232 case BeamformerShaderKind_MinMax:{ 1233 for (u32 i = 1; i < frame->image.mip_map_levels; i++) { 1234 glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); 1235 glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); 1236 glProgramUniform1i(program, MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i); 1237 1238 u32 width = (u32)frame->dim.x >> i; 1239 u32 height = (u32)frame->dim.y >> i; 1240 u32 depth = (u32)frame->dim.z >> i; 1241 glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); 1242 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 1243 } 1244 }break; 1245 case BeamformerShaderKind_Sum:{ 1246 u32 aframe_index = ctx->averaged_frame_index % countof(ctx->averaged_frames); 1247 BeamformerFrame *aframe = ctx->averaged_frames + aframe_index; 1248 aframe->id = ctx->averaged_frame_index; 1249 atomic_store_u32(&aframe->ready_to_present, 0); 1250 /* TODO(rnp): hack we need a better way of specifying which frames to sum; 1251 * this is fine for rolling averaging but what if we want to do something else */ 1252 assert(frame >= ctx->beamform_frames); 1253 assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames)); 1254 u32 base_index = (u32)(frame - ctx->beamform_frames); 1255 u32 to_average = (u32)cp->average_frames; 1256 u32 frame_count = 0; 1257 u32 *in_textures = push_array(&arena, u32, BeamformerMaxBacklogFrames); 1258 ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average, to_average); 1259 for (BeamformerFrame *it = frame_next(&cfi); it; it = frame_next(&cfi)) 1260 in_textures[frame_count++] = it->texture; 1261 1262 assert(to_average == frame_count); 1263 1264 glProgramUniform1f(program, SUM_PRESCALE_UNIFORM_LOC, 1 / (f32)frame_count); 1265 /* NOTE: zero output before summing */ 1266 glClearTexImage(aframe->texture, 0, GL_RED, GL_FLOAT, 0); 1267 glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); 1268 1269 glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); 1270 for (u32 i = 0; i < in_texture_count; i++) { 1271 glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); 1272 glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); 1273 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 1274 } 1275 1276 mem_copy(aframe->voxel_transform.E, frame->voxel_transform.E, sizeof(frame->voxel_transform)); 1277 aframe->compound_count = frame->compound_count; 1278 aframe->acquisition_kind = frame->acquisition_kind; 1279 }break; 1280 } 1281 #endif 1282 } 1283 1284 function void 1285 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena) 1286 { 1287 BeamformerComputeContext * cs = &ctx->compute_context; 1288 BeamformerSharedMemory * sm = ctx->shared_memory; 1289 1290 for (BeamformWork *work = beamform_work_queue_pop(q); 1291 work; 1292 beamform_work_queue_pop_commit(q), work = beamform_work_queue_pop(q)) 1293 { 1294 switch (work->kind) { 1295 1296 case BeamformerWorkKind_ExportBuffer:{ 1297 /* TODO(rnp): better way of handling DispatchCompute barrier */ 1298 post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute); 1299 beamformer_shared_memory_take_lock(ctx->shared_memory, (i32)work->lock, (u32)-1); 1300 BeamformerExportContext *ec = &work->export_context; 1301 switch (ec->kind) { 1302 case BeamformerExportKind_BeamformedData:{ 1303 BeamformerFrame *f = ctx->latest_frame; 1304 if (f) { 1305 u64 frame_size = beamformer_frame_byte_size(f->points, f->data_kind); 1306 assert((frame_size & 63) == 0); 1307 if (frame_size <= ec->size) { 1308 vk_host_wait_timeline(VulkanTimeline_Compute, f->timeline_valid_value, -1ULL); 1309 vk_buffer_range_download(beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, 1310 ctx->compute_context.backlog.buffer, f->buffer_offset, 1311 frame_size, 1); 1312 } 1313 } 1314 }break; 1315 case BeamformerExportKind_Stats:{ 1316 ComputeTimingTable *table = ctx->compute_timing_table; 1317 /* NOTE(rnp): do a little spin to let this finish updating */ 1318 spin_wait(table->write_index != atomic_load_u32(&table->read_index)); 1319 ComputeShaderStats *stats = ctx->compute_shader_stats; 1320 if (sizeof(stats->table) <= ec->size) 1321 mem_copy(beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, 1322 &stats->table, sizeof(stats->table)); 1323 }break; 1324 InvalidDefaultCase; 1325 } 1326 beamformer_shared_memory_release_lock(ctx->shared_memory, work->lock); 1327 post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync); 1328 }break; 1329 1330 case BeamformerWorkKind_CreateFilter:{ 1331 /* TODO(rnp): this should probably get deleted and moved to lazy loading */ 1332 BeamformerCreateFilterContext *fctx = &work->create_filter_context; 1333 u32 block = fctx->parameter_block; 1334 u32 slot = fctx->filter_slot; 1335 BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, block, arena); 1336 beamformer_filter_update(cp->filters + slot, fctx->parameters, block, slot, *arena); 1337 }break; 1338 1339 case BeamformerWorkKind_ComputeIndirect: 1340 case BeamformerWorkKind_Compute: 1341 { 1342 push_compute_timing_info(ctx->compute_timing_table, 1343 (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin}); 1344 1345 BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, work->compute_context.parameter_block, arena); 1346 if unlikely(beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) { 1347 u32 block = work->compute_context.parameter_block; 1348 beamformer_commit_parameter_block(ctx, cp, block, *arena); 1349 } 1350 1351 post_sync_barrier(ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute); 1352 1353 u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0); 1354 static_assert(BeamformerMaxComputeShaderStages <= 32, ""); 1355 if unlikely(dirty_programs) { 1356 for EachBit(dirty_programs, slot) { 1357 assert(slot < BeamformerMaxComputeShaderStages); 1358 beamformer_reload_compute_pipeline(cp->vulkan_pipelines + slot, 1359 cp->pipeline.shaders[slot], 1360 cp->shader_descriptors + slot, *arena); 1361 } 1362 } 1363 1364 atomic_store_u32(&cs->processing_compute, 1); 1365 1366 start_renderdoc_capture(); 1367 1368 i32 das_index = -1; 1369 b32 has_sum = 0; 1370 for (u32 i = 0; i < cp->pipeline.shader_count; i++) { 1371 has_sum |= cp->pipeline.shaders[i] == BeamformerShaderKind_Sum; 1372 if (cp->pipeline.shaders[i] == BeamformerShaderKind_DAS) 1373 das_index = (i32)i; 1374 } 1375 1376 b32 das_coherent = das_index >= 0 && cp->shader_descriptors[das_index].bake.DAS.coherency_weighting; 1377 u64 reserved_frame_size = 0; 1378 1379 if (has_sum) 1380 reserved_frame_size += beamformer_frame_byte_size(cp->output_points, cp->iq_pipeline ? 1381 BeamformerDataKind_Float32Complex : 1382 BeamformerDataKind_Float32); 1383 1384 // TODO(rnp): incoherent sum for different data kinds 1385 if (das_coherent) 1386 reserved_frame_size += beamformer_frame_byte_size(cp->output_points, BeamformerDataKind_Float32); 1387 1388 BeamformerFrame *frame = beamformer_frame_next(cs, cp->output_points, cp->iq_pipeline, reserved_frame_size); 1389 frame->acquisition_kind = cp->acquisition_kind; 1390 frame->compound_count = cp->acquisition_count; 1391 frame->view_plane_tag = work->compute_context.view_plane; 1392 mem_copy(frame->voxel_transform.E, cp->voxel_transform.E, sizeof(cp->voxel_transform)); 1393 1394 VulkanHandle cmd = vk_command_begin(VulkanTimeline_Compute); 1395 vk_command_timestamp(cmd); 1396 1397 if (das_index >= 0) { 1398 GPUBuffer *backlog = cs->backlog.buffer; 1399 u32 subgroup_size = vk_gpu_info()->subgroup_size; 1400 BeamformerBufferClearPushConstants pc = { 1401 .data = backlog->gpu_pointer + frame->buffer_offset, 1402 .clear_v4 = (uv4){{0}}, 1403 .bins = beamformer_frame_byte_size(frame->points, frame->data_kind) / sizeof(uv4), 1404 }; 1405 1406 u32 index = BeamformerShaderKind_BufferClear - BeamformerShaderKind_ComputeInternalFirst; 1407 vk_command_bind_pipeline(cmd, cs->compute_internal_pipelines[index]); 1408 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1409 vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.bins / subgroup_size), 1, 1}}); 1410 1411 if (das_coherent) { 1412 assert((pc.bins % beamformer_data_kind_element_count[frame->data_kind]) == 0); 1413 pc.bins = pc.bins / beamformer_data_kind_element_count[frame->data_kind]; 1414 pc.data = backlog->gpu_pointer + backlog->size - sizeof(uv4) * pc.bins; 1415 vk_command_push_constants(cmd, 0, sizeof(pc), &pc); 1416 vk_command_dispatch_compute(cmd, (uv3){{(u32)ceil_f32((f32)pc.bins / subgroup_size), 1, 1}}); 1417 } 1418 } 1419 1420 BeamformerRFBuffer *rf = &cs->rf_buffer; 1421 u32 compute_index = rf->compute_index; 1422 u32 slot = compute_index % countof(rf->upload_complete_values); 1423 1424 if (work->kind == BeamformerWorkKind_ComputeIndirect) { 1425 // TODO(rnp): this shouldn't be necessary, there should be a way of communicating 1426 // what the value will be so that the only the command wait is needed. 1427 spin_wait(atomic_load_u64(&rf->insertion_index) <= compute_index); 1428 1429 /* NOTE(rnp): if the GPU supports BAR there may be no need to synchronize 1430 * other than the above spin */ 1431 if (vk_buffer_needs_sync(&rf->buffer)) 1432 vk_command_wait_timeline(cmd, VulkanTimeline_Transfer, rf->upload_complete_values[slot]); 1433 } else { 1434 slot = (rf->compute_index - 1) % countof(rf->upload_complete_values); 1435 } 1436 1437 for (u32 channel_offset = 0; 1438 channel_offset < cp->channel_count; 1439 channel_offset += BeamformerChunkChannelCount) 1440 { 1441 u64 rf_pointer = rf->buffer.gpu_pointer + slot * rf->active_rf_size; 1442 rf_pointer += cp->raw_channel_byte_stride * channel_offset; 1443 for (u32 i = 0; i < cp->first_image_shader_index; i++) { 1444 do_compute_shader(ctx, cmd, cp, frame, i, channel_offset, rf_pointer, *arena); 1445 vk_command_timestamp(cmd); 1446 } 1447 } 1448 1449 for (u32 i = cp->first_image_shader_index; i < cp->pipeline.shader_count; i++) { 1450 do_compute_shader(ctx, cmd, cp, frame, i, 0, 0, *arena); 1451 vk_command_timestamp(cmd); 1452 } 1453 1454 u64 end_timeline_value = vk_command_end(cmd, (VulkanHandle){0}, (VulkanHandle){0}); 1455 if (work->kind == BeamformerWorkKind_ComputeIndirect) { 1456 atomic_store_u64(rf->compute_complete_values + slot, end_timeline_value); 1457 atomic_add_u64(&rf->compute_index, 1); 1458 } 1459 1460 atomic_store_u64(&frame->timeline_valid_value, end_timeline_value); 1461 1462 { 1463 Arena scratch = *arena; 1464 /* NOTE(rnp): this blocks until work completes */ 1465 u64 *timestamps = vk_command_read_timestamps(VulkanTimeline_Compute, &scratch); 1466 1467 i32 steps = ((i32)cp->channel_count / BeamformerChunkChannelCount) - 1; 1468 i32 step = 0; 1469 u32 shader_index = 0; 1470 u64 last_time = timestamps[0] > 0 ? timestamps[1] : 0; 1471 1472 for (u64 i = 2; i < timestamps[0] + 1; i++) { 1473 push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){ 1474 .kind = ComputeTimingInfoKind_Shader, 1475 .shader = cp->pipeline.shaders[shader_index], 1476 .shader_slot = shader_index, 1477 .timer_count = timestamps[i] - last_time, 1478 }); 1479 last_time = timestamps[i]; 1480 1481 shader_index++; 1482 if (shader_index == cp->first_image_shader_index && step < steps) { 1483 shader_index = 0; 1484 step++; 1485 } 1486 } 1487 } 1488 1489 cs->processing_progress = 1; 1490 1491 if (has_sum) { 1492 #if 0 1493 u32 aframe_index = ((ctx->averaged_frame_index++) % countof(ctx->averaged_frames)); 1494 ctx->averaged_frames[aframe_index].view_plane_tag = frame->view_plane_tag; 1495 ctx->averaged_frames[aframe_index].ready_to_present = 1; 1496 atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index)); 1497 #endif 1498 } else { 1499 atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame); 1500 } 1501 1502 atomic_store_u32(&cs->processing_compute, 0); 1503 1504 push_compute_timing_info(ctx->compute_timing_table, 1505 (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd}); 1506 1507 end_renderdoc_capture(); 1508 }break; 1509 InvalidDefaultCase; 1510 } 1511 } 1512 } 1513 1514 function void 1515 coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats) 1516 { 1517 /* TODO(rnp): we do not currently do anything to handle the potential for a half written 1518 * info item. this could result in garbage entries but they shouldn't really matter */ 1519 1520 u32 target = atomic_load_u32(&t->write_index); 1521 u32 stats_index = stats->latest_frame_index; 1522 1523 b32 has_rf = 0; 1524 f32 gpu_clocks_to_nano = 1.0e-9f * vk_gpu_info()->timestamp_period_ns; 1525 1526 // NOTE(rnp): not equal (the index may wrap) 1527 while (t->read_index != target) { 1528 ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)]; 1529 switch (info.kind) { 1530 1531 case ComputeTimingInfoKind_ComputeFrameBegin:{ 1532 assert(t->compute_frame_active == 0); 1533 t->compute_frame_active = 1; 1534 /* NOTE(rnp): allow multiple instances of same shader to accumulate */ 1535 t->in_flight_shader_count = 0; 1536 memory_clear(t->in_flight_shader_ids, 0, sizeof(t->in_flight_shader_ids)); 1537 memory_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index])); 1538 }break; 1539 1540 case ComputeTimingInfoKind_ComputeFrameEnd:{ 1541 assert(t->compute_frame_active == 1); 1542 t->compute_frame_active = 0; 1543 stats_index = stats->latest_frame_index = (stats_index + 1) % countof(stats->table.times); 1544 stats->table.shader_count = t->in_flight_shader_count; 1545 mem_copy(stats->table.shader_ids, t->in_flight_shader_ids, sizeof(t->in_flight_shader_ids)); 1546 }break; 1547 1548 case ComputeTimingInfoKind_Shader:{ 1549 t->in_flight_shader_count = Max(t->in_flight_shader_count, info.shader_slot + 1u); 1550 t->in_flight_shader_ids[info.shader_slot] = info.shader; 1551 stats->table.times[stats_index][info.shader_slot] += info.timer_count * gpu_clocks_to_nano; 1552 }break; 1553 1554 case ComputeTimingInfoKind_RF_Data:{ 1555 stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas); 1556 f32 delta = info.timer_count / (f32)os_system_info()->timer_frequency; 1557 stats->table.rf_time_deltas[stats->latest_rf_index] = delta; 1558 has_rf = 1; 1559 }break; 1560 } 1561 /* NOTE(rnp): do this at the end so that stats table is always in a consistent state */ 1562 t->read_index++; 1563 } 1564 1565 for (u32 i = 0; i < stats->table.shader_count; i++) { 1566 f32 sum = 0; 1567 for EachElement(stats->table.times, it) 1568 sum += stats->table.times[it][i]; 1569 stats->average_times[i] = sum / countof(stats->table.times); 1570 } 1571 1572 if (has_rf) { 1573 f32 sum = 0; 1574 for EachElement(stats->table.rf_time_deltas, i) 1575 sum += stats->table.rf_time_deltas[i]; 1576 stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas); 1577 } 1578 } 1579 1580 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute) 1581 { 1582 BeamformerSharedMemory *sm = ctx->shared_memory; 1583 complete_queue(ctx, &sm->external_work_queue, arena); 1584 complete_queue(ctx, ctx->beamform_work_queue, arena); 1585 } 1586 1587 DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) 1588 { 1589 BeamformerSharedMemory *sm = ctx->shared_memory; 1590 BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace; 1591 BeamformerSharedMemoryLockKind upload_lock = BeamformerSharedMemoryLockKind_UploadRF; 1592 1593 u64 rf_block_rf_size; 1594 if (atomic_load_u32(sm->locks + upload_lock) && 1595 (rf_block_rf_size = atomic_swap_u64(&sm->rf_block_rf_size, 0))) 1596 { 1597 beamformer_shared_memory_take_lock(ctx->shared_memory, (i32)scratch_lock, (u32)-1); 1598 1599 BeamformerRFBuffer *rf = ctx->rf_buffer; 1600 1601 rf->active_rf_size = vk_round_up_to_sync_size(rf_block_rf_size & 0xFFFFFFFFULL, 64); 1602 if unlikely(rf->buffer.size < countof(rf->upload_complete_values) * rf->active_rf_size) { 1603 GPUBufferAllocateInfo allocate_info = { 1604 .size = countof(rf->upload_complete_values) * rf->active_rf_size, 1605 .flags = VulkanUsageFlag_HostReadWrite, 1606 .label = str8("RawRFBuffer"), 1607 }; 1608 vk_buffer_allocate(&rf->buffer, &allocate_info); 1609 } 1610 1611 u64 slot = rf->insertion_index % countof(rf->upload_complete_values); 1612 1613 /* NOTE(rnp): don't overwrite slot if the compute thread hasn't processed it */ 1614 spin_wait(atomic_load_u64(&rf->compute_index) < rf->insertion_index); 1615 vk_host_wait_timeline(VulkanTimeline_Compute, rf->compute_complete_values[slot], -1ULL); 1616 1617 vk_buffer_range_upload(&rf->buffer, beamformer_shared_memory_scratch_arena(sm, ctx->shared_memory_size).beg, 1618 slot * rf->active_rf_size, rf->active_rf_size, 1); 1619 store_fence(); 1620 1621 beamformer_shared_memory_release_lock(ctx->shared_memory, (i32)scratch_lock); 1622 post_sync_barrier(ctx->shared_memory, upload_lock); 1623 1624 atomic_store_u64(rf->upload_complete_values + slot, vk_host_signal_timeline(VulkanTimeline_Transfer)); 1625 atomic_add_u64(&rf->insertion_index, 1); 1626 1627 os_wake_all_waiters(ctx->compute_worker_sync); 1628 1629 u64 current_time = os_timer_count(); 1630 push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){ 1631 .kind = ComputeTimingInfoKind_RF_Data, 1632 .timer_count = current_time - rf->timestamp, 1633 }); 1634 rf->timestamp = current_time; 1635 } 1636 } 1637 1638 function void 1639 beamformer_queue_compute(BeamformerCtx *ctx, BeamformerFrame *frame, u32 parameter_block) 1640 { 1641 BeamformerSharedMemory *sm = ctx->shared_memory; 1642 BeamformerSharedMemoryLockKind dispatch_lock = BeamformerSharedMemoryLockKind_DispatchCompute; 1643 if (!sm->live_imaging_parameters.active && beamformer_shared_memory_take_lock(sm, (i32)dispatch_lock, 0)) 1644 { 1645 BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); 1646 if (work) { 1647 work->kind = BeamformerWorkKind_Compute; 1648 work->compute_context.view_plane = frame ? frame->view_plane_tag : 0; 1649 work->compute_context.parameter_block = parameter_block; 1650 beamform_work_queue_push_commit(ctx->beamform_work_queue); 1651 } 1652 } 1653 os_wake_all_waiters(&ctx->compute_worker.sync_variable); 1654 } 1655 1656 #include "ui.c" 1657 1658 function void 1659 beamformer_process_input_events(BeamformerCtx *ctx, BeamformerInput *input, 1660 BeamformerInputEvent *events, u32 event_count) 1661 { 1662 for (u32 index = 0; index < event_count; index++) { 1663 BeamformerInputEvent *event = events + index; 1664 switch (event->kind) { 1665 1666 // NOTE(rnp): ui will handle these 1667 case BeamformerInputEventKind_ButtonPress: 1668 case BeamformerInputEventKind_ButtonRelease: 1669 case BeamformerInputEventKind_MouseScroll: 1670 case BeamformerInputEventKind_WindowResize: 1671 {}break; 1672 1673 case BeamformerInputEventKind_ExecutableReload:{ 1674 ui_init(ctx, ctx->ui_backing_store); 1675 1676 if (!vk_pipeline_valid(ctx->compute_context.compute_internal_pipelines[0])) { 1677 for EachElement(ctx->compute_context.compute_internal_pipelines, it) { 1678 beamformer_reload_compute_pipeline(ctx->compute_context.compute_internal_pipelines + it, 1679 BeamformerShaderKind_ComputeInternalFirst + it, 0, 1680 ctx->arena); 1681 } 1682 } 1683 }break; 1684 1685 case BeamformerInputEventKind_FileEvent:{ 1686 BeamformerFileReloadContext *frc = event->file_watch_user_context; 1687 switch (frc->kind) { 1688 case BeamformerFileReloadKind_ComputeInternalShader:{ 1689 // TODO(rnp): this could stall, better to push it onto compute once queue is better 1690 beamformer_reload_compute_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, 0, ctx->arena); 1691 }break; 1692 1693 case BeamformerFileReloadKind_ComputeShader:{ 1694 for EachElement(ctx->compute_context.compute_plans, block) { 1695 BeamformerComputePlan *cp = ctx->compute_context.compute_plans[block]; 1696 for (u32 slot = 0; cp && slot < cp->pipeline.shader_count; slot++) { 1697 i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]]; 1698 if (beamformer_reloadable_shader_kinds[shader_index] == frc->shader_reload.shader) 1699 atomic_or_u32(&cp->dirty_programs, 1 << slot); 1700 } 1701 } 1702 1703 // TODO(rnp): track latest parameter block 1704 if (ctx->latest_frame) 1705 beamformer_queue_compute(ctx, ctx->latest_frame, 0); 1706 }break; 1707 1708 case BeamformerFileReloadKind_RenderShader:{ 1709 beamformer_reload_render_pipeline(frc->shader_reload.pipeline, frc->shader_reload.shader, ctx->arena); 1710 ctx->render_shader_updated = 1; 1711 }break; 1712 1713 InvalidDefaultCase; 1714 } 1715 }break; 1716 1717 InvalidDefaultCase; 1718 } 1719 } 1720 } 1721 1722 BEAMFORMER_EXPORT void 1723 beamformer_frame_step(BeamformerInput *input) 1724 { 1725 BeamformerCtx *ctx = BeamformerContextMemory(input->memory); 1726 1727 u64 current_time = os_timer_count(); 1728 dt_for_frame = (f64)(current_time - ctx->frame_timestamp) / os_system_info()->timer_frequency; 1729 ctx->frame_timestamp = current_time; 1730 1731 coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats); 1732 1733 beamformer_process_input_events(ctx, input, input->event_queue, input->event_count); 1734 1735 BeamformerSharedMemory *sm = ctx->shared_memory; 1736 if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_UploadRF)) 1737 os_wake_all_waiters(&ctx->upload_worker.sync_variable); 1738 if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_DispatchCompute)) 1739 os_wake_all_waiters(&ctx->compute_worker.sync_variable); 1740 1741 BeamformerFrame *frame = ctx->latest_frame; 1742 BeamformerViewPlaneTag tag = frame? frame->view_plane_tag : 0; 1743 draw_ui(ctx, input, frame, tag); 1744 1745 ctx->render_shader_updated = 0; 1746 }