beamformer.c - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

beamformer.c (21661B)
      1 /* See LICENSE for license details. */
      2 
      3 #include "beamformer_internal.h"
      4 
/* NOTE(rnp): magic variables to force discrete GPU usage on laptops with multiple devices */
/* Both symbols are exported from the binary with the value 1; nothing in this
 * file reads them (presumably the GPU vendor drivers look them up by name). */
EXPORT i32 NvOptimusEnablement = 1;
EXPORT i32 AmdPowerXpressRequestHighPerformance = 1;
      8 
      9 #if !BEAMFORMER_DEBUG
     10 #include "beamformer_core.c"
     11 #else
     12 
/* Function type used for the `beamformer_frame_step` pointer defined below. */
typedef void beamformer_frame_step_fn(BeamformerInput *);

/* X-macro list of the entry points that are reached through function pointers
 * in debug builds. Each entry `name` requires a matching `name_fn` type; the
 * pointers are filled in by beamformer_debug_hot_reload(). */
#define BEAMFORMER_DEBUG_ENTRY_POINTS \
	X(beamformer_debug_ui_deinit)  \
	X(beamformer_complete_compute) \
	X(beamformer_frame_step)       \
	X(beamformer_rf_upload)        \

/* Define one global function pointer per entry point in the list above. */
#define X(name) global name ##_fn *name;
BEAMFORMER_DEBUG_ENTRY_POINTS
#undef X
     24 
     25 BEAMFORMER_EXPORT void
     26 beamformer_debug_hot_release(BeamformerInput *input)
     27 {
     28 	BeamformerCtx *ctx = BeamformerContextMemory(input->memory);
     29 
     30 	// TODO(rnp): this will deadlock if live imaging is active
     31 	/* NOTE(rnp): spin until compute thread finishes its work (we will probably
     32 	 * never reload while compute is in progress but just incase). */
     33 	spin_wait(atomic_load_u32(&ctx->upload_worker.awake));
     34 	spin_wait(atomic_load_u32(&ctx->compute_worker.awake));
     35 }
     36 
     37 BEAMFORMER_EXPORT void
     38 beamformer_debug_hot_reload(OSLibrary library, BeamformerInput *input)
     39 {
     40 	#define X(name) name = os_lookup_symbol(library, #name);
     41 	BEAMFORMER_DEBUG_ENTRY_POINTS
     42 	#undef X
     43 
     44 	s8 info = beamformer_info("reloaded main executable");
     45 	os_console_log(info.data, info.len);
     46 }
     47 
     48 #endif /* BEAMFORMER_DEBUG */
     49 
/* Hands `message` to os_fatal() and does not return: the function is declared
 * no_return and the code after the call is marked unreachable. */
function no_return void
fatal(s8 message)
{
	os_fatal(message.data, message.len);
	unreachable();
}
     56 
// TODO(rnp): none of this belongs here, but will be removed
// once vulkan migration is complete
/* Hand-written declarations of the few GLFW entry points used in this file;
 * no GLFW header is included here. Window and context handles are passed
 * around as `iptr` values. */
#define GLFW_VISIBLE 0x00020004
void   glfwWindowHint(i32, i32);
iptr   glfwCreateWindow(i32, i32, char *, iptr, iptr);
void   glfwMakeContextCurrent(iptr);
iptr   glfwGetGLXContext(iptr);
iptr   glfwGetWGLContext(iptr);
void * glfwGetProcAddress(char *);
     66 
     67 #if OS_WINDOWS
     68 function iptr
     69 os_get_native_gl_context(iptr window)
     70 {
     71 	return glfwGetWGLContext(window);
     72 }
     73 #else
     74 function iptr
     75 os_get_native_gl_context(iptr window)
     76 {
     77 	return glfwGetGLXContext(window);
     78 }
     79 #endif
     80 
     81 function void
     82 gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx)
     83 {
     84 	Stream *e = (Stream *)userctx;
     85 	stream_append_s8s(e, s8("[OpenGL] "), (s8){.len = len, .data = (u8 *)msg}, s8("\n"));
     86 	os_console_log(e->data, e->widx);
     87 	stream_reset(e, 0);
     88 }
     89 
     90 function void
     91 load_gl(Stream *err)
     92 {
     93 	#define X(name, ret, params) name = (name##_fn *)glfwGetProcAddress(#name);
     94 	OGLProcedureList
     95 	#undef X
     96 
     97 	/* NOTE: Gather information about the GPU */
     98 	{
     99 		char *vendor = (char *)glGetString(GL_VENDOR);
    100 		if (!vendor) {
    101 			stream_append_s8(err, s8("Failed to determine GL Vendor\n"));
    102 			fatal(stream_to_s8(err));
    103 		}
    104 		/* TODO(rnp): str prefix of */
    105 		switch (vendor[0]) {
    106 		case 'A': gl_parameters.vendor_id = GLVendor_AMD;    break;
    107 		case 'I': gl_parameters.vendor_id = GLVendor_Intel;  break;
    108 		case 'N': gl_parameters.vendor_id = GLVendor_NVIDIA; break;
    109 		/* NOTE(rnp): freedreno */
    110 		case 'f': gl_parameters.vendor_id = GLVendor_ARM;    break;
    111 		/* NOTE(rnp): Microsoft Corporation - weird win32 thing (microsoft is just using mesa for the driver) */
    112 		case 'M': gl_parameters.vendor_id = GLVendor_ARM;    break;
    113 		default:
    114 			stream_append_s8s(err, s8("Unknown GL Vendor: "), c_str_to_s8(vendor), s8("\n"));
    115 			fatal(stream_to_s8(err));
    116 		}
    117 
    118 		#define X(glname, name, suffix) glGetIntegerv(GL_##glname, &gl_parameters.name);
    119 		GL_PARAMETERS
    120 		#undef X
    121 	}
    122 
    123 #ifdef _DEBUG
    124 	{
    125 		s8 vendor = s8("vendor:");
    126 		i32 max_width = (i32)vendor.len;
    127 		#define X(glname, name, suffix) if (s8(#name).len > max_width) max_width = (i32)s8(#name ":").len;
    128 		GL_PARAMETERS
    129 		#undef X
    130 		max_width++;
    131 
    132 		stream_append_s8s(err, s8("---- GL Parameters ----\n"), vendor);
    133 		stream_pad(err, ' ', max_width - (i32)vendor.len);
    134 		switch (gl_parameters.vendor_id) {
    135 		case GLVendor_AMD:    stream_append_s8(err, s8("AMD"));    break;
    136 		case GLVendor_ARM:    stream_append_s8(err, s8("ARM"));    break;
    137 		case GLVendor_Intel:  stream_append_s8(err, s8("Intel"));  break;
    138 		case GLVendor_NVIDIA: stream_append_s8(err, s8("nVidia")); break;
    139 		}
    140 		stream_append_byte(err, '\n');
    141 
    142 		#define X(glname, name, suffix) \
    143 			stream_append_s8(err, s8(#name ":"));                     \
    144 			stream_pad(err, ' ', max_width - (i32)s8(#name ":").len); \
    145 			stream_append_i64(err, gl_parameters.name);               \
    146 			stream_append_s8(err, s8(suffix "\n"));
    147 		GL_PARAMETERS
    148 		#undef X
    149 		stream_append_s8(err, s8("-----------------------\n"));
    150 		os_console_log(err->data, err->widx);
    151 	}
    152 #endif
    153 
    154 	{
    155 		stream_reset(err, 0);
    156 		if (gl_parameters.max_ubo_size < (i32)sizeof(BeamformerParameters)) {
    157 			stream_append_s8(err, s8("GPU must support UBOs of at least "));
    158 			stream_append_i64(err, sizeof(BeamformerParameters));
    159 			stream_append_s8(err, s8(" bytes!\n"));
    160 		}
    161 
    162 		#define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n"));
    163 		OGLProcedureList
    164 		#undef X
    165 
    166 		if (err->widx) fatal(stream_to_s8(err));
    167 	}
    168 }
    169 
    170 function void
    171 beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena)
    172 {
    173 	/* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently
    174 	 * causing a major performance regression. for now we are disabling its use
    175 	 * altogether. it will be reenabled once the issue can be fixed */
    176 	b32 result = 0 && gl_parameters.vendor_id == GLVendor_NVIDIA && ValidHandle(cuda);
    177 	if (result) {
    178 		Stream err = arena_stream(arena);
    179 
    180 		stream_append_s8(&err, beamformer_info("loading CUDA library functions"));
    181 		#define X(name, symname) cuda_## name = os_lookup_symbol(cuda, symname);
    182 		CUDALibraryProcedureList
    183 		#undef X
    184 
    185 		os_console_log(err.data, err.widx);
    186 	}
    187 
    188 	#define X(name, symname) if (!cuda_## name) cuda_## name = cuda_ ## name ## _stub;
    189 	CUDALibraryProcedureList
    190 	#undef X
    191 }
    192 
    193 function BeamformerRenderModel
    194 render_model_from_arrays(f32 *vertices, f32 *normals, i32 vertices_size, u16 *indices, i32 index_count)
    195 {
    196 	BeamformerRenderModel result = {0};
    197 
    198 	i32 buffer_size    = vertices_size * 2 + index_count * (i32)sizeof(u16);
    199 	i32 indices_offset = vertices_size * 2;
    200 	i32 indices_size   = index_count * (i32)sizeof(u16);
    201 
    202 	result.elements        = index_count;
    203 	result.elements_offset = indices_offset;
    204 
    205 	glCreateBuffers(1, &result.buffer);
    206 	glNamedBufferStorage(result.buffer, buffer_size, 0, GL_DYNAMIC_STORAGE_BIT);
    207 	glNamedBufferSubData(result.buffer, 0,              vertices_size, vertices);
    208 	glNamedBufferSubData(result.buffer, vertices_size,  vertices_size, normals);
    209 	glNamedBufferSubData(result.buffer, indices_offset, indices_size,  indices);
    210 
    211 	glCreateVertexArrays(1, &result.vao);
    212 	glVertexArrayVertexBuffer(result.vao, 0, result.buffer, 0,             3 * sizeof(f32));
    213 	glVertexArrayVertexBuffer(result.vao, 1, result.buffer, vertices_size, 3 * sizeof(f32));
    214 	glVertexArrayElementBuffer(result.vao, result.buffer);
    215 
    216 	glEnableVertexArrayAttrib(result.vao, 0);
    217 	glEnableVertexArrayAttrib(result.vao, 1);
    218 
    219 	glVertexArrayAttribFormat(result.vao, 0, 3, GL_FLOAT, 0, 0);
    220 	glVertexArrayAttribFormat(result.vao, 1, 3, GL_FLOAT, 0, (u32)vertices_size);
    221 
    222 	glVertexArrayAttribBinding(result.vao, 0, 0);
    223 	glVertexArrayAttribBinding(result.vao, 1, 0);
    224 
    225 	return result;
    226 }
    227 
    228 function void
    229 worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm)
    230 {
    231 	for (;;) {
    232 		i32 expected = 0;
    233 		if (atomic_cas_u32(&ctx->sync_variable, &expected, 1) ||
    234 		    atomic_load_u32(&sm->live_imaging_parameters.active))
    235 		{
    236 			break;
    237 		}
    238 
    239 		/* TODO(rnp): clean this crap up; we shouldn't need two values to communicate this */
    240 		atomic_store_u32(&ctx->awake, 0);
    241 		os_wait_on_address(&ctx->sync_variable, 1, (u32)-1);
    242 		atomic_store_u32(&ctx->awake, 1);
    243 	}
    244 }
    245 
    246 function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
    247 {
    248 	GLWorkerThreadContext *ctx = user_context;
    249 
    250 	glfwMakeContextCurrent(ctx->window_handle);
    251 	ctx->gl_context = os_get_native_gl_context(ctx->window_handle);
    252 
    253 	BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context;
    254 	glCreateQueries(GL_TIME_ELAPSED, countof(beamformer->compute_context.shader_timer_ids),
    255 	                beamformer->compute_context.shader_timer_ids);
    256 
    257 	for (;;) {
    258 		worker_thread_sleep(ctx, beamformer->shared_memory);
    259 		asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
    260 		beamformer_complete_compute(ctx->user_context, &ctx->arena, ctx->gl_context);
    261 	}
    262 
    263 	unreachable();
    264 
    265 	return 0;
    266 }
    267 
    268 function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point)
    269 {
    270 	GLWorkerThreadContext *ctx = user_context;
    271 	glfwMakeContextCurrent(ctx->window_handle);
    272 	ctx->gl_context = os_get_native_gl_context(ctx->window_handle);
    273 
    274 	BeamformerUploadThreadContext *up = (typeof(up))ctx->user_context;
    275 	glCreateQueries(GL_TIMESTAMP, 1, &up->rf_buffer->data_timestamp_query);
    276 	/* NOTE(rnp): start this here so we don't have to worry about it being started or not */
    277 	glQueryCounter(up->rf_buffer->data_timestamp_query, GL_TIMESTAMP);
    278 
    279 	for (;;) {
    280 		worker_thread_sleep(ctx, up->shared_memory);
    281 		beamformer_rf_upload(up);
    282 	}
    283 
    284 	unreachable();
    285 
    286 	return 0;
    287 }
    288 
/* One-time initialization of the beamformer.
 *
 * Splits `input->memory` into the arenas used by the compute worker, upload
 * worker, UI and error stream, places the BeamformerCtx in it, opens the main
 * window, loads OpenGL, prepares the shared memory region, starts the compute
 * and upload worker threads, registers shader file watches, builds the frame
 * view render targets / shader reload contexts and uploads the unit cube
 * model. On return `ctx->state` is BeamformerState_Running.
 *
 * Calls fatal() (does not return) if the shared memory region is smaller than
 * BeamformerSharedMemory or, on Windows, if a semaphore cannot be created.
 * load_gl() may also terminate the program. */
BEAMFORMER_EXPORT void
beamformer_init(BeamformerInput *input)
{
	/* NOTE: carve the fixed size sub regions out of the provided memory block */
	Arena  memory        = arena_from_memory(input->memory, input->memory_size);
	Arena  compute_arena = sub_arena_end(&memory, MB(2), KB(4));
	Arena  upload_arena  = sub_arena_end(&memory, KB(4), KB(4));
	Arena  ui_arena      = sub_arena_end(&memory, MB(2), KB(4));
	Stream error         = arena_stream(sub_arena_end(&memory, MB(1), 1));
	BeamformerCtx *ctx   = push_struct(&memory, BeamformerCtx);

	/* NOTE: temporarily reserve the last 4096 bytes of `memory` as scratch space;
	 * they are given back to `memory` at the end of this function */
	Arena scratch = {.beg = memory.end - 4096L, .end = memory.end};
	memory.end = scratch.beg;

	ctx->window_size = (iv2){{1280, 840}};
	ctx->error_stream = error;
	ctx->ui_backing_store = ui_arena;

	ctx->compute_worker.arena  = compute_arena;
	ctx->upload_worker.arena   = upload_arena;

	/* NOTE: `memory` is passed by value, nothing is permanently allocated from it here */
	beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory);

	SetConfigFlags(FLAG_VSYNC_HINT|FLAG_WINDOW_ALWAYS_RUN);
	InitWindow(ctx->window_size.w, ctx->window_size.h, "OGL Beamformer");
	/* NOTE: do this after initing so that the window starts out floating in tiling wm */
	SetWindowState(FLAG_WINDOW_RESIZABLE);
	SetWindowMinSize(840, ctx->window_size.h);

	/* NOTE(review): presumably this hint hides the 1x1 worker windows created
	 * further down - confirm against the GLFW documentation */
	glfwWindowHint(GLFW_VISIBLE, 0);
	iptr raylib_window_handle = (iptr)GetPlatformWindowHandle();

	load_gl(&ctx->error_stream);

	ctx->beamform_work_queue  = push_struct(&memory, BeamformWorkQueue);
	ctx->compute_shader_stats = push_struct(&memory, ComputeShaderStats);
	ctx->compute_timing_table = push_struct(&memory, ComputeTimingTable);

	/* NOTE: the shared memory region must be able to hold the whole struct;
	 * it is cleared before the header fields are filled in */
	ctx->shared_memory      = input->shared_memory;
	ctx->shared_memory_size = input->shared_memory_size;
	if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory))
		fatal(s8("Get more ram lol\n"));
	zero_struct(ctx->shared_memory);

	ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION;
	ctx->shared_memory->reserved_parameter_blocks = 1;

	/* TODO(rnp): I'm not sure if its a good idea to pre-reserve a bunch of semaphores
	 * on w32 but thats what we are doing for now */
	#if OS_WINDOWS
	{
		/* NOTE: semaphore names are "<shared memory name>_lock_<index>"; the stream
		 * is rewound to just after the prefix for every index */
		Stream sb = arena_stream(memory);
		stream_append(&sb, input->shared_memory_name, input->shared_memory_name_length);
		stream_append_s8(&sb, s8("_lock_"));
		i32 start_index = sb.widx;
		for EachElement(os_w32_shared_memory_semaphores, it) {
			stream_reset(&sb, start_index);
			stream_append_u64(&sb, it);
			stream_append_byte(&sb, 0);
			os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1);
			if InvalidHandle(os_w32_shared_memory_semaphores[it])
				fatal(beamformer_info("init: failed to create w32 shared memory semaphore\n"));

			/* NOTE(rnp): hacky garbage because CreateSemaphore will just open an existing
			 * semaphore without any indication. Sometimes the other side of the shared memory
			 * will provide incorrect parameters or will otherwise fail and its faster to
			 * restart this program than to get that application to release the semaphores */
			/* TODO(rnp): figure out something more robust */
			os_w32_semaphore_release(os_w32_shared_memory_semaphores[it], 1);
		}
	}
	#endif

	BeamformerComputeContext *cs = &ctx->compute_context;

	/* NOTE: compute worker: gets its own 1x1 GLFW window (created with the main
	 * window handle as the last argument) and receives the whole BeamformerCtx */
	GLWorkerThreadContext *worker = &ctx->compute_worker;
	/* TODO(rnp): we should lock this down after we have something working */
	worker->user_context  = (iptr)ctx;
	worker->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
	worker->handle        = os_create_thread("[compute]", worker, compute_worker_thread_entry_point);

	/* NOTE: upload worker: same window setup, but it only receives the subset of
	 * state collected in BeamformerUploadThreadContext */
	GLWorkerThreadContext         *upload = &ctx->upload_worker;
	BeamformerUploadThreadContext *upctx  = push_struct(&memory, typeof(*upctx));
	upload->user_context        = (iptr)upctx;
	upctx->rf_buffer            = &cs->rf_buffer;
	upctx->shared_memory        = ctx->shared_memory;
	upctx->shared_memory_size   = ctx->shared_memory_size;
	upctx->compute_timing_table = ctx->compute_timing_table;
	upctx->compute_worker_sync  = &ctx->compute_worker.sync_variable;
	upload->window_handle       = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
	upload->handle              = os_create_thread("[upload]", upload, beamformer_upload_entry_point);

	/* NOTE: make the main window's context current on this thread again */
	glfwMakeContextCurrent(raylib_window_handle);

	/* NOTE: set up OpenGL debug logging */
	Stream *gl_error_stream = push_struct(&memory, Stream);
	*gl_error_stream        = stream_alloc(&memory, 1024);
	glDebugMessageCallback(gl_debug_logger, gl_error_stream);
#ifdef _DEBUG
	glEnable(GL_DEBUG_OUTPUT);
#endif

	/* NOTE: when shaders are not baked in, watch every reloadable compute shader file */
	if (!BakeShaders)
	{
		for EachElement(beamformer_reloadable_compute_shader_info_indices, it) {
			i32   index = beamformer_reloadable_compute_shader_info_indices[it];
			/* NOTE: `temp` is a copy so the path is built in scratch space
			 * without consuming `scratch` itself */
			Arena temp  = scratch;
			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
			                             beamformer_reloadable_shader_files[index]);
			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
			frc->kind                = BeamformerFileReloadKind_ComputeShader;
			frc->compute_shader_kind = beamformer_reloadable_shader_kinds[index];
			os_add_file_watch((char *)file.data, file.len, frc);
		}
	}

	/* NOTE: frame view render targets: two framebuffers plus multisampled
	 * colour (RGBA8) and depth (24 bit) renderbuffers */
	FrameViewRenderContext *fvr = &ctx->frame_view_render_context;
	glCreateFramebuffers(countof(fvr->framebuffers), fvr->framebuffers);
	LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[0], s8("Frame View Framebuffer"));
	LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[1], s8("Frame View Resolving Framebuffer"));

	glCreateRenderbuffers(countof(fvr->renderbuffers), fvr->renderbuffers);
	/* NOTE: 4 samples when the vendor was classified as ARM, 8 otherwise */
	i32 msaa_samples = gl_parameters.vendor_id == GLVendor_ARM? 4 : 8;
	glNamedRenderbufferStorageMultisample(fvr->renderbuffers[0], msaa_samples, GL_RGBA8,
	                                      FRAME_VIEW_RENDER_TARGET_SIZE);
	glNamedRenderbufferStorageMultisample(fvr->renderbuffers[1], msaa_samples, GL_DEPTH_COMPONENT24,
	                                      FRAME_VIEW_RENDER_TARGET_SIZE);

	static_assert(countof(beamformer_reloadable_render_shader_info_indices) == 1,
	              "only a single render shader is currently handled");
	i32 render_rsi_index = beamformer_reloadable_render_shader_info_indices[0];

	/* NOTE: fragment stage reload context; `header` holds the GLSL declarations
	 * stored alongside the reloadable shader */
	// TODO(rnp): leaks when BakeShaders is true
	Arena *arena = &memory;
	BeamformerShaderReloadContext *render_3d = push_struct(arena, typeof(*render_3d));
	render_3d->reloadable_info_index = render_rsi_index;
	render_3d->gl_type = GL_FRAGMENT_SHADER;
	render_3d->header  = s8(""
	"layout(location = 0) in  vec3 normal;\n"
	"layout(location = 1) in  vec3 texture_coordinate;\n\n"
	"layout(location = 2) in  vec3 test_texture_coordinate;\n\n"
	"layout(location = 0) out vec4 out_colour;\n\n"
	"layout(location = " str(FRAME_VIEW_DYNAMIC_RANGE_LOC) ") uniform float u_db_cutoff = 60;\n"
	"layout(location = " str(FRAME_VIEW_THRESHOLD_LOC)     ") uniform float u_threshold = 40;\n"
	"layout(location = " str(FRAME_VIEW_GAMMA_LOC)         ") uniform float u_gamma     = 1;\n"
	"layout(location = " str(FRAME_VIEW_LOG_SCALE_LOC)     ") uniform bool  u_log_scale;\n"
	"layout(location = " str(FRAME_VIEW_BB_COLOUR_LOC)     ") uniform vec4  u_bb_colour   = vec4(" str(FRAME_VIEW_BB_COLOUR) ");\n"
	"layout(location = " str(FRAME_VIEW_BB_FRACTION_LOC)   ") uniform float u_bb_fraction = " str(FRAME_VIEW_BB_FRACTION) ";\n"
	"layout(location = " str(FRAME_VIEW_SOLID_BB_LOC)      ") uniform bool  u_solid_bb;\n"
	"\n"
	"layout(binding = 0) uniform sampler3D u_texture;\n");

	/* NOTE: vertex stage; it has no reloadable file (index -1), its complete source
	 * is the header below. The two contexts link to each other. */
	render_3d->link = push_struct(arena, typeof(*render_3d));
	render_3d->link->reloadable_info_index = -1;
	render_3d->link->gl_type = GL_VERTEX_SHADER;
	render_3d->link->link    = render_3d;
	render_3d->link->header  = s8(""
	"layout(location = 0) in vec3 v_position;\n"
	"layout(location = 1) in vec3 v_normal;\n"
	"\n"
	"layout(location = 0) out vec3 f_normal;\n"
	"layout(location = 1) out vec3 f_texture_coordinate;\n"
	"layout(location = 2) out vec3 f_orig_texture_coordinate;\n"
	"\n"
	"layout(location = " str(FRAME_VIEW_MODEL_MATRIX_LOC)  ") uniform mat4  u_model;\n"
	"layout(location = " str(FRAME_VIEW_VIEW_MATRIX_LOC)   ") uniform mat4  u_view;\n"
	"layout(location = " str(FRAME_VIEW_PROJ_MATRIX_LOC)   ") uniform mat4  u_projection;\n"
	"\n"
	"\n"
	"void main()\n"
	"{\n"
	"\tvec3 pos = v_position;\n"
	"\tf_orig_texture_coordinate = (2 * v_position + 1) / 2;\n"
	//"\tif (v_position.y == -1) pos.x = clamp(v_position.x, -u_clip_fraction, u_clip_fraction);\n"
	"\tvec3 tex_coord = (2 * pos + 1) / 2;\n"
	"\tf_texture_coordinate = tex_coord;\n"
	//"\tf_texture_coordinate = u_swizzle? tex_coord.xzy : tex_coord;\n"
	//"\tf_normal    = normalize(mat3(u_model) * v_normal);\n"
	"\tf_normal    = v_normal;\n"
	"\tgl_Position = u_projection * u_view * u_model * vec4(pos, 1);\n"
	"}\n");

	// TODO(rnp): this is probably not expected by the platform, refactor so that all
	// needed context (eg. headers) are available outside of here and push initial load
	// into ui_init
	{
		/* NOTE: queue a synthetic file event for the render shader on the input's
		 * event queue, and watch its file when shaders are not baked in */
		BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
		frc->kind                  = BeamformerFileReloadKind_Shader;
		frc->shader_reload_context = render_3d;
		input->event_queue[input->event_count++] = (BeamformerInputEvent){
			.kind = BeamformerInputEventKind_FileEvent,
			.file_watch_user_context = frc,
		};

		s8 render_file = {0};
		if (!BakeShaders) {
			render_file = push_s8_from_parts(&scratch, os_path_separator(), s8("shaders"),
			                                 beamformer_reloadable_shader_files[render_rsi_index]);
			os_add_file_watch((char *)render_file.data, render_file.len, frc);
		}
	}

	/* NOTE: unit cube with 24 vertices: each of the 8 corners appears 3 times so
	 * that every copy can carry the normal of one of the adjacent faces */
	f32 unit_cube_vertices[] = {
		 0.5f,  0.5f, -0.5f,
		 0.5f,  0.5f, -0.5f,
		 0.5f,  0.5f, -0.5f,
		 0.5f, -0.5f, -0.5f,
		 0.5f, -0.5f, -0.5f,
		 0.5f, -0.5f, -0.5f,
		 0.5f,  0.5f,  0.5f,
		 0.5f,  0.5f,  0.5f,
		 0.5f,  0.5f,  0.5f,
		 0.5f, -0.5f,  0.5f,
		 0.5f, -0.5f,  0.5f,
		 0.5f, -0.5f,  0.5f,
		-0.5f,  0.5f, -0.5f,
		-0.5f,  0.5f, -0.5f,
		-0.5f,  0.5f, -0.5f,
		-0.5f, -0.5f, -0.5f,
		-0.5f, -0.5f, -0.5f,
		-0.5f, -0.5f, -0.5f,
		-0.5f,  0.5f,  0.5f,
		-0.5f,  0.5f,  0.5f,
		-0.5f,  0.5f,  0.5f,
		-0.5f, -0.5f,  0.5f,
		-0.5f, -0.5f,  0.5f,
		-0.5f, -0.5f,  0.5f
	};
	/* NOTE: one normal per vertex above, in the same order (z, y, x face per corner) */
	f32 unit_cube_normals[] = {
		 0.0f,  0.0f, -1.0f,
		 0.0f,  1.0f,  0.0f,
		 1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f, -1.0f,
		 0.0f, -1.0f,  0.0f,
		 1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f,  1.0f,
		 0.0f,  1.0f,  0.0f,
		 1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f,  1.0f,
		 0.0f, -1.0f,  0.0f,
		 1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f, -1.0f,
		 0.0f,  1.0f,  0.0f,
		-1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f, -1.0f,
		 0.0f, -1.0f,  0.0f,
		-1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f,  1.0f,
		 0.0f,  1.0f,  0.0f,
		-1.0f,  0.0f,  0.0f,
		 0.0f,  0.0f,  1.0f,
		 0.0f, -1.0f,  0.0f,
		-1.0f,  0.0f,  0.0f
	};
	/* NOTE: 12 triangles (two per face), 36 indices */
	u16 unit_cube_indices[] = {
		1,  13, 19,
		1,  19, 7,
		9,  6,  18,
		9,  18, 21,
		23, 20, 14,
		23, 14, 17,
		16, 4,  10,
		16, 10, 22,
		5,  2,  8,
		5,  8,  11,
		15, 12, 0,
		15, 0,  3
	};

	/* NOTE: the size argument is the byte size of the vertex array */
	cs->unit_cube_model = render_model_from_arrays(unit_cube_vertices, unit_cube_normals,
	                                               sizeof(unit_cube_vertices),
	                                               unit_cube_indices, countof(unit_cube_indices));

	/* NOTE: return the scratch space; what is left of `memory` becomes the context arena */
	memory.end = scratch.end;
	ctx->arena = memory;
	ctx->state = BeamformerState_Running;
}
    565 
    566 BEAMFORMER_EXPORT void
    567 beamformer_terminate(BeamformerInput *input)
    568 {
    569 	/* NOTE(rnp): work around pebkac when the beamformer is closed while we are doing live
    570 	 * imaging. if the verasonics is blocked in an external function (calling the library
    571 	 * to start compute) it is impossible for us to get it to properly shut down which
    572 	 * will sometimes result in us needing to power cycle the system. set the shared memory
    573 	 * into an error state and release dispatch lock so that future calls will error instead
    574 	 * of blocking.
    575 	 */
    576 	BeamformerCtx *          ctx = BeamformerContextMemory(input->memory);
    577 	BeamformerSharedMemory * sm  = input->shared_memory;
    578 	if (ctx->state != BeamformerState_Terminated) {
    579 		if (sm) {
    580 			BeamformerSharedMemoryLockKind lock = BeamformerSharedMemoryLockKind_DispatchCompute;
    581 			atomic_store_u32(&sm->invalid, 1);
    582 			atomic_store_u32(&sm->external_work_queue.ridx, sm->external_work_queue.widx);
    583 			DEBUG_DECL(if (sm->locks[lock])) {
    584 				beamformer_shared_memory_release_lock(sm, (i32)lock);
    585 			}
    586 
    587 			atomic_or_u32(&sm->live_imaging_dirty_flags, BeamformerLiveImagingDirtyFlags_StopImaging);
    588 		}
    589 
    590 		beamformer_debug_ui_deinit(ctx);
    591 
    592 		ctx->state = BeamformerState_Terminated;
    593 	}
    594 }
    595 
    596 BEAMFORMER_EXPORT u32
    597 beamformer_should_close(BeamformerInput *input)
    598 {
    599 	BeamformerCtx * ctx = BeamformerContextMemory(input->memory);
    600 	if (ctx->state == BeamformerState_ShouldClose)
    601 		beamformer_terminate(input);
    602 	return ctx->state == BeamformerState_Terminated;
    603 }