diff options
author | Marek Olšák <marek.olsak@amd.com> | 2020-09-21 07:33:59 -0400 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2020-11-12 21:02:05 +0000 |
commit | 96c12b7dc20d05dff94a947851f08d9ccbfb72ad (patch) | |
tree | 347634842cca1dd71c22225a60065ddc9505c91d | |
parent | 99e17b0c4adb81f93ba9b98b754ac71f6f334c3c (diff) | |
download | external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.gz external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.bz2 external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.zip |
nir: optionally shuffle local invocation IDs for compute quad derivatives
Used by radeonsi. local_invocation_index is lowered only when quad
derivatives are enabled.
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7586>
-rw-r--r-- | src/compiler/nir/nir.h | 2 | ||||
-rw-r--r-- | src/compiler/nir/nir_lower_system_values.c | 121 |
2 files changed, 113 insertions, 10 deletions
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 9365c163773..cc87d218ed3 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -4653,6 +4653,8 @@ bool nir_lower_system_values(nir_shader *shader); typedef struct nir_lower_compute_system_values_options { bool has_base_global_invocation_id:1; bool has_base_work_group_id:1; + bool shuffle_local_ids_for_quad_derivatives:1; + bool lower_local_invocation_index:1; } nir_lower_compute_system_values_options; bool nir_lower_compute_system_values(nir_shader *shader, diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c index c4295d52bce..6d5d9d59617 100644 --- a/src/compiler/nir/nir_lower_system_values.c +++ b/src/compiler/nir/nir_lower_system_values.c @@ -27,6 +27,17 @@ #include "nir.h" #include "nir_builder.h" +#include "util/u_math.h" +#include "util/set.h" + +struct lower_sysval_state { + const nir_lower_compute_system_values_options *options; + + /* List of intrinsics that have already been lowered and shouldn't be + * lowered again. + */ + struct set *lower_once_list; +}; static nir_ssa_def * sanitize_32bit_sysval(nir_builder *b, nir_intrinsic_instr *intrin) @@ -239,17 +250,18 @@ nir_lower_system_values(nir_shader *shader) } static bool -lower_compute_system_value_filter(const nir_instr *instr, const void *_options) +lower_compute_system_value_filter(const nir_instr *instr, const void *_state) { return instr->type == nir_instr_type_intrinsic; } static nir_ssa_def * lower_compute_system_value_instr(nir_builder *b, - nir_instr *instr, void *_options) + nir_instr *instr, void *_state) { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - const nir_lower_compute_system_values_options *options = _options; + struct lower_sysval_state *state = (struct lower_sysval_state *)_state; + const nir_lower_compute_system_values_options *options = state->options; /* All the intrinsics we care about are loads */ if (!nir_intrinsic_infos[intrin->intrinsic].has_dest) @@ -298,15 +310,91 @@ lower_compute_system_value_instr(nir_builder *b, nir_imul(b, nir_channel(b, local_size, 0), nir_channel(b, local_size, 1))); return nir_u2u(b, nir_vec3(b, id_x, id_y, id_z), bit_size); - } else { - return NULL; } + if (options && options->shuffle_local_ids_for_quad_derivatives && + b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS && + _mesa_set_search(state->lower_once_list, instr) == NULL) { + nir_ssa_def *ids = nir_load_local_invocation_id(b); + _mesa_set_add(state->lower_once_list, ids->parent_instr); + + nir_ssa_def *x = nir_channel(b, ids, 0); + nir_ssa_def *y = nir_channel(b, ids, 1); + nir_ssa_def *z = nir_channel(b, ids, 2); + unsigned size_x = b->shader->info.cs.local_size[0]; + nir_ssa_def *size_x_imm; + + if (b->shader->info.cs.local_size_variable) + size_x_imm = nir_channel(b, nir_load_local_group_size(b), 0); + else + size_x_imm = nir_imm_int(b, size_x); + + /* Remap indices from: + * | 0| 1| 2| 3| + * | 4| 5| 6| 7| + * | 8| 9|10|11| + * |12|13|14|15| + * to: + * | 0| 1| 4| 5| + * | 2| 3| 6| 7| + * | 8| 9|12|13| + * |10|11|14|15| + * + * That's the layout required by AMD hardware for derivatives to + * work. Other hardware may work differently. + * + * It's a classic tiling pattern that can be implemented by inserting + * bit y[0] between bits x[0] and x[1] like this: + * + * x[0],y[0],x[1],...x[last],y[1],...,y[last] + * + * If the width is a power of two, use: + * i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) | ((y & ~1) << logbase2(size_x)) + * + * If the width is not a power of two or the local size is variable, use: + * i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) + ((y & ~1) * size_x) + * + * GL_NV_compute_shader_derivatives requires that the width and height + * are a multiple of two, which is also a requirement for the second + * expression to work. + * + * The 2D result is: (x,y) = (i % w, i / w) + */ + + nir_ssa_def *one = nir_imm_int(b, 1); + nir_ssa_def *inv_one = nir_imm_int(b, ~1); + nir_ssa_def *x_bit0 = nir_iand(b, x, one); + nir_ssa_def *y_bit0 = nir_iand(b, y, one); + nir_ssa_def *x_bits_1n = nir_iand(b, x, inv_one); + nir_ssa_def *y_bits_1n = nir_iand(b, y, inv_one); + nir_ssa_def *bits_01 = nir_ior(b, x_bit0, nir_ishl(b, y_bit0, one)); + nir_ssa_def *bits_01x = nir_ior(b, bits_01, + nir_ishl(b, x_bits_1n, one)); + nir_ssa_def *i; + + if (!b->shader->info.cs.local_size_variable && + util_is_power_of_two_nonzero(size_x)) { + nir_ssa_def *log2_size_x = nir_imm_int(b, util_logbase2(size_x)); + i = nir_ior(b, bits_01x, nir_ishl(b, y_bits_1n, log2_size_x)); + } else { + i = nir_iadd(b, bits_01x, nir_imul(b, y_bits_1n, size_x_imm)); + } + + /* This should be fast if size_x is an immediate or even a power + * of two. + */ + x = nir_umod(b, i, size_x_imm); + y = nir_udiv(b, i, size_x_imm); + + return nir_vec3(b, x, y, z); + } + return NULL; case nir_intrinsic_load_local_invocation_index: /* If lower_cs_local_index_from_id is true, then we derive the local * index from the local id. */ - if (b->shader->options->lower_cs_local_index_from_id) { + if (b->shader->options->lower_cs_local_index_from_id || + (options && options->lower_local_invocation_index)) { /* From the GLSL man page for gl_LocalInvocationIndex: * * "The value of gl_LocalInvocationIndex is equal to @@ -418,8 +506,21 @@ nir_lower_compute_system_values(nir_shader *shader, shader->info.stage != MESA_SHADER_KERNEL) return false; - return nir_shader_lower_instructions(shader, - lower_compute_system_value_filter, - lower_compute_system_value_instr, - (void*)options); + struct lower_sysval_state state; + state.options = options; + state.lower_once_list = _mesa_pointer_set_create(NULL); + + bool progress = + nir_shader_lower_instructions(shader, + lower_compute_system_value_filter, + lower_compute_system_value_instr, + (void*)&state); + ralloc_free(state.lower_once_list); + + /* Update this so as not to lower it again. */ + if (options && options->shuffle_local_ids_for_quad_derivatives && + shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) + shader->info.cs.derivative_group = DERIVATIVE_GROUP_LINEAR; + + return progress; } |