aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMarek Olšák <marek.olsak@amd.com>2020-09-21 07:33:59 -0400
committerMarge Bot <eric+marge@anholt.net>2020-11-12 21:02:05 +0000
commit96c12b7dc20d05dff94a947851f08d9ccbfb72ad (patch)
tree347634842cca1dd71c22225a60065ddc9505c91d
parent99e17b0c4adb81f93ba9b98b754ac71f6f334c3c (diff)
downloadexternal_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.gz
external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.bz2
external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.zip
nir: optionally shuffle local invocation IDs for compute quad derivatives
Used by radeonsi. local_invocation_index is lowered only when quad derivatives are enabled. Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7586>
-rw-r--r--src/compiler/nir/nir.h2
-rw-r--r--src/compiler/nir/nir_lower_system_values.c121
2 files changed, 113 insertions, 10 deletions
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 9365c163773..cc87d218ed3 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4653,6 +4653,8 @@ bool nir_lower_system_values(nir_shader *shader);
typedef struct nir_lower_compute_system_values_options {
bool has_base_global_invocation_id:1;
bool has_base_work_group_id:1;
+ bool shuffle_local_ids_for_quad_derivatives:1;
+ bool lower_local_invocation_index:1;
} nir_lower_compute_system_values_options;
bool nir_lower_compute_system_values(nir_shader *shader,
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index c4295d52bce..6d5d9d59617 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -27,6 +27,17 @@
#include "nir.h"
#include "nir_builder.h"
+#include "util/u_math.h"
+#include "util/set.h"
+
+struct lower_sysval_state {
+ const nir_lower_compute_system_values_options *options;
+
+ /* List of intrinsics that have already been lowered and shouldn't be
+ * lowered again.
+ */
+ struct set *lower_once_list;
+};
static nir_ssa_def *
sanitize_32bit_sysval(nir_builder *b, nir_intrinsic_instr *intrin)
@@ -239,17 +250,18 @@ nir_lower_system_values(nir_shader *shader)
}
static bool
-lower_compute_system_value_filter(const nir_instr *instr, const void *_options)
+lower_compute_system_value_filter(const nir_instr *instr, const void *_state)
{
return instr->type == nir_instr_type_intrinsic;
}
static nir_ssa_def *
lower_compute_system_value_instr(nir_builder *b,
- nir_instr *instr, void *_options)
+ nir_instr *instr, void *_state)
{
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- const nir_lower_compute_system_values_options *options = _options;
+ struct lower_sysval_state *state = (struct lower_sysval_state *)_state;
+ const nir_lower_compute_system_values_options *options = state->options;
/* All the intrinsics we care about are loads */
if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
@@ -298,15 +310,91 @@ lower_compute_system_value_instr(nir_builder *b,
nir_imul(b, nir_channel(b, local_size, 0),
nir_channel(b, local_size, 1)));
return nir_u2u(b, nir_vec3(b, id_x, id_y, id_z), bit_size);
- } else {
- return NULL;
}
+ if (options && options->shuffle_local_ids_for_quad_derivatives &&
+ b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+ _mesa_set_search(state->lower_once_list, instr) == NULL) {
+ nir_ssa_def *ids = nir_load_local_invocation_id(b);
+ _mesa_set_add(state->lower_once_list, ids->parent_instr);
+
+ nir_ssa_def *x = nir_channel(b, ids, 0);
+ nir_ssa_def *y = nir_channel(b, ids, 1);
+ nir_ssa_def *z = nir_channel(b, ids, 2);
+ unsigned size_x = b->shader->info.cs.local_size[0];
+ nir_ssa_def *size_x_imm;
+
+ if (b->shader->info.cs.local_size_variable)
+ size_x_imm = nir_channel(b, nir_load_local_group_size(b), 0);
+ else
+ size_x_imm = nir_imm_int(b, size_x);
+
+ /* Remap indices from:
+ * | 0| 1| 2| 3|
+ * | 4| 5| 6| 7|
+ * | 8| 9|10|11|
+ * |12|13|14|15|
+ * to:
+ * | 0| 1| 4| 5|
+ * | 2| 3| 6| 7|
+ * | 8| 9|12|13|
+ * |10|11|14|15|
+ *
+ * That's the layout required by AMD hardware for derivatives to
+ * work. Other hardware may work differently.
+ *
+ * It's a classic tiling pattern that can be implemented by inserting
+ * bit y[0] between bits x[0] and x[1] like this:
+ *
+ * x[0],y[0],x[1],...x[last],y[1],...,y[last]
+ *
+ * If the width is a power of two, use:
+ * i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) | ((y & ~1) << logbase2(size_x))
+ *
+ * If the width is not a power of two or the local size is variable, use:
+ * i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) + ((y & ~1) * size_x)
+ *
+ * GL_NV_compute_shader_derivatives requires that the width and height
+ * are a multiple of two, which is also a requirement for the second
+ * expression to work.
+ *
+ * The 2D result is: (x,y) = (i % w, i / w)
+ */
+
+ nir_ssa_def *one = nir_imm_int(b, 1);
+ nir_ssa_def *inv_one = nir_imm_int(b, ~1);
+ nir_ssa_def *x_bit0 = nir_iand(b, x, one);
+ nir_ssa_def *y_bit0 = nir_iand(b, y, one);
+ nir_ssa_def *x_bits_1n = nir_iand(b, x, inv_one);
+ nir_ssa_def *y_bits_1n = nir_iand(b, y, inv_one);
+ nir_ssa_def *bits_01 = nir_ior(b, x_bit0, nir_ishl(b, y_bit0, one));
+ nir_ssa_def *bits_01x = nir_ior(b, bits_01,
+ nir_ishl(b, x_bits_1n, one));
+ nir_ssa_def *i;
+
+ if (!b->shader->info.cs.local_size_variable &&
+ util_is_power_of_two_nonzero(size_x)) {
+ nir_ssa_def *log2_size_x = nir_imm_int(b, util_logbase2(size_x));
+ i = nir_ior(b, bits_01x, nir_ishl(b, y_bits_1n, log2_size_x));
+ } else {
+ i = nir_iadd(b, bits_01x, nir_imul(b, y_bits_1n, size_x_imm));
+ }
+
+ /* This should be fast if size_x is an immediate or even a power
+ * of two.
+ */
+ x = nir_umod(b, i, size_x_imm);
+ y = nir_udiv(b, i, size_x_imm);
+
+ return nir_vec3(b, x, y, z);
+ }
+ return NULL;
case nir_intrinsic_load_local_invocation_index:
/* If lower_cs_local_index_from_id is true, then we derive the local
* index from the local id.
*/
- if (b->shader->options->lower_cs_local_index_from_id) {
+ if (b->shader->options->lower_cs_local_index_from_id ||
+ (options && options->lower_local_invocation_index)) {
/* From the GLSL man page for gl_LocalInvocationIndex:
*
* "The value of gl_LocalInvocationIndex is equal to
@@ -418,8 +506,21 @@ nir_lower_compute_system_values(nir_shader *shader,
shader->info.stage != MESA_SHADER_KERNEL)
return false;
- return nir_shader_lower_instructions(shader,
- lower_compute_system_value_filter,
- lower_compute_system_value_instr,
- (void*)options);
+ struct lower_sysval_state state;
+ state.options = options;
+ state.lower_once_list = _mesa_pointer_set_create(NULL);
+
+ bool progress =
+ nir_shader_lower_instructions(shader,
+ lower_compute_system_value_filter,
+ lower_compute_system_value_instr,
+ (void*)&state);
+ ralloc_free(state.lower_once_list);
+
+ /* Update this so as not to lower it again. */
+ if (options && options->shuffle_local_ids_for_quad_derivatives &&
+ shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS)
+ shader->info.cs.derivative_group = DERIVATIVE_GROUP_LINEAR;
+
+ return progress;
}