nir: optionally shuffle local invocation IDs for compute quad derivatives

Used by radeonsi. local_invocation_index is lowered only when quad derivatives are enabled. Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7586>
author: Marek Olšák <marek.olsak@amd.com> 2020-09-21 07:33:59 -0400
committer: Marge Bot <eric+marge@anholt.net> 2020-11-12 21:02:05 +0000
commit: 96c12b7dc20d05dff94a947851f08d9ccbfb72ad (patch)
tree: 347634842cca1dd71c22225a60065ddc9505c91d
parent: 99e17b0c4adb81f93ba9b98b754ac71f6f334c3c (diff)
download: external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.gz
external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.bz2
external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.zip
2 files changed, 113 insertions, 10 deletions
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 9365c163773..cc87d218ed3 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4653,6 +4653,8 @@ bool nir_lower_system_values(nir_shader *shader);
 typedef struct nir_lower_compute_system_values_options {
    bool has_base_global_invocation_id:1;
    bool has_base_work_group_id:1;
+   bool shuffle_local_ids_for_quad_derivatives:1;
+   bool lower_local_invocation_index:1;
 } nir_lower_compute_system_values_options;
 
 bool nir_lower_compute_system_values(nir_shader *shader,
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index c4295d52bce..6d5d9d59617 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -27,6 +27,17 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "util/u_math.h"
+#include "util/set.h"
+
+struct lower_sysval_state {
+   const nir_lower_compute_system_values_options *options;
+
+   /* List of intrinsics that have already been lowered and shouldn't be
+    * lowered again.
+    */
+   struct set *lower_once_list;
+};
 
 static nir_ssa_def *
 sanitize_32bit_sysval(nir_builder *b, nir_intrinsic_instr *intrin)
@@ -239,17 +250,18 @@ nir_lower_system_values(nir_shader *shader)
 }
 
 static bool
-lower_compute_system_value_filter(const nir_instr *instr, const void *_options)
+lower_compute_system_value_filter(const nir_instr *instr, const void *_state)
 {
    return instr->type == nir_instr_type_intrinsic;
 }
 
 static nir_ssa_def *
 lower_compute_system_value_instr(nir_builder *b,
-                                 nir_instr *instr, void *_options)
+                                 nir_instr *instr, void *_state)
 {
    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-   const nir_lower_compute_system_values_options *options = _options;
+   struct lower_sysval_state *state = (struct lower_sysval_state *)_state;
+   const nir_lower_compute_system_values_options *options = state->options;
 
    /* All the intrinsics we care about are loads */
    if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
@@ -298,15 +310,91 @@ lower_compute_system_value_instr(nir_builder *b,
                             nir_imul(b, nir_channel(b, local_size, 0),
                                         nir_channel(b, local_size, 1)));
          return nir_u2u(b, nir_vec3(b, id_x, id_y, id_z), bit_size);
-      } else {
-         return NULL;
       }
+      if (options && options->shuffle_local_ids_for_quad_derivatives &&
+          b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+          _mesa_set_search(state->lower_once_list, instr) == NULL) {
+         nir_ssa_def *ids = nir_load_local_invocation_id(b);
+         _mesa_set_add(state->lower_once_list, ids->parent_instr);
+
+         nir_ssa_def *x = nir_channel(b, ids, 0);
+         nir_ssa_def *y = nir_channel(b, ids, 1);
+         nir_ssa_def *z = nir_channel(b, ids, 2);
+         unsigned size_x = b->shader->info.cs.local_size[0];
+         nir_ssa_def *size_x_imm;
+
+         if (b->shader->info.cs.local_size_variable)
+            size_x_imm = nir_channel(b, nir_load_local_group_size(b), 0);
+         else
+            size_x_imm = nir_imm_int(b, size_x);
+
+         /* Remap indices from:
+          *    | 0| 1| 2| 3|
+          *    | 4| 5| 6| 7|
+          *    | 8| 9|10|11|
+          *    |12|13|14|15|
+          * to:
+          *    | 0| 1| 4| 5|
+          *    | 2| 3| 6| 7|
+          *    | 8| 9|12|13|
+          *    |10|11|14|15|
+          *
+          * That's the layout required by AMD hardware for derivatives to
+          * work. Other hardware may work differently.
+          *
+          * It's a classic tiling pattern that can be implemented by inserting
+          * bit y[0] between bits x[0] and x[1] like this:
+          *
+          *    x[0],y[0],x[1],...x[last],y[1],...,y[last]
+          *
+          * If the width is a power of two, use:
+          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) | ((y & ~1) << logbase2(size_x))
+          *
+          * If the width is not a power of two or the local size is variable, use:
+          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) + ((y & ~1) * size_x)
+          *
+          * GL_NV_compute_shader_derivatives requires that the width and height
+          * are a multiple of two, which is also a requirement for the second
+          * expression to work.
+          *
+          * The 2D result is: (x,y) = (i % w, i / w)
+          */
+
+         nir_ssa_def *one = nir_imm_int(b, 1);
+         nir_ssa_def *inv_one = nir_imm_int(b, ~1);
+         nir_ssa_def *x_bit0 = nir_iand(b, x, one);
+         nir_ssa_def *y_bit0 = nir_iand(b, y, one);
+         nir_ssa_def *x_bits_1n = nir_iand(b, x, inv_one);
+         nir_ssa_def *y_bits_1n = nir_iand(b, y, inv_one);
+         nir_ssa_def *bits_01 = nir_ior(b, x_bit0, nir_ishl(b, y_bit0, one));
+         nir_ssa_def *bits_01x = nir_ior(b, bits_01,
+                                         nir_ishl(b, x_bits_1n, one));
+         nir_ssa_def *i;
+
+         if (!b->shader->info.cs.local_size_variable &&
+             util_is_power_of_two_nonzero(size_x)) {
+            nir_ssa_def *log2_size_x = nir_imm_int(b, util_logbase2(size_x));
+            i = nir_ior(b, bits_01x, nir_ishl(b, y_bits_1n, log2_size_x));
+         } else {
+            i = nir_iadd(b, bits_01x, nir_imul(b, y_bits_1n, size_x_imm));
+         }
+
+         /* This should be fast if size_x is an immediate or even a power
+          * of two.
+          */
+         x = nir_umod(b, i, size_x_imm);
+         y = nir_udiv(b, i, size_x_imm);
+
+         return nir_vec3(b, x, y, z);
+      }
+      return NULL;
 
    case nir_intrinsic_load_local_invocation_index:
       /* If lower_cs_local_index_from_id is true, then we derive the local
        * index from the local id.
        */
-      if (b->shader->options->lower_cs_local_index_from_id) {
+      if (b->shader->options->lower_cs_local_index_from_id ||
+          (options && options->lower_local_invocation_index)) {
          /* From the GLSL man page for gl_LocalInvocationIndex:
           *
           *    "The value of gl_LocalInvocationIndex is equal to
@@ -418,8 +506,21 @@ nir_lower_compute_system_values(nir_shader *shader,
        shader->info.stage != MESA_SHADER_KERNEL)
       return false;
 
-   return nir_shader_lower_instructions(shader,
-                                        lower_compute_system_value_filter,
-                                        lower_compute_system_value_instr,
-                                        (void*)options);
+   struct lower_sysval_state state;
+   state.options = options;
+   state.lower_once_list = _mesa_pointer_set_create(NULL);
+
+   bool progress =
+      nir_shader_lower_instructions(shader,
+                                    lower_compute_system_value_filter,
+                                    lower_compute_system_value_instr,
+                                    (void*)&state);
+   ralloc_free(state.lower_once_list);
+
+   /* Update this so as not to lower it again. */
+   if (options && options->shuffle_local_ids_for_quad_derivatives &&
+       shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS)
+      shader->info.cs.derivative_group = DERIVATIVE_GROUP_LINEAR;
+
+   return progress;
 }
author	Marek Olšák <marek.olsak@amd.com>	2020-09-21 07:33:59 -0400
committer	Marge Bot <eric+marge@anholt.net>	2020-11-12 21:02:05 +0000
commit	96c12b7dc20d05dff94a947851f08d9ccbfb72ad (patch)
tree	347634842cca1dd71c22225a60065ddc9505c91d
parent	99e17b0c4adb81f93ba9b98b754ac71f6f334c3c (diff)
download	external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.gz external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.tar.bz2 external_mesa3d-96c12b7dc20d05dff94a947851f08d9ccbfb72ad.zip