From 21174dedec43df155309653764d76b4acd4d5f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 16 Sep 2020 21:48:18 -0400 Subject: nir: split fuse_ffma into fuse_ffma16/32/64 AMD wants different behavior for each bit size Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/compiler/nir/nir.h | 4 +++- src/compiler/nir/nir_opt_algebraic.py | 8 ++++++-- src/freedreno/ir3/ir3_nir.c | 8 ++++++-- src/gallium/drivers/etnaviv/etnaviv_screen.c | 4 +++- src/gallium/drivers/freedreno/a2xx/ir2_nir.c | 4 +++- src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp | 4 +++- src/gallium/drivers/nouveau/nv50/nv50_screen.c | 4 +++- src/gallium/drivers/r600/r600_pipe_common.c | 8 ++++++-- src/gallium/drivers/radeonsi/si_get.c | 4 +++- src/panfrost/bifrost/bifrost_compile.h | 4 +++- 10 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index d3ba1bf0f00..d048cb35b5d 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3056,7 +3056,9 @@ typedef enum { typedef struct nir_shader_compiler_options { bool lower_fdiv; bool lower_ffma; - bool fuse_ffma; + bool fuse_ffma16; + bool fuse_ffma32; + bool fuse_ffma64; bool lower_flrp16; bool lower_flrp32; /** Lowers flrp when it does not support doubles */ diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 39c07ce0b7f..a03cc549e5f 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -195,7 +195,9 @@ optimizations.extend([ (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'), (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late). - (('~ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma'), + (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'), + (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'), + (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'), (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'), ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))), @@ -2028,7 +2030,9 @@ late_optimizations = [ (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'), (('ineg', a), ('isub', 0, a), 'options->lower_negate'), (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), - (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'), + (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'), + (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'), + (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'), # These are duplicated from the main optimizations table. The late # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 308d42542f2..f33f048a505 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -47,7 +47,9 @@ static const nir_shader_compiler_options options = { .lower_usub_borrow = true, .lower_mul_high = true, .lower_mul_2x32_64 = true, - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, .vertex_id_zero_based = true, .lower_extract_byte = true, .lower_extract_word = true, @@ -97,7 +99,9 @@ static const nir_shader_compiler_options options_a6xx = { .lower_usub_borrow = true, .lower_mul_high = true, .lower_mul_2x32_64 = true, - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, .vertex_id_zero_based = false, .lower_extract_byte = true, .lower_extract_word = true, diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c index 1637eaa3f5f..3cc7dcec148 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -1004,7 +1004,9 @@ etna_screen_create(struct etna_device *dev, struct etna_gpu *gpu, .lower_fpow = true, .lower_sub = true, .lower_ftrunc = true, - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, .lower_bitops = true, .lower_all_io_to_temps = true, .vertex_id_zero_based = true, diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 58e99f0943b..6cf95d5f4e3 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -35,7 +35,9 @@ static const nir_shader_compiler_options options = { .lower_fmod = true, .lower_fdiv = true, .lower_fceil = true, - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, /* .fdot_replicates = true, it is replicated, but it makes things worse */ .lower_all_io_to_temps = true, .vertex_id_zero_based = true, /* its not implemented anyway */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp index c466d249e58..c5e54779ad7 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp @@ -3207,7 +3207,9 @@ nvir_nir_shader_compiler_options(int chipset) nir_shader_compiler_options op = {}; op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET); op.lower_ffma = false; - op.fuse_ffma = false; /* nir doesn't track mad vs fma */ + op.fuse_ffma16 = false; /* nir doesn't track mad vs fma */ + op.fuse_ffma32 = false; /* nir doesn't track mad vs fma */ + op.fuse_ffma64 = false; /* nir doesn't track mad vs fma */ op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET); op.lower_flrp32 = true; op.lower_flrp64 = true; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 06f73ab6be2..29201eee649 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -923,7 +923,9 @@ int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space) } static const nir_shader_compiler_options nir_options = { - .fuse_ffma = false, /* nir doesn't track mad vs fma */ + .fuse_ffma16 = false, /* nir doesn't track mad vs fma */ + .fuse_ffma32 = false, /* nir doesn't track mad vs fma */ + .fuse_ffma64 = false, /* nir doesn't track mad vs fma */ .lower_flrp32 = true, .lower_flrp64 = true, .lower_fpow = false, diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c index b4dffa76a11..eab7ce91dd7 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.c +++ b/src/gallium/drivers/r600/r600_pipe_common.c @@ -1179,7 +1179,9 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, } const struct nir_shader_compiler_options r600_nir_fs_options = { - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, .lower_scmp = true, .lower_flrp32 = true, .lower_flrp64 = true, @@ -1203,7 +1205,9 @@ const struct nir_shader_compiler_options r600_nir_fs_options = { }; const struct nir_shader_compiler_options r600_nir_options = { - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, .lower_scmp = true, .lower_flrp32 = true, .lower_flrp64 = true, diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index a511bab4f0d..1f19fdd8817 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -946,7 +946,9 @@ void si_init_screen_get_functions(struct si_screen *sscreen) * for gfx10.3 on gfx10. */ .lower_ffma = sscreen->info.chip_class <= GFX9, - .fuse_ffma = sscreen->info.chip_class >= GFX10, + .fuse_ffma16 = sscreen->info.chip_class >= GFX10, + .fuse_ffma32 = sscreen->info.chip_class >= GFX10, + .fuse_ffma64 = sscreen->info.chip_class >= GFX10, .lower_fmod = true, .lower_pack_snorm_4x8 = true, .lower_pack_unorm_4x8 = true, diff --git a/src/panfrost/bifrost/bifrost_compile.h b/src/panfrost/bifrost/bifrost_compile.h index cc28b42c3af..15b90788133 100644 --- a/src/panfrost/bifrost/bifrost_compile.h +++ b/src/panfrost/bifrost/bifrost_compile.h @@ -69,7 +69,9 @@ static const nir_shader_compiler_options bifrost_nir_options = { .lower_bitfield_extract_to_shifts = true, .vectorize_io = true, - .fuse_ffma = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, .use_interpolated_input_intrinsics = true }; -- cgit v1.2.3