/* Copyright (C) 2008-2014 Free Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Under Section 7 of GPL version 3, you are granted additional permissions described in the GCC Runtime Library Exception, version 3.1, as published by the Free Software Foundation. You should have received a copy of the GNU General Public License and a copy of the GCC Runtime Library Exception along with this program; see the files COPYING3 and COPYING.RUNTIME respectively. If not, see . */ #include typedef int TItype __attribute__ ((mode (TI))); union qword_TItype { qword q; TItype t; }; inline static qword si_from_TItype (TItype t) { union qword_TItype u; u.t = t; return u.q; } inline static TItype si_to_TItype (qword q) { union qword_TItype u; u.q = q; return u.t; } /* A straight forward vectorization and unrolling of * short l[8], r[8]; * TItype total = 0; * for (i = 0; i < 8; i++) * for (j = 0; j < 8; j++) * total += (TItype)((l[7-i] * r[7-j]) << (16 * (i + j))); */ TItype __multi3 (TItype l, TItype r) { qword u = si_from_TItype (l); qword v = si_from_TItype (r); qword splat0 = si_shufb (v, v, si_ilh (0x0001)); qword splat1 = si_shufb (v, v, si_ilh (0x0203)); qword splat2 = si_shufb (v, v, si_ilh (0x0405)); qword splat3 = si_shufb (v, v, si_ilh (0x0607)); qword splat4 = si_shufb (v, v, si_ilh (0x0809)); qword splat5 = si_shufb (v, v, si_ilh (0x0a0b)); qword splat6 = si_shufb (v, v, si_ilh (0x0c0d)); qword splat7 = si_shufb (v, v, si_ilh (0x0e0f)); qword part0l = si_shlqbyi (si_mpyu (u, splat0), 14); qword part1h = si_shlqbyi (si_mpyhhu (u, splat1), 14); qword part1l = si_shlqbyi (si_mpyu (u, splat1), 12); qword part2h = si_shlqbyi (si_mpyhhu (u, splat2), 12); qword part2l = si_shlqbyi (si_mpyu (u, splat2), 10); qword part3h = si_shlqbyi (si_mpyhhu (u, splat3), 10); qword part3l = si_shlqbyi (si_mpyu (u, splat3), 8); qword part4h = si_shlqbyi (si_mpyhhu (u, splat4), 8); qword part4l = si_shlqbyi (si_mpyu (u, splat4), 6); qword part5h = si_shlqbyi (si_mpyhhu (u, splat5), 6); qword part5l = si_shlqbyi (si_mpyu (u, splat5), 4); qword part6h = si_shlqbyi (si_mpyhhu (u, splat6), 4); qword part6l = si_shlqbyi (si_mpyu (u, splat6), 2); qword part7h = si_shlqbyi (si_mpyhhu (u, splat7), 2); qword part7l = si_mpyu (u, splat7); qword carry, total0, total1, total2, total3, total4; qword total5, total6, total7, total8, total9, total10; qword total; total0 = si_a (si_a (si_a (part0l, part1h), si_a (part1l, part2h)), part7l); total1 = si_a (part2l, part3h); total2 = si_a (part3l, part4h); total3 = si_a (part4l, part5h); total4 = si_a (part5l, part6h); total5 = si_a (part6l, part7h); total6 = si_a (total0, total1); total7 = si_a (total2, total3); total8 = si_a (total4, total5); total9 = si_a (total6, total7); total10 = si_a (total8, total9); carry = si_cg (part2l, part3h); carry = si_a (carry, si_cg (part3l, part4h)); carry = si_a (carry, si_cg (part4l, part5h)); carry = si_a (carry, si_cg (part5l, part6h)); carry = si_a (carry, si_cg (part6l, part7h)); carry = si_a (carry, si_cg (total0, total1)); carry = si_a (carry, si_cg (total2, total3)); carry = si_a (carry, si_cg (total4, total5)); carry = si_a (carry, si_cg (total6, total7)); carry = si_a (carry, si_cg (total8, total9)); carry = si_shlqbyi (carry, 4); total = si_cg (total10, carry); total = si_shlqbyi (total, 4); total = si_cgx (total10, carry, total); total = si_shlqbyi (total, 4); total = si_addx (total10, carry, total); return si_to_TItype (total); }