Skip to content

Commit fb9b946

Browse files
committed
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64
Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64}
and make it intrinsic.

Benchmark results on loongson 3A5000 and 3A6000 machines:

goos: linux
goarch: loong64
pkg: math/bits
cpu: Loongson-3A5000-HV @ 2500.00MHz
            |  bench.old  |             bench.new              |
            |   sec/op    |   sec/op     vs base               |
OnesCount     4.413n ± 0%   1.401n ± 0%  -68.25% (p=0.000 n=10)
OnesCount8    1.364n ± 0%   1.363n ± 0%        ~ (p=0.130 n=10)
OnesCount16   2.112n ± 0%   1.534n ± 0%  -27.37% (p=0.000 n=10)
OnesCount32   4.533n ± 0%   1.529n ± 0%  -66.27% (p=0.000 n=10)
OnesCount64   4.565n ± 0%   1.531n ± 1%  -66.46% (p=0.000 n=10)
geomean       3.048n        1.470n       -51.78%

goos: linux
goarch: loong64
pkg: math/bits
cpu: Loongson-3A6000 @ 2500.00MHz
            |  bench.old   |             bench.new               |
            |    sec/op    |    sec/op     vs base               |
OnesCount     3.553n ± 0%    1.201n ± 0%   -66.20% (p=0.000 n=10)
OnesCount8    0.8021n ± 0%   0.8004n ± 0%   -0.21% (p=0.000 n=10)
OnesCount16   1.216n ± 0%    1.000n ± 0%   -17.76% (p=0.000 n=10)
OnesCount32   3.006n ± 0%    1.035n ± 0%   -65.57% (p=0.000 n=10)
OnesCount64   3.503n ± 0%    1.035n ± 0%   -70.45% (p=0.000 n=10)
geomean       2.053n         1.006n        -51.01%

Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8
Reviewed-on: https://go-review.googlesource.com/c/go/+/620978
Reviewed-by: David Chase <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
Reviewed-by: Meidan Li <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 4c8ab99 commit fb9b946

File tree

18 files changed

+419
-221
lines changed

18 files changed

+419
-221
lines changed

src/cmd/compile/internal/ir/symtab.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ type symsStruct struct {
6161
ARM64HasATOMICS *obj.LSym
6262
ARMHasVFPv4 *obj.LSym
6363
Loong64HasLAM_BH *obj.LSym
64+
Loong64HasLSX *obj.LSym
6465
X86HasFMA *obj.LSym
6566
X86HasPOPCNT *obj.LSym
6667
X86HasSSE41 *obj.LSym

src/cmd/compile/internal/loong64/ssa.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
493493
}
494494
}
495495
fallthrough
496+
496497
case ssa.OpLOONG64MOVWF,
497498
ssa.OpLOONG64MOVWD,
498499
ssa.OpLOONG64TRUNCFW,
@@ -525,6 +526,16 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
525526
p.From.Reg = v.Args[0].Reg()
526527
p.To.Type = obj.TYPE_REG
527528
p.To.Reg = v.Reg()
529+
530+
case ssa.OpLOONG64VPCNT64,
531+
ssa.OpLOONG64VPCNT32,
532+
ssa.OpLOONG64VPCNT16:
533+
p := s.Prog(v.Op.Asm())
534+
p.From.Type = obj.TYPE_REG
535+
p.From.Reg = ((v.Args[0].Reg() - loong64.REG_F0) & 31) + loong64.REG_V0
536+
p.To.Type = obj.TYPE_REG
537+
p.To.Reg = ((v.Reg() - loong64.REG_F0) & 31) + loong64.REG_V0
538+
528539
case ssa.OpLOONG64NEGV:
529540
// SUB from REGZERO
530541
p := s.Prog(loong64.ASUBVU)
@@ -533,6 +544,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
533544
p.Reg = loong64.REGZERO
534545
p.To.Type = obj.TYPE_REG
535546
p.To.Reg = v.Reg()
547+
536548
case ssa.OpLOONG64DUFFZERO:
537549
// runtime.duffzero expects start address in R20
538550
p := s.Prog(obj.ADUFFZERO)

src/cmd/compile/internal/ssa/_gen/LOONG64.rules

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,10 @@
153153
(BitRev32 ...) => (BITREVW ...)
154154
(BitRev64 ...) => (BITREVV ...)
155155

156+
(PopCount64 <t> x) => (MOVVfpgp <t> (VPCNT64 <typ.Float64> (MOVVgpfp <typ.Float64> x)))
157+
(PopCount32 <t> x) => (MOVWfpgp <t> (VPCNT32 <typ.Float32> (MOVWgpfp <typ.Float32> x)))
158+
(PopCount16 <t> x) => (MOVWfpgp <t> (VPCNT16 <typ.Float32> (MOVWgpfp <typ.Float32> (ZeroExt16to32 x))))
159+
156160
// math package intrinsics
157161
(Sqrt ...) => (SQRTD ...)
158162
(Sqrt32 ...) => (SQRTF ...)

src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,31 @@ func init() {
162162
readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
163163
)
164164
ops := []opData{
165+
// unary ops
166+
{name: "NEGV", argLength: 1, reg: gp11}, // -arg0
167+
{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
168+
{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
169+
170+
{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
171+
{name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
172+
173+
{name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64
174+
175+
{name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // Count leading (high order) zeroes (returns 0-32)
176+
{name: "CLZV", argLength: 1, reg: gp11, asm: "CLZV"}, // Count leading (high order) zeroes (returns 0-64)
177+
178+
{name: "REVB2H", argLength: 1, reg: gp11, asm: "REVB2H"}, // Swap bytes: 0x11223344 -> 0x22114433 (sign extends to 64 bits)
179+
{name: "REVB2W", argLength: 1, reg: gp11, asm: "REVB2W"}, // Swap bytes: 0x1122334455667788 -> 0x4433221188776655
180+
{name: "REVBV", argLength: 1, reg: gp11, asm: "REVBV"}, // Swap bytes: 0x1122334455667788 -> 0x8877665544332211
181+
182+
{name: "BITREV4B", argLength: 1, reg: gp11, asm: "BITREV4B"}, // Reverse the bits of each byte inside a 32-bit arg[0]
183+
{name: "BITREVW", argLength: 1, reg: gp11, asm: "BITREVW"}, // Reverse the bits in a 32-bit arg[0]
184+
{name: "BITREVV", argLength: 1, reg: gp11, asm: "BITREVV"}, // Reverse the bits in a 64-bit arg[0]
185+
186+
{name: "VPCNT64", argLength: 1, reg: fp11, asm: "VPCNTV"}, // count set bits for each 64-bit unit and store the result in each 64-bit unit
187+
{name: "VPCNT32", argLength: 1, reg: fp11, asm: "VPCNTW"}, // count set bits for each 32-bit unit and store the result in each 32-bit unit
188+
{name: "VPCNT16", argLength: 1, reg: fp11, asm: "VPCNTH"}, // count set bits for each 16-bit unit and store the result in each 16-bit unit
189+
165190
// binary ops
166191
{name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1
167192
{name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops.
@@ -203,32 +228,13 @@ func init() {
203228
{name: "FNMSUBF", argLength: 3, reg: fp31, asm: "FNMSUBF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) - arg2)
204229
{name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) - arg2)
205230

206-
{name: "NEGV", argLength: 1, reg: gp11}, // -arg0
207-
{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
208-
{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
209-
{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
210-
{name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
211-
212-
{name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // Count leading (high order) zeroes (returns 0-32)
213-
{name: "CLZV", argLength: 1, reg: gp11, asm: "CLZV"}, // Count leading (high order) zeroes (returns 0-64)
214-
215-
{name: "REVB2H", argLength: 1, reg: gp11, asm: "REVB2H"}, // Swap bytes: 0x11223344 -> 0x22114433 (sign extends to 64 bits)
216-
{name: "REVB2W", argLength: 1, reg: gp11, asm: "REVB2W"}, // Swap bytes: 0x1122334455667788 -> 0x4433221188776655
217-
{name: "REVBV", argLength: 1, reg: gp11, asm: "REVBV"}, // Swap bytes: 0x1122334455667788 -> 0x8877665544332211
218-
219-
{name: "BITREV4B", argLength: 1, reg: gp11, asm: "BITREV4B"}, // Reverse the bits of each byte inside a 32-bit arg[0]
220-
{name: "BITREVW", argLength: 1, reg: gp11, asm: "BITREVW"}, // Reverse the bits in a 32-bit arg[0]
221-
{name: "BITREVV", argLength: 1, reg: gp11, asm: "BITREVV"}, // Reverse the bits in a 64-bit arg[0]
222-
223231
{name: "FMINF", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMINF", commutative: true, typ: "Float32"}, // min(arg0, arg1), float32
224232
{name: "FMIND", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMIND", commutative: true, typ: "Float64"}, // min(arg0, arg1), float64
225233
{name: "FMAXF", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXF", commutative: true, typ: "Float32"}, // max(arg0, arg1), float32
226234
{name: "FMAXD", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXD", commutative: true, typ: "Float64"}, // max(arg0, arg1), float64
227235

228-
{name: "MASKEQZ", argLength: 2, reg: gp21, asm: "MASKEQZ"}, // returns 0 if arg1 == 0, otherwise returns arg0
229-
{name: "MASKNEZ", argLength: 2, reg: gp21, asm: "MASKNEZ"}, // returns 0 if arg1 != 0, otherwise returns arg0
230-
231-
{name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64
236+
{name: "MASKEQZ", argLength: 2, reg: gp21, asm: "MASKEQZ"}, // returns 0 if arg1 == 0, otherwise returns arg0
237+
{name: "MASKNEZ", argLength: 2, reg: gp21, asm: "MASKNEZ"}, // returns 0 if arg1 != 0, otherwise returns arg0
232238
{name: "FCOPYSGD", argLength: 2, reg: fp21, asm: "FCOPYSGD"}, // float64
233239

234240
// shifts

0 commit comments

Comments
 (0)