Skip to content

Commit 94f85eb

Browse files
authored
Merge pull request #5822 from ErnstPeng/la-dev
optimize zgemm, ic/zamin and sdot lsx kernel for 2k3000 cpu
2 parents 901c214 + 4850f86 commit 94f85eb

7 files changed

Lines changed: 267 additions & 210 deletions

File tree

driver/others/parameter.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,17 @@ int get_L3_size() {
790790
return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB
791791
}
792792

793+
/*
 * Execute the LoongArch `cpucfg` instruction with selector 0 and return the
 * value it produces as an int.
 * blas_set_parameter() compares this value with 0x0014b020 to recognise the
 * 2K3000 in LA264 builds.
 * NOTE(review): going by the function name, configuration word 0 is taken to
 * be the processor ID (PRID) -- confirm against the LoongArch reference manual.
 */
int get_cpu_prid() {
794+
int ret = 0, id = 0x0; // ret: receives the cpucfg result; id: selector passed to cpucfg, fixed at 0
795+
__asm__ volatile (
796+
"cpucfg %[ret], %[id]"
797+
: [ret]"=r"(ret) // output operand: any general-purpose register, written by the instruction
798+
: [id]"r"(id) // input operand: selector value held in a general-purpose register
799+
: "memory" // memory clobber: the compiler may not keep memory values cached across this asm
800+
);
801+
return ret;
802+
}
803+
793804
void blas_set_parameter(void){
794805
#if defined(LA464)
795806
int L3_size = get_L3_size();
@@ -868,6 +879,18 @@ void blas_set_parameter(void){
868879
}
869880
}
870881
#endif
882+
#elif defined(LA264)
883+
int prid = get_cpu_prid();
884+
if (prid == 0x0014b020) { //2k3000
885+
886+
zgemm_p = 128;
887+
zgemm_q = 176;
888+
zgemm_r = 360;
889+
} else {
890+
zgemm_p = 64;
891+
zgemm_q = 120;
892+
zgemm_r = 4096;
893+
}
871894
#endif
872895
}
873896
#endif

kernel/loongarch64/KERNEL.LA264

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
ifndef NO_LSX
22

3-
SDOTKERNEL = dot_lsx.S
3+
SDOTKERNEL = sdot_lsx.S
44
DSDOTKERNEL = dot_lsx.S
55
DDOTKERNEL = dot_lsx.S
66
CDOTKERNEL = cdot_lsx.S

kernel/loongarch64/icamin_lsx.S

Lines changed: 19 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5858
#define VI3 $vr8
5959
#define VI4 $vr19
6060
#define VT0 $vr23
61+
#define VMASK $vr7
6162

6263
PROLOGUE
6364
li.d i0, 0
@@ -76,6 +77,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7677
li.d I, -1
7778
vreplgr2vr.d VI4, I
7879
vffint.d.l VI4, VI4 // -1
80+
li.d I, 0x7fffffffffffffff // Mask for clearing the sign bit
81+
vreplgr2vr.d VMASK, I
7982
bne INCX, TEMP, .L20 // incx != 1
8083

8184
// Init Index
@@ -99,17 +102,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
99102
vld VX1, X, 2 * SIZE
100103
vpickev.d x1, VX1, VX0
101104
vpickod.d x2, VX1, VX0
102-
vfmul.d x3, VI4, x1
103-
vfmul.d x4, VI4, x2
104-
vfcmp.clt.d VT0, x1, VI3
105-
vfcmp.clt.d VINC8, x2, VI3
106-
vbitsel.v x1, x1, x3, VT0
107-
vbitsel.v x2, x2, x4, VINC8
105+
vand.v x1, x1, VMASK
106+
vand.v x2, x2, VMASK
108107
vfadd.d VM0, x1, x2
109108
#else
110109
li.w I, -1
111110
vreplgr2vr.w VI4, I
112111
vffint.s.w VI4, VI4 // -1
112+
li.d I, 0x7fffffff // Mask for clearing the sign bit
113+
vreplgr2vr.w VMASK, I
113114
bne INCX, TEMP, .L20 // incx != 1
114115

115116
// Init Index
@@ -141,12 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
141142
vld VX1, X, 4 * SIZE
142143
vpickev.w x1, VX1, VX0
143144
vpickod.w x2, VX1, VX0
144-
vfmul.s x3, VI4, x1
145-
vfmul.s x4, VI4, x2
146-
vfcmp.clt.s VT0, x1, VI3
147-
vfcmp.clt.s VINC8, x2, VI3
148-
vbitsel.v x1, x1, x3, VT0
149-
vbitsel.v x2, x2, x4, VINC8
145+
vand.v x1, x1, VMASK
146+
vand.v x2, x2, VMASK
150147
vfadd.s VM0, x1, x2
151148
#endif
152149
.align 3
@@ -159,12 +156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
159156
addi.d I, I, -1
160157
vpickev.d x1, VX1, VX0
161158
vpickod.d x2, VX1, VX0
162-
vfmul.d x3, VI4, x1
163-
vfmul.d x4, VI4, x2
164-
vfcmp.clt.d VT0, x1, VI3
165-
vfcmp.clt.d VINC8, x2, VI3
166-
vbitsel.v x1, x1, x3, VT0
167-
vbitsel.v x2, x2, x4, VINC8
159+
vand.v x1, x1, VMASK
160+
vand.v x2, x2, VMASK
168161
vfadd.d x1, x1, x2
169162
vfmin.d x3, VM0, x1
170163
vfcmp.ceq.d VT0, x3, VM0
@@ -183,12 +176,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183176
vpickev.w x1, VX1, VX0
184177
vpickod.w x2, VX1, VX0
185178
#endif
186-
VFMUL x3, VI4, x1
187-
VFMUL x4, VI4, x2
188-
VCMPLT VT0, x1, VI3
189-
VCMPLT VINC8, x2, VI3
190-
vbitsel.v x1, x1, x3, VT0
191-
vbitsel.v x2, x2, x4, VINC8
179+
vand.v x1, x1, VMASK
180+
vand.v x2, x2, VMASK
192181
VFADD x1, x1, x2
193182
VFMIN x3, VM0, x1
194183
VCMPEQ VT0, x3, VM0
@@ -264,12 +253,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
264253
vinsgr2vr.d x2, t2, 0
265254
vinsgr2vr.d x1, t3, 1
266255
vinsgr2vr.d x2, t4, 1
267-
vfmul.d x3, VI4, x1
268-
vfmul.d x4, VI4, x2
269-
vfcmp.clt.d VT0, x1, VI3
270-
vfcmp.clt.d VINC8, x2, VI3
271-
vbitsel.v x1, x1, x3, VT0
272-
vbitsel.v x2, x2, x4, VINC8
256+
vand.v x1, x1, VMASK
257+
vand.v x2, x2, VMASK
273258
vfadd.d VM0, x1, x2
274259
#else
275260
addi.w i0, i0, 1
@@ -339,12 +324,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
339324
vinsgr2vr.d x1, t3, 1
340325
vinsgr2vr.d x2, t4, 1
341326
vadd.d VI1, VI1, VINC4
342-
vfmul.d x3, VI4, x1
343-
vfmul.d x4, VI4, x2
344-
vfcmp.clt.d VT0, x1, VI3
345-
vfcmp.clt.d VINC8, x2, VI3
346-
vbitsel.v x1, x1, x3, VT0
347-
vbitsel.v x2, x2, x4, VINC8
327+
vand.v x1, x1, VMASK
328+
vand.v x2, x2, VMASK
348329
vfadd.d x1, x1, x2
349330
vfmin.d x3, VM0, x1
350331
ld.d t1, X, 0 * SIZE
@@ -385,12 +366,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
385366
vinsgr2vr.w x2, t4, 3
386367
#endif
387368
addi.d I, I, -1
388-
VFMUL x3, VI4, x1
389-
VFMUL x4, VI4, x2
390-
VCMPLT VT0, x1, VI3
391-
VCMPLT VINC8, x2, VI3
392-
vbitsel.v x1, x1, x3, VT0
393-
vbitsel.v x2, x2, x4, VINC8
369+
vand.v x1, x1, VMASK
370+
vand.v x2, x2, VMASK
394371
VFADD x1, x1, x2
395372
VFMIN x3, VM0, x1
396373
VCMPEQ VT0, x3, VM0

kernel/loongarch64/sdot_lsx.S

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
30+
#include "common.h"
31+
32+
/* Incoming arguments, in the general-purpose registers the kernel reads them from. */
#define N $r4 /* element count; the kernel returns 0 when N <= 0 */
33+
#define X $r5 /* pointer to the first vector; advanced as elements are consumed */
34+
#define INCX $r6 /* stride of X; scaled in place by BASE_SHIFT on entry */
35+
#define Y $r7 /* pointer to the second vector; advanced as elements are consumed */
36+
#define INCY $r8 /* stride of Y; scaled in place by BASE_SHIFT on entry */
37+
38+
/* Scratch general-purpose registers. */
#define I $r17 /* loop counter */
39+
#define TEMP $r18 /* holds SIZE for the unit-stride test */
40+
41+
/*
 * Floating-point register aliases; do not reassign them without understanding
 * the consequences. The kernel body in this file names $f8/$vr8 and $vr9
 * directly (its two accumulators) instead of using s1/s2, and it never
 * references a1 or b1.
 */
42+
#define s1 $f8
43+
#define s2 $f9
44+
#define a1 $f10
45+
#define b1 $f11
46+
47+
/*
 * Single-precision dot product: accumulates x[i] * y[i] over N elements and
 * returns the sum in $f0 (0.0 when N <= 0).
 *
 * Two paths:
 *   - both strides equal SIZE after scaling: a 16-element-per-iteration LSX
 *     loop with two vector accumulators ($vr8, $vr9), then a scalar loop for
 *     the remaining N & 15 elements;
 *   - otherwise: a scalar loop handling one element per iteration.
 *
 * The code relies on $fN naming the low single-precision lane of $vrN, so the
 * scalar instructions continue the sum held in $vr8.
 */
PROLOGUE
48+
49+
vxor.v $vr8, $vr8, $vr8 /* accumulator 0 = 0 (x XOR x) */
50+
vxor.v $vr9, $vr9, $vr9 /* accumulator 1 = 0 */
51+
slli.d INCX, INCX, BASE_SHIFT /* scale stride by BASE_SHIFT (from common.h; presumably elements -> bytes) */
52+
li.d TEMP, SIZE /* TEMP = SIZE, the scaled stride of contiguous data */
53+
slli.d INCY, INCY, BASE_SHIFT /* same scaling for the Y stride */
54+
bge $r0, N, .L999 /* N <= 0: nothing to add, return the zeroed accumulator */
55+
bne INCX, TEMP, .L20 /* inc_x != 1: take the strided scalar path */
56+
bne INCY, TEMP, .L20 /* inc_y != 1: take the strided scalar path */
57+
58+
/* Unit-stride path: (inc_x == 1) && (inc_y == 1) */
59+
srai.d I, N, 4 /* I = N / 16 = number of full 16-element iterations */
60+
bge $r0, I, .L12 /* fewer than 16 elements: go straight to the scalar remainder */
61+
.L11:
62+
/* Main loop: 16 floats of X and 16 floats of Y per iteration (4 x 16-byte loads each) */
63+
vld $vr0, X, 0
64+
vld $vr1, X, 16
65+
vld $vr2, X, 32
66+
vld $vr3, X, 48
67+
vld $vr4, Y, 0
68+
vld $vr5, Y, 16
69+
vld $vr6, Y, 32
70+
vld $vr7, Y, 48
71+
addi.d I, I, -1 /* one iteration consumed */
72+
addi.d X, X, 64 /* advance both pointers by the 64 bytes just loaded */
73+
addi.d Y, Y, 64
74+
75+
vfmadd.s $vr8, $vr0, $vr4, $vr8 /* products alternate between the two accumulators */
76+
vfmadd.s $vr9, $vr1, $vr5, $vr9
77+
vfmadd.s $vr8, $vr2, $vr6, $vr8
78+
vfmadd.s $vr9, $vr3, $vr7, $vr9
79+
80+
bnez I, .L11
81+
82+
vfadd.s $vr8, $vr8, $vr9 /* merge the two accumulators lane by lane */
83+
vextrins.w $vr1, $vr8, 0x01 /* lanes 1..3 of the merged sum are moved into lane 0 of */
84+
vextrins.w $vr2, $vr8, 0x02 /* $vr1..$vr3 so they can be added below as scalars $f1..$f3 */
85+
vextrins.w $vr3, $vr8, 0x03
86+
fadd.s $f8, $f8, $f1 /* horizontal sum: $f8 = lane0 + lane1 + lane2 + lane3 */
87+
fadd.s $f8, $f8, $f2
88+
fadd.s $f8, $f8, $f3
89+
.L12:
90+
andi I, N, 0xf /* I = N mod 16 = elements left after the vector loop */
91+
bge $r0, I, .L999 /* no remainder: done */
92+
.L13:
93+
/* Scalar remainder loop for contiguous data: one element per iteration */
addi.d I, I, -1
94+
fld.s $f0, X, 0
95+
fld.s $f4, Y, 0
96+
addi.d X, X, 4 /* step by one 4-byte element */
97+
addi.d Y, Y, 4
98+
fmadd.s $f8, $f0, $f4, $f8 /* $f8 += x * y */
99+
bnez I, .L13
100+
b .L999
101+
102+
/* Strided path: !((inc_x == 1) && (inc_y == 1)) */
103+
.L20:
104+
move I, N /* N > 0 here, so the loop below runs at least once */
105+
.L21:
106+
addi.d I, I, -1
107+
108+
fld.s $f0, X, 0
109+
fld.s $f4, Y, 0
110+
add.d X, X, INCX /* step each pointer by its scaled stride */
111+
add.d Y, Y, INCY
112+
fmadd.s $f8, $f0, $f4, $f8 /* $f8 += x * y */
113+
114+
bnez I, .L21
115+
b .L999 /* redundant: .L999 is the next instruction anyway */
116+
117+
.L999:
118+
fmov.s $f0, $f8 /* place the result in $f0 for the caller */
119+
jirl $r0, $r1, 0x0 /* jump to the address in $r1, discarding the link value */
120+
121+
EPILOGUE

0 commit comments

Comments
 (0)