LoongArch: Align FPU register state to 32 bytes

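Move fpr to the beginning of struct loongarch_fpu so it is naturally
aligned to FPU_ALIGN (32 bytes), improving 256-bit SIMD (LASX) context
switch performance. Previously fpr followed fcc, fcsr and ftop (16 bytes
in total), so it started at offset 16 within the struct.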

Also adjust process.c and fpu.S to match the new loongarch_fpu layout.

Signed-off-by: Lisa Robinson <lisa@bytefly.space>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
This commit is contained in:
Lisa Robinson
2026-04-22 15:45:11 +08:00
committed by Huacai Chen
parent 1829419bc3
commit e3f4591f79
3 changed files with 9 additions and 7 deletions

View File

@@ -80,10 +80,10 @@ BUILD_FPR_ACCESS(32)
BUILD_FPR_ACCESS(64)
struct loongarch_fpu {
union fpureg fpr[NUM_FPU_REGS];
uint64_t fcc; /* 8x8 */
uint32_t fcsr;
uint32_t ftop;
union fpureg fpr[NUM_FPU_REGS];
};
struct loongarch_lbt {

View File

@@ -97,7 +97,7 @@
.endm
#ifdef CONFIG_32BIT
.macro sc_save_fcc thread tmp0 tmp1
.macro sc_save_fcc base tmp0 tmp1
movcf2gr \tmp0, $fcc0
move \tmp1, \tmp0
movcf2gr \tmp0, $fcc1
@@ -106,7 +106,7 @@
bstrins.w \tmp1, \tmp0, 23, 16
movcf2gr \tmp0, $fcc3
bstrins.w \tmp1, \tmp0, 31, 24
EX st.w \tmp1, \thread, THREAD_FCC
EX st.w \tmp1, \base, 0
movcf2gr \tmp0, $fcc4
move \tmp1, \tmp0
movcf2gr \tmp0, $fcc5
@@ -115,11 +115,11 @@
bstrins.w \tmp1, \tmp0, 23, 16
movcf2gr \tmp0, $fcc7
bstrins.w \tmp1, \tmp0, 31, 24
EX st.w \tmp1, \thread, (THREAD_FCC + 4)
EX st.w \tmp1, \base, 4
.endm
.macro sc_restore_fcc thread tmp0 tmp1
EX ld.w \tmp0, \thread, THREAD_FCC
.macro sc_restore_fcc base tmp0 tmp1
EX ld.w \tmp0, \base, 0
bstrpick.w \tmp1, \tmp0, 7, 0
movgr2cf $fcc0, \tmp1
bstrpick.w \tmp1, \tmp0, 15, 8
@@ -128,7 +128,7 @@
movgr2cf $fcc2, \tmp1
bstrpick.w \tmp1, \tmp0, 31, 24
movgr2cf $fcc3, \tmp1
EX ld.w \tmp0, \thread, (THREAD_FCC + 4)
EX ld.w \tmp0, \base, 4
bstrpick.w \tmp1, \tmp0, 7, 0
movgr2cf $fcc4, \tmp1
bstrpick.w \tmp1, \tmp0, 15, 8

View File

@@ -135,6 +135,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
return 0;
}
dst->thread.fpu.fcsr = src->thread.fpu.fcsr;
if (!used_math())
memcpy(dst, src, offsetof(struct task_struct, thread.fpu.fpr));
else