From 4c4925c1f4ccd72002957c3e73b4f117f2bcf712 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Wed, 11 Jun 2008 12:40:13 -0400
Subject: [ARM] fix cache alignment code in memset.S

This code is currently disabled, which explains why no one was affected.

Signed-off-by: Nicolas Pitre
Signed-off-by: Lennert Buytenhek
---
 arch/arm/lib/memmove.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm/lib')

diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
index ef7fddc14ac9..018522c3ff26 100644
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@@ -60,6 +60,7 @@ ENTRY(memmove)
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
 	CALGN(	subs	r2, r2, ip		)	@ C is set here
+	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	add	pc, r4, ip		)

 		PLD(	pld	[r1, #-4]		)
@@ -139,7 +140,6 @@ ENTRY(memmove)
 		blt	14f

 	CALGN(	ands	ip, r1, #31		)
-	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
 	CALGN(	bcc	15f			)
--
cgit

From 2239aff6ab2b95af1f628eee7a809f21c41605b3 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Mon, 31 Mar 2008 12:38:31 -0400
Subject: [ARM] cache align destination pointer when copying memory for some processors

The implementation for memory copy functions on ARM had a (disabled)
provision for aligning the source pointer before loading registers with
data. Turns out that aligning the _destination_ pointer is much more
useful, as the read side is already sufficiently helped with the use of
preload.

So this changes the definition of the CALGN() macro to target the
destination pointer instead, and turns it on for Feroceon processors
where the gain is very noticeable.

Signed-off-by: Nicolas Pitre
Signed-off-by: Lennert Buytenhek
---
 arch/arm/lib/copy_template.S | 12 ++----------
 arch/arm/lib/memmove.S       | 12 ++----------
 2 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'arch/arm/lib')

diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index cab355c0c1f7..139cce646055 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -12,14 +12,6 @@
  * published by the Free Software Foundation.
  */

-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...)	code
-#define CALGN(code...)
-
 /*
  * Theory of operation
  * -------------------
@@ -82,7 +74,7 @@
 		stmfd	sp!, {r5 - r8}
 		blt	5f

-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	r3, ip, #32		)
 	CALGN(	sbcnes	r4, r3, r2		)	@ C is always set here
 	CALGN(	bcs	2f			)
@@ -168,7 +160,7 @@
 		subs	r2, r2, #28
 		blt	14f

-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
index 018522c3ff26..2e301b7bd8f1 100644
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@@ -13,14 +13,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>

-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...)	code
-#define CALGN(code...)
-
 		.text

 /*
@@ -55,7 +47,7 @@ ENTRY(memmove)
 		stmfd	sp!, {r5 - r8}
 		blt	5f

-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
@@ -139,7 +131,7 @@ ENTRY(memmove)
 		subs	r2, r2, #28
 		blt	14f

-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
 	CALGN(	bcc	15f			)
--
cgit

From f91a8dcc25398c5d708056de081d6cebf3f2023e Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Fri, 11 Apr 2008 21:04:28 -0400
Subject: [ARM] cache align memset and memzero

This is a natural extension following the previous patch.
Non Feroceon based targets are unchanged.

Signed-off-by: Nicolas Pitre
Signed-off-by: Lennert Buytenhek
---
 arch/arm/lib/memset.S  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memzero.S | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

(limited to 'arch/arm/lib')

diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 95b110b07a89..b477d4ac88ef 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -39,6 +39,9 @@ ENTRY(memset)
 	mov	r3, r1
 	cmp	r2, #16
 	blt	4f
+
+#if ! CALGN(1)+0
+
 /*
  * We need an extra register for this loop - save the return address and
  * use the LR
@@ -64,6 +67,49 @@ ENTRY(memset)
 	stmneia	r0!, {r1, r3, ip, lr}
 	ldr	lr, [sp], #4

+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	ip, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	r0, #31
+	ble	3f
+
+	and	ip, r0, #31
+	rsb	ip, ip, #32
+	sub	r2, r2, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	tst	ip, #(1 << 30)
+	mov	ip, r1
+	strne	r1, [r0], #4
+
+3:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r2, #32
+	stmneia	r0!, {r1, r3-r7, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
 4:	tst	r2, #8
 	stmneia	r0!, {r1, r3}
 	tst	r2, #4
diff --git a/arch/arm/lib/memzero.S b/arch/arm/lib/memzero.S
index abf2508e8221..b8f79d80ee9b 100644
--- a/arch/arm/lib/memzero.S
+++ b/arch/arm/lib/memzero.S
@@ -39,6 +39,9 @@ ENTRY(__memzero)
  */
 	cmp	r1, #16			@ 1 we can skip this chunk if we
 	blt	4f			@ 1 have < 16 bytes
+
+#if ! CALGN(1)+0
+
 /*
  * We need an extra register for this loop - save the return address and
  * use the LR
@@ -64,6 +67,47 @@ ENTRY(__memzero)
 	stmneia	r0!, {r2, r3, ip, lr}	@ 4
 	ldr	lr, [sp], #4		@ 1

+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r2
+	mov	r5, r2
+	mov	r6, r2
+	mov	r7, r2
+	mov	ip, r2
+	mov	lr, r2
+
+	cmp	r1, #96
+	andgts	ip, r0, #31
+	ble	3f
+
+	rsb	ip, ip, #32
+	sub	r1, r1, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	movs	ip, ip, lsl #2
+	strcs	r2, [r0], #4
+
+3:	subs	r1, r1, #64
+	stmgeia	r0!, {r2-r7, ip, lr}
+	stmgeia	r0!, {r2-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r1, #32
+	stmneia	r0!, {r2-r7, ip, lr}
+	tst	r1, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
 4:	tst	r1, #8			@ 1 8 bytes or more?
 	stmneia	r0!, {r2, r3}		@ 2
 	tst	r1, #4			@ 1 4 bytes or more?
--
cgit
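
Notes on the patches above (editorial notes, not part of the commits).

The core idea of the second patch is to align the destination pointer to a
cache-line boundary before entering the bulk store loop, while the source
side is left to the PLD (preload) hints. For readers who do not follow ARM
assembly, here is a rough C sketch of that strategy. It is an illustration
only, not the kernel's implementation: the function name, the explicit
32-byte line size (implied by the "#31" masks in the patches), and the size
threshold are assumptions made for the example.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define CACHE_LINE 32u  /* assumed line size, matching the #31 masks above */

	void *copy_dst_aligned(void *dst, const void *src, size_t n)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		/* Only bother aligning for reasonably large copies. */
		if (n >= 2 * CACHE_LINE) {
			/* Bytes needed to bring the destination to a line boundary. */
			size_t head = (CACHE_LINE - ((uintptr_t)d & (CACHE_LINE - 1)))
				      & (CACHE_LINE - 1);

			memcpy(d, s, head);           /* short unaligned head */
			d += head;
			s += head;
			n -= head;

			/* From here on every store starts on a cache-line boundary,
			 * so each iteration writes one whole line.  The loads may
			 * still be unaligned; prefetching keeps the read side fed. */
			while (n >= CACHE_LINE) {
				memcpy(d, s, CACHE_LINE);
				d += CACHE_LINE;
				s += CACHE_LINE;
				n -= CACHE_LINE;
			}
		}

		memcpy(d, s, n);                      /* remaining tail */
		return dst;
	}

The likely benefit, although the commit message does not spell it out, is
that a store covering a full cache line can complete without the line being
fetched first, which is where the "very noticeable" gain on Feroceon would
come from.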
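
The "#if ! CALGN(1)+0" test added to memset.S and memzero.S also deserves a
short explanation: it compiles the original, simple code when CALGN() is the
empty macro, and the new cache-aligning code when CALGN() passes its argument
through. The standalone C program below demonstrates the preprocessor
arithmetic involved; the CALGN() definition used here is an assumption for
the demo, since the real definition is not part of the diffs above (they are
limited to arch/arm/lib).

	#include <stdio.h>

	/* Disabled form of the macro: it swallows its argument.  (GNU-style
	 * named variadic macro, as used in the kernel sources.) */
	#define CALGN(code...)

	#if ! CALGN(1)+0
	/* "! CALGN(1)+0" expands to "! +0", which evaluates to 1, so this
	 * branch is taken when CALGN() is empty. */
	#define PATH "simple path (cache alignment disabled)"
	#else
	/* With "#define CALGN(code...) code" the test expands to "! 1+0",
	 * which the preprocessor evaluates as (!1)+0 == 0, selecting this
	 * branch instead. */
	#define PATH "cache-aligning path"
	#endif

	int main(void)
	{
		printf("selected: %s\n", PATH);
		return 0;
	}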