sh: Optimised memset for SH4 (dfc34940) · Commits · jan.koester / Linux

arch/sh/lib/Makefile

+5 −2

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@
		# Makefile for SuperH-specific library files..
		#

		lib-y = delay.o memset.o memmove.o memchr.o \
		lib-y = delay.o memmove.o memchr.o \
		checksum.o strlen.o div64.o div64-generic.o

		# Extracted from libgcc
		@@ -23,8 +23,11 @@ obj-y += io.o
		memcpy-y := memcpy.o
		memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o

		memset-y := memset.o
		memset-$(CONFIG_CPU_SH4) := memset-sh4.o

		lib-$(CONFIG_MMU) += copy_page.o __clear_user.o
		lib-$(CONFIG_MCOUNT) += mcount.o
		lib-y += $(memcpy-y) $(udivsi3-y)
		lib-y += $(memcpy-y) $(memset-y) $(udivsi3-y)

		EXTRA_CFLAGS += -Werror

0 → 100644

+107 −0

Original line number	Diff line number	Diff line
		/*
		* "memset" implementation for SH4
		*
		* Copyright (C) 1999 Niibe Yutaka
		* Copyright (c) 2009 STMicroelectronics Limited
		* Author: Stuart Menefy <stuart.menefy:st.com>
		*/

		/*
		* void *memset(void *s, int c, size_t n);
		*/

		#include <linux/linkage.h>

		/*
		 * memset for SH4: store the low byte of c into n bytes starting at s.
		 *
		 * Register roles, as used by the code below:
		 *   r4     in: s. Advanced to s + n straight away and then used as a
		 *          descending store pointer, so the buffer is filled from
		 *          its end towards its start.
		 *   r5     in: c. Replicated into all four bytes at label 2 so that
		 *          longword stores can be used.
		 *   r6     in: n. Holds the number of bytes still to be written.
		 *   r0-r3  scratch (loop counters, masks, movca.l source).
		 *   r0     out: final value of r4, which is back at s once all n
		 *          bytes have been stored.
		 *
		 * Stages: fewer than 12 bytes -> byte loop only (label 40); otherwise
		 * byte stores to 4-byte alignment, then for 64 or more remaining bytes
		 * word stores to 32-byte alignment followed by a 32-bytes-per-pass
		 * loop, then an 8-bytes-per-pass loop, then a byte tail.
		 *
		 * The instruction following each bt/s, bf/s and rts sits in the branch
		 * delay slot and executes whether or not the branch is taken.
		 */
		ENTRY(memset)
		mov #12,r0
		add r6,r4 /* r4 = s + n (one past the last byte to fill) */
		cmp/gt r6,r0 /* T = (12 > n), signed compare */
		bt/s 40f ! if it's too small, set a byte at once
		mov r4,r0 /* delay slot: r0 = end address for the alignment test */
		and #3,r0 /* r0 = bytes lying above the last 4-byte boundary */
		cmp/eq #0,r0
		bt/s 2f ! It's aligned
		sub r0,r6 /* delay slot: take those bytes off the count (no-op if r0 == 0) */
		/* Byte stores until r4 is 4-byte aligned; r0 is 1..3 here. */
		1:
		dt r0
		bf/s 1b
		mov.b r5,@-r4 /* delay slot: runs on every pass, including the last */
		/* Replicate the fill byte into all four bytes of r5 for mov.l stores. */
		2: ! make VVVV
		extu.b r5,r5
		swap.b r5,r0 ! V0
		or r0,r5 ! VV
		swap.w r5,r0 ! VV00
		or r0,r5 ! VVVV

		! Check if enough bytes need to be copied to be worth the big loop
		mov #0x40, r0 ! (MT)
		cmp/gt r6,r0 ! (MT) 64 > len => slow loop

		bt/s 22f
		mov r6,r0 /* delay slot: r0 = remaining count, as label 22 expects */

		! align the dst to the cache block size if necessary
		mov r4, r3
		mov #~(0x1f), r1 /* mask that clears the low five address bits */

		and r3, r1 /* r1 = r4 rounded down to a 32-byte boundary */
		cmp/eq r3, r1

		bt/s 11f ! dst is already aligned
		sub r1, r3 ! r3-r1 -> r3
		shlr2 r3 ! number of loops

		/* Word stores down to the 32-byte boundary; r3 is non-zero here. */
		10: mov.l r5,@-r4
		dt r3
		bf/s 10b
		add #-4, r6 /* delay slot: account for the word just stored */

		11: ! dst is 32byte aligned
		mov r6,r2
		mov #-5,r0 /* negative shld count shifts right: r2 = r6 >> 5 */
		shld r0,r2 ! number of loops

		add #-32, r4 /* r4 = base of the topmost 32-byte block */
		mov r5, r0 /* movca.l stores from r0 */
		/*
		 * One 32-byte block per pass. The first store is movca.l (SH-4
		 * "move with cache block allocation"); the other seven words use
		 * displacement addressing from the same block base. r6 drops by 32
		 * per pass, and the r4 step sits in the bf/s delay slot.
		 */
		12:
		movca.l r0,@r4
		mov.l r5,@(4, r4)
		mov.l r5,@(8, r4)
		mov.l r5,@(12,r4)
		mov.l r5,@(16,r4)
		mov.l r5,@(20,r4)
		add #-0x20, r6
		mov.l r5,@(24,r4)
		dt r2
		mov.l r5,@(28,r4)
		bf/s 12b
		add #-32, r4 /* delay slot: step down to the next block */

		add #32, r4 /* undo the extra step taken by the final delay slot */
		mov #8, r0
		cmp/ge r0, r6
		bf 40f /* fewer than 8 bytes left: finish with the byte loop */

		mov r6,r0
		/*
		 * Two words (8 bytes) per pass, entered with r0 = r6. Both entry
		 * paths arrive with r6 >= 8 (at least 12 on entry minus at most 3
		 * alignment bytes, or the cmp/ge check just above), so the pass
		 * count r6 >> 3 is at least 1.
		 */
		22:
		shlr2 r0
		shlr r0 ! r0 = r6 >> 3
		3:
		dt r0
		mov.l r5,@-r4 ! set 8-byte at once
		bf/s 3b
		mov.l r5,@-r4 /* delay slot: second word of the pair */
		!
		mov #7,r0
		and r0,r6 /* r6 = leftover byte count, 0..7 */

		! fill bytes (length may be zero)
		40: tst r6,r6
		bt 5f /* nothing left to store */
		4:
		dt r6
		bf/s 4b
		mov.b r5,@-r4 /* delay slot: runs on every pass, including the last */
		5:
		rts
		mov r4,r0 /* delay slot: return value = r4 */