From 8045463341b2495da7b2e7dc308a023764315bbe Mon Sep 17 00:00:00 2001
From: swcompiler <lc@wxiat.com>
Date: Fri, 29 Nov 2024 14:15:45 +0800
Subject: [PATCH 11/23] Sw64: Integer Operation Support
---
sysdeps/sw_64/add_n.S | 118 +++++++++
sysdeps/sw_64/addmul_1.S | 89 +++++++
sysdeps/sw_64/bzero.S | 107 ++++++++
sysdeps/sw_64/div.S | 83 ++++++
sysdeps/sw_64/div_libc.h | 170 ++++++++++++
sysdeps/sw_64/divl.S | 96 +++++++
sysdeps/sw_64/divlu.S | 4 +
sysdeps/sw_64/divq.S | 290 +++++++++++++++++++++
sysdeps/sw_64/divqu.S | 292 +++++++++++++++++++++
sysdeps/sw_64/htonl.S | 43 +++
sysdeps/sw_64/htons.S | 39 +++
sysdeps/sw_64/ldiv.S | 222 ++++++++++++++++
sysdeps/sw_64/lldiv.S | 1 +
sysdeps/sw_64/lshift.S | 107 ++++++++
sysdeps/sw_64/mul_1.S | 82 ++++++
sysdeps/sw_64/reml.S | 93 +++++++
sysdeps/sw_64/remlu.S | 4 +
sysdeps/sw_64/remq.S | 274 ++++++++++++++++++++
sysdeps/sw_64/remqu.S | 292 +++++++++++++++++++++
sysdeps/sw_64/rshift.S | 105 ++++++++
sysdeps/sw_64/sub_n.S | 118 +++++++++
sysdeps/sw_64/submul_1.S | 89 +++++++
sysdeps/sw_64/sw6a/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw6a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw6a/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw6a/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw6a/sub_n.S | 147 +++++++++++
sysdeps/sw_64/sw6b/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw6b/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw6b/memcpy.S | 416 +++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/memset.S | 312 ++++++++++++++++++++++
sysdeps/sw_64/sw6b/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw6b/stxcpy.S | 314 ++++++++++++++++++++++
sysdeps/sw_64/sw6b/stxncpy.S | 392 ++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/sub_n.S | 147 +++++++++++
sysdeps/sw_64/sw8a/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw8a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw8a/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw8a/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw8a/sub_n.S | 147 +++++++++++
sysdeps/sw_64/udiv_qrnnd.S | 159 ++++++++++++
42 files changed, 7641 insertions(+)
create mode 100644 sysdeps/sw_64/add_n.S
create mode 100644 sysdeps/sw_64/addmul_1.S
create mode 100644 sysdeps/sw_64/bzero.S
create mode 100644 sysdeps/sw_64/div.S
create mode 100644 sysdeps/sw_64/div_libc.h
create mode 100644 sysdeps/sw_64/divl.S
create mode 100644 sysdeps/sw_64/divlu.S
create mode 100644 sysdeps/sw_64/divq.S
create mode 100644 sysdeps/sw_64/divqu.S
create mode 100644 sysdeps/sw_64/htonl.S
create mode 100644 sysdeps/sw_64/htons.S
create mode 100644 sysdeps/sw_64/ldiv.S
create mode 100644 sysdeps/sw_64/lldiv.S
create mode 100644 sysdeps/sw_64/lshift.S
create mode 100644 sysdeps/sw_64/mul_1.S
create mode 100644 sysdeps/sw_64/reml.S
create mode 100644 sysdeps/sw_64/remlu.S
create mode 100644 sysdeps/sw_64/remq.S
create mode 100644 sysdeps/sw_64/remqu.S
create mode 100644 sysdeps/sw_64/rshift.S
create mode 100644 sysdeps/sw_64/sub_n.S
create mode 100644 sysdeps/sw_64/submul_1.S
create mode 100644 sysdeps/sw_64/sw6a/add_n.S
create mode 100644 sysdeps/sw_64/sw6a/addmul_1.S
create mode 100644 sysdeps/sw_64/sw6a/lshift.S
create mode 100644 sysdeps/sw_64/sw6a/rshift.S
create mode 100644 sysdeps/sw_64/sw6a/sub_n.S
create mode 100644 sysdeps/sw_64/sw6b/add_n.S
create mode 100644 sysdeps/sw_64/sw6b/addmul_1.S
create mode 100644 sysdeps/sw_64/sw6b/lshift.S
create mode 100644 sysdeps/sw_64/sw6b/memcpy.S
create mode 100644 sysdeps/sw_64/sw6b/memset.S
create mode 100644 sysdeps/sw_64/sw6b/rshift.S
create mode 100644 sysdeps/sw_64/sw6b/stxcpy.S
create mode 100644 sysdeps/sw_64/sw6b/stxncpy.S
create mode 100644 sysdeps/sw_64/sw6b/sub_n.S
create mode 100644 sysdeps/sw_64/sw8a/add_n.S
create mode 100644 sysdeps/sw_64/sw8a/addmul_1.S
create mode 100644 sysdeps/sw_64/sw8a/lshift.S
create mode 100644 sysdeps/sw_64/sw8a/rshift.S
create mode 100644 sysdeps/sw_64/sw8a/sub_n.S
create mode 100644 sysdeps/sw_64/udiv_qrnnd.S
diff --git a/sysdeps/sw_64/add_n.S b/sysdeps/sw_64/add_n.S
new file mode 100644
index 00000000..8c5c8c08
--- /dev/null
+++ b/sysdeps/sw_64/add_n.S
@@ -0,0 +1,118 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ ldl $3,0($17)
+ ldl $4,0($18)
+
+ subl $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
+
+ subl $19,$2,$19
+
+.Loop0: subl $2,1,$2
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ addl $17,8,$17
+ addl $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addl $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subl $19,4,$19
+
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ ldl $3,16($17)
+ addl $6,$0,$6
+ ldl $4,16($18)
+ cmpult $6,$0,$1
+ addl $5,$6,$6
+ cmpult $6,$5,$0
+ stl $6,8($16)
+ or $0,$1,$0
+
+ ldl $5,24($17)
+ addl $4,$0,$4
+ ldl $6,24($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,16($16)
+ or $0,$1,$0
+
+ ldl $3,32($17)
+ addl $6,$0,$6
+ ldl $4,32($18)
+ cmpult $6,$0,$1
+ addl $5,$6,$6
+ cmpult $6,$5,$0
+ stl $6,24($16)
+ or $0,$1,$0
+
+ addl $17,32,$17
+ addl $18,32,$18
+ addl $16,32,$16
+ bne $19,.Loop
+
+.Lend: addl $4,$0,$4
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_add_n
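For reference, the carry handling in the loops above corresponds to the following C model of __mpn_add_n; the helper name ref_mpn_add_n and the uint64_t limb type are illustrative assumptions, not part of the patch. Each cmpult in the assembly plays the role of one of the "<" comparisons below.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Hypothetical C sketch: add two limb vectors of the same length and
   return the final carry, tracking the carry exactly as the assembly
   does with its addl/cmpult pairs.  */
static mp_limb_t
ref_mpn_add_n (mp_limb_t *res, const mp_limb_t *s1, const mp_limb_t *s2,
               size_t size)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      mp_limb_t b = s2[i] + cy;       /* may wrap around */
      mp_limb_t c1 = b < cy;          /* carry out of s2[i] + cy */
      mp_limb_t sum = s1[i] + b;
      mp_limb_t c2 = sum < s1[i];     /* carry out of s1[i] + b */
      res[i] = sum;
      cy = c1 | c2;
    }
  return cy;
}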
diff --git a/sysdeps/sw_64/addmul_1.S b/sysdeps/sw_64/addmul_1.S
new file mode 100644
index 00000000..138e3c69
--- /dev/null
+++ b/sysdeps/sw_64/addmul_1.S
@@ -0,0 +1,89 @@
+ # Sw_64 1621 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1 2
+__mpn_addmul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ addl $5,$3,$3
+ cmpult $3,$5,$4
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ addl $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $5,$0,$0 # combine carries
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_addmul_1
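A hedged C sketch of what __mpn_addmul_1 computes; the helper name and the use of GCC's unsigned __int128 are assumptions made for illustration, while the assembly instead pairs mull/umulh and propagates the carry by hand.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Multiply s1 by the single limb s2_limb, add the product into res,
   and return the carry-out limb.  */
static mp_limb_t
ref_mpn_addmul_1 (mp_limb_t *res, const mp_limb_t *s1, size_t size,
                  mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      unsigned __int128 acc =
        (unsigned __int128) s1[i] * s2_limb + res[i] + cy;
      res[i] = (mp_limb_t) acc;       /* low 64 bits */
      cy = (mp_limb_t) (acc >> 64);   /* high 64 bits become the new carry */
    }
  return cy;
}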
diff --git a/sysdeps/sw_64/bzero.S b/sysdeps/sw_64/bzero.S
new file mode 100644
index 00000000..1a020afd
--- /dev/null
+++ b/sysdeps/sw_64/bzero.S
@@ -0,0 +1,107 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Fill a block of memory with zeros. Optimized for the Sw_64 architecture:
+
+ - memory accessed as aligned quadwords only
+ - destination memory not read unless needed for good cache behaviour
+ - basic blocks arranged to optimize branch prediction for full-quadword
+ aligned memory blocks.
+ - partial head and tail quadwords constructed with byte-mask instructions
+
+*/
+
+
+#include <sysdep.h>
+
+ .set noat
+ .set noreorder
+
+ .text
+ .type __bzero, @function
+ .globl __bzero
+ .usepv __bzero, USEPV_PROF
+
+ cfi_startproc
+
+ /* On entry to this basic block:
+ t3 == loop counter
+ t4 == bytes in partial final word
+ a0 == possibly misaligned destination pointer */
+
+ .align 3
+bzero_loop:
+ beq t3, $tail #
+ blbc t3, 0f # skip single store if count even
+
+ stl_u zero, 0(a0) # e0 : store one word
+ subl t3, 1, t3 # .. e1 :
+ addl a0, 8, a0 # e0 :
+ beq t3, $tail # .. e1 :
+
+0: stl_u zero, 0(a0) # e0 : store two words
+ subl t3, 2, t3 # .. e1 :
+ stl_u zero, 8(a0) # e0 :
+ addl a0, 16, a0 # .. e1 :
+ bne t3, 0b # e1 :
+
+$tail: bne t4, 1f # is there a tail to do?
+ ret # no
+
+1: ldl_u t0, 0(a0) # yes, load original data
+ mask7b t0, t4, t0 #
+ stl_u t0, 0(a0) #
+ ret #
+
+__bzero:
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+#endif
+
+ mov a0, v0 # e0 : move return value in place
+ beq a1, $done # .. e1 : early exit for zero-length store
+ and a0, 7, t1 # e0 :
+ addl a1, t1, a1 # e1 : add dest misalignment to count
+ srl a1, 3, t3 # e0 : loop = count >> 3
+ and a1, 7, t4 # .. e1 : find number of bytes in tail
+ unop # :
+ beq t1, bzero_loop # e1 : aligned head, jump right in
+
+ ldl_u t0, 0(a0) # e0 : load original data to mask into
+ cmpult a1, 8, t2 # .. e1 : is this a sub-word set
+ bne t2, $oneq # e1 :
+
+ mask3b t0, a0, t0 # e0 : we span words. finish this partial
+ subl t3, 1, t3 # e0 :
+ addl a0, 8, a0 # .. e1 :
+ stl_u t0, -8(a0) # e0 :
+ br bzero_loop # .. e1 :
+
+ .align 3
+$oneq:
+ mask3b t0, a0, t2 # e0 :
+ mask7b t0, a1, t3 # e0 :
+ or t2, t3, t0 # e1 :
+ stl_u t0, 0(a0) # e0 :
+
+$done: ret
+
+ cfi_endproc
+weak_alias (__bzero, bzero)
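The head/body/tail split described in the comment at the top of this file can be summarized by the C sketch below. It is illustrative only: bzero_model is a made-up name, and plain byte and quadword stores stand in for the byte-masked ldl_u/mask3b/mask7b/stl_u sequences.

#include <stddef.h>
#include <stdint.h>

static void
bzero_model (unsigned char *dst, size_t len)
{
  /* Head: bytes up to the next 8-byte (quadword) boundary,
     handled with a masked store in the assembly.  */
  while (len > 0 && ((uintptr_t) dst & 7) != 0)
    { *dst++ = 0; len--; }

  /* Body: whole aligned quadwords, the stl_u loop.  */
  while (len >= 8)
    { *(uint64_t *) dst = 0; dst += 8; len -= 8; }

  /* Tail: the partial final quadword, handled with mask7b.  */
  while (len > 0)
    { *dst++ = 0; len--; }
}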
diff --git a/sysdeps/sw_64/div.S b/sysdeps/sw_64/div.S
new file mode 100644
index 00000000..6dbdcb7f
--- /dev/null
+++ b/sysdeps/sw_64/div.S
@@ -0,0 +1,83 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Richard Henderson <rth@tamu.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+#undef FRAME
+#ifdef __sw_64_fix__
+#define FRAME 0
+#else
+#define FRAME 16
+#endif
+
+ .set noat
+
+ .align 4
+ .globl div
+ .ent div
+div:
+ .frame sp, FRAME, ra
+#if FRAME > 0
+ ldi sp, -FRAME(sp)
+#endif
+#ifdef PROF
+ .set macro
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ beq $18, $divbyzero
+ rfpcr $f10
+ _ITOFT2 $17, $f0, 0, $18, $f1, 8
+ fcvtld $f0, $f11
+ fcvtld $f1, $f12
+ fdivd $f11, $f12, $f1
+ fcvtdl_z $f1, $f0
+ wfpcr $f10
+ _FTOIT $f0, $0, 0
+
+ mulw $0, $18, $1
+ subw $17, $1, $1
+
+ stw $0, 0(a0)
+ stw $1, 4(a0)
+ mov a0, v0
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+$divbyzero:
+ mov a0, v0
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+ stw zero, 0(v0)
+ stw zero, 4(v0)
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .end div
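At the C level the routine above behaves like the sketch below (hedged: div_model is only an illustration, and it assumes a nonzero divisor, whereas the real code traps via HMC_gentrap). A double holds every 32-bit operand exactly, so the truncated FP quotient equals the integer quotient, and the remainder falls out of the final mulw/subw pair.

#include <stdlib.h>

static div_t
div_model (int numer, int denom)   /* assumes denom != 0 */
{
  div_t r;
  /* Truncates toward zero; INT_MIN / -1 overflows here just as it
     does in C integer division.  */
  r.quot = (int) ((double) numer / (double) denom);
  r.rem = numer - r.quot * denom;
  return r;
}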
diff --git a/sysdeps/sw_64/div_libc.h b/sysdeps/sw_64/div_libc.h
new file mode 100644
index 00000000..2066924b
--- /dev/null
+++ b/sysdeps/sw_64/div_libc.h
@@ -0,0 +1,170 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Common bits for implementing software divide. */
+
+#include <sysdep.h>
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/hmcall.h>
+#else
+# include <machine/pal.h>
+#endif
+
+/* These are not normal C functions. Argument registers are t10 and t11;
+ the result goes in t12; the return address is in t9. Only t12 and AT
+ may be clobbered. */
+#define X t10
+#define Y t11
+#define RV t12
+#define RA t9
+
+/* The secureplt format does not allow the division routines to be called
+ via plt; there aren't enough registers free to be clobbered. Avoid
+ setting the symbol type to STT_FUNC, so that the linker won't be tempted
+ to create a plt entry. */
+#define funcnoplt notype
+
+/* None of these functions should use implicit anything. */
+ .set nomacro
+ .set noat
+
+/* Code fragment to invoke _mcount for profiling. This should be invoked
+ directly after allocation of the stack frame. */
+.macro CALL_MCOUNT
+#ifdef PROF
+ stl ra, 0(sp)
+ stl pv, 8(sp)
+ stl gp, 16(sp)
+ cfi_rel_offset (ra, 0)
+ cfi_rel_offset (pv, 8)
+ cfi_rel_offset (gp, 16)
+ br AT, 1f
+ .set macro
+1: ldgp gp, 0(AT)
+ mov RA, ra
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ ldl ra, 0(sp)
+ ldl pv, 8(sp)
+ ldl gp, 16(sp)
+ cfi_restore (ra)
+ cfi_restore (pv)
+ cfi_restore (gp)
+ /* Realign subsequent code with what we'd have without this
+ macro at all. This means aligned with one arithmetic insn
+ used within the bundle. */
+ .align 4
+ nop
+#endif
+.endm
+
+/* In order to make the below work, all top-level divide routines must
+ use the same frame size. */
+#define FRAME 96
+
+/* Code fragment to generate an integer divide-by-zero fault. When
+ building libc.so, we arrange for there to be one copy of this code
+ placed late in the dso, such that all branches are forward. When
+ building libc.a, we use multiple copies to avoid having an out of
+ range branch. Users should jump to DIVBYZERO. */
+
+.macro DO_DIVBYZERO
+#ifdef PIC
+#define DIVBYZERO __divbyzero
+ .section .gnu.linkonce.t.divbyzero, "ax", @progbits
+ .globl __divbyzero
+ .type __divbyzero, @function
+ .usepv __divbyzero, no
+ .hidden __divbyzero
+#else
+#define DIVBYZERO $divbyzero
+#endif
+
+ .align 4
+DIVBYZERO:
+ cfi_startproc
+ cfi_return_column (RA)
+ cfi_def_cfa_offset (FRAME)
+
+ mov a0, RV
+ unop
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+
+ mov RV, a0
+ clr RV
+ ldi sp, FRAME(sp)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size DIVBYZERO, .-DIVBYZERO
+.endm
+
+/* Like the sw6a instructions, but fall back to stack use on prior machines. */
+#ifdef __sw_64_sw6a__
+ .arch sw6a
+#endif
+#ifdef __sw_64_sw6b__
+ .arch sw6b
+#endif
+#ifdef __sw_64_sw8a__
+ .arch sw8a
+#endif
+
+.macro _ITOFS gr, fr, slot
+#ifdef __sw_64_fix__
+ ifmovs \gr, \fr
+#else
+ stw \gr, \slot(sp)
+ flds \fr, \slot(sp)
+#endif
+.endm
+
+.macro _ITOFT gr, fr, slot
+#ifdef __sw_64_fix__
+ ifmovd \gr, \fr
+#else
+ stl \gr, \slot(sp)
+ fldd \fr, \slot(sp)
+#endif
+.endm
+
+.macro _FTOIT fr, gr, slot
+#ifdef __sw_64_fix__
+ fimovd \fr, \gr
+#else
+ fstd \fr, \slot(sp)
+ ldl \gr, \slot(sp)
+#endif
+.endm
+
+/* Similarly, but move two registers. Schedules better for pre-sw6a. */
+
+.macro _ITOFT2 gr1, fr1, slot1, gr2, fr2, slot2
+#ifdef __sw_64_fix__
+ ifmovd \gr1, \fr1
+ ifmovd \gr2, \fr2
+#else
+ stl \gr1, \slot1(sp)
+ stl \gr2, \slot2(sp)
+ fldd \fr1, \slot1(sp)
+ fldd \fr2, \slot2(sp)
+#endif
+.endm
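The _ITOFT/_FTOIT macros above move raw bits between the integer and floating-point register files, either with a direct move (ifmovd/fimovd when __sw_64_fix__ is defined) or by bouncing through a stack slot. The union round-trip below is only a C illustration of that bit-for-bit reinterpretation, not code used anywhere in this patch.

#include <stdint.h>

static double
int_bits_to_double (int64_t bits)
{
  union { int64_t i; double d; } u;
  u.i = bits;    /* store the bits as an integer ...             */
  return u.d;    /* ... and reread them as a double, unchanged.  */
}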
diff --git a/sysdeps/sw_64/divl.S b/sysdeps/sw_64/divl.S
new file mode 100644
index 00000000..1192a0aa
--- /dev/null
+++ b/sysdeps/sw_64/divl.S
@@ -0,0 +1,96 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+/* 32-bit signed int divide. This is not a normal C function. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ The FPU can handle all input values except zero. Whee!
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+
+ /* Converted to SW instructions on 2016111216.  */
+
+#ifndef EXTEND
+#define EXTEND(S,D) sextl S, D
+#endif
+
+ .text
+ .align 4
+ .globl __divw
+ .type __divw, @funcnoplt
+ .usepv __divw, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divw:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f2, 16(sp)
+ fstd $f3, 40(sp)
+ fstd $f4, 48(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f2, 16)
+ cfi_rel_offset ($f3, 40)
+ cfi_rel_offset ($f4, 48)
+
+ rfpcr $f2
+ EXTEND (X, RV)
+ EXTEND (Y, AT)
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
+ fcvtld $f0, $f3
+ fcvtld $f1, $f4
+ fdivd $f3, $f4, $f1
+ fcvtdl_z $f1, $f0
+ wfpcr $f2
+ _FTOIT $f0, RV, 24
+
+ fldd $f0, 0(sp)
+ fldd $f1, 8(sp)
+ fldd $f2, 16(sp)
+ fldd $f3, 40(sp)
+ fldd $f4, 48(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_def_cfa_offset (0)
+ sextl RV, RV
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __divw, .-__divw
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/divlu.S b/sysdeps/sw_64/divlu.S
new file mode 100644
index 00000000..26e1842f
--- /dev/null
+++ b/sysdeps/sw_64/divlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D) zapnot S, 15, D
+#define __divw __divwu
+#include <divl.S>
diff --git a/sysdeps/sw_64/divq.S b/sysdeps/sw_64/divq.S
new file mode 100644
index 00000000..61ef58b4
--- /dev/null
+++ b/sysdeps/sw_64/divq.S
@@ -0,0 +1,290 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long divide. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /* Converted to SW instructions on 2016111216.  */
+ .text
+ .align 4
+ .globl __divl
+ .type __divl, @funcnoplt
+ .usepv __divl, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divl:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+ rfpcr $f3
+
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ fldd $f1, 8(sp)
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl_z $f0, $f4
+ excb
+
+ wfpcr $f3
+ _FTOIT $f4, RV, 16
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ cfi_restore ($f1)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ldi sp, FRAME(sp)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t5, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t5, 24)
+
+#define Q RV /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ or X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f4
+
+ _FTOIT $f4, Q, 8
+ .align 3
+$fix_sign_in_ret2:
+ fldd $f0, 0(sp)
+ stl t3, 0(sp)
+ cfi_restore ($f0)
+ cfi_rel_offset (t3, 0)
+
+ mull Q, Y, QY
+ excb
+ stl t4, 8(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+ bne t5, $fix_sign_out
+
+$fix_sign_out_ret:
+ ldl t3, 0(sp)
+ ldl t4, 8(sp)
+ ldl t5, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore (t5)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 records the changes we had to make:
+ bit 0: set if result should be negative.
+ bit 2: set if X was negated.
+ bit 3: set if Y was negated.
+ */
+ xor X, Y, AT
+ cmplt AT, 0, t5
+ cmplt X, 0, AT
+ negl X, t0
+
+ s4addl AT, t5, t5
+ selne AT, t0, X, X
+ cmplt Y, 0, AT
+ negl Y, t0
+
+ s8addl AT, t5, t5
+ selne AT, t0, Y, Y
+ unop
+ blbc t5, $fix_sign_in_ret1
+
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+ .align 4
+$fix_sign_out:
+ /* Now we get to undo what we did above. */
+ /* ??? Is this really faster than just increasing the size of
+ the stack frame and storing X and Y in memory? */
+ and t5, 8, AT
+ negl Y, t4
+ selne AT, t4, Y, Y
+
+ and t5, 4, AT
+ negl X, t4
+ selne AT, t4, X, X
+
+ negl RV, t4
+ sellbs t5, t4, RV, RV
+
+ br $fix_sign_out_ret
+
+ cfi_endproc
+ .size __divl, .-__divl
+
+ DO_DIVBYZERO
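The refinement described in the "theory of operation" comment can be pictured with the C sketch below. It is an illustration only: refine_quotient is a made-up helper, it uses GCC's unsigned __int128 to sidestep overflow, it assumes a nonzero divisor, and it works on unsigned values, which is why the assembly negates the operands in $fix_sign_in first. The assembly achieves the same correction within 64-bit registers using the scaled shift-and-subtract loops above.

#include <stdint.h>

/* Given an FP-derived estimate q0 of x / y, walk it onto the exact
   truncated quotient.  */
static uint64_t
refine_quotient (uint64_t x, uint64_t y, uint64_t q0)
{
  unsigned __int128 q = q0;

  while (q * y > x)             /* estimate too high: step down */
    q--;
  while ((q + 1) * y <= x)      /* estimate too low: step up */
    q++;

  return (uint64_t) q;          /* now q == x / y exactly */
}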
diff --git a/sysdeps/sw_64/divqu.S b/sysdeps/sw_64/divqu.S
new file mode 100644
index 00000000..7b39201e
--- /dev/null
+++ b/sysdeps/sw_64/divqu.S
@@ -0,0 +1,292 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long divide. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
+ clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /* Converted to SW instructions on 2016111216.  */
+ .text
+ .align 4
+ .globl __divlu
+ .type __divlu, @funcnoplt
+ .usepv __divlu, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divlu:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ beq Y, DIVBYZERO
+ fstd $f0, 0(sp)
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ stl t0,32(sp)
+ stl t1,40(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+
+ rfpcr $f3
+ /* Work around an error seen with -mieee for 0xffffffffffffffff/2
+ by clearing FPCR bits 58-59 before the divide.  */
+ rfpcr $f1
+ fimovd $f1,t0
+ ldi t1,3
+ sll t1,58,t1
+ bic t0,t1,t0
+ ifmovd t0,$f1
+ wfpcr $f1
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ blt X, $x_is_neg
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if Y was mis-converted as signed value. */
+ fldd $f1, 8(sp)
+ blt Y, $y_is_neg
+
+ /* Check to see if X fit in the double as an exact value. */
+ srl X, 53, AT
+ bne AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl $f0, $f4
+ wfpcr $f3
+ _FTOIT $f4, RV, 16
+
+ ldl t0,32(sp)
+ ldl t1,40(sp)
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+$x_is_neg:
+ /* If we get here, X is so big that bit 63 is set, which made the
+ conversion come out negative. Fix it up lest we not even get
+ a good estimate. */
+ ldih AT, 0x5f80 /* 2**64 as float. */
+ fstd $f2, 24(sp)
+ fstd $f6, 72(sp)
+ cfi_rel_offset ($f2, 24)
+ cfi_rel_offset ($f6, 72)
+ _ITOFS AT, $f2, 16
+
+ .align 4
+ faddd $f4, $f2, $f6
+ unop
+ fdivd $f6, $f5, $f0
+ unop
+
+ /* Ok, we've now the divide issued. Continue with other checks. */
+ fldd $f1, 8(sp)
+ unop
+ fldd $f2, 24(sp)
+ fldd $f6, 72(sp)
+ blt Y, $y_is_neg
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f6)
+ cfi_remember_state /* for y_is_neg */
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t2, 16(sp)
+ stl t3, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t3, 24)
+
+#define Q RV /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ fcvtdl $f0, $f4
+ _FTOIT $f4, Q, 8
+ mull Q, Y, QY
+
+ .align 4
+ stl t4, 8(sp)
+ excb
+ fldd $f0, 0(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+ cfi_restore ($f0)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t4, 8(sp)
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+
+ ldl t3, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+ cfi_restore_state
+$y_is_neg:
+ /* If we get here, Y is so big that bit 63 is set. The results
+ from the divide will be completely wrong. Fortunately, the
+ quotient must be either 0 or 1, so just compute it directly. */
+ cmpule Y, X, RV
+ excb
+ wfpcr $f3
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldl t0,32(sp)
+ ldl t1,40(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+ cfi_endproc
+ .size __divlu, .-__divlu
+
+ DO_DIVBYZERO
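One detail worth spelling out from the $y_is_neg path above: once the divisor has bit 63 set, the quotient can only be 0 or 1, so it is computed with a single compare instead of trusting the FP estimate. A minimal C sketch (hypothetical helper name; assumes bit 63 of y is set):

#include <stdint.h>

static uint64_t
udiv_top_bit_divisor (uint64_t x, uint64_t y)
{
  /* y >= 2**63, so x / y is 0 or 1; this mirrors "cmpule Y, X, RV".  */
  return x >= y ? 1 : 0;
}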
diff --git a/sysdeps/sw_64/htonl.S b/sysdeps/sw_64/htonl.S
new file mode 100644
index 00000000..7fc0aa24
--- /dev/null
+++ b/sysdeps/sw_64/htonl.S
@@ -0,0 +1,43 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(htonl)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ .set noat
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set at
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ins6b a0, 7, t0 # t0 = 0000000000AABBCC
+ ins1b a0, 3, t1 # t1 = 000000CCDD000000
+ or t1, t0, t1 # t1 = 000000CCDDAABBCC
+ srl t1, 16, t2 # t2 = 0000000000CCDDAA
+ zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00
+ zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA
+ addw t0, t3, v0 # v0 = ssssssssDDCCBBAA
+ ret
+
+ END(htonl)
+
+weak_alias (htonl, ntohl)
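The insert/shift/zapnot sequence above is simply a 32-bit byte swap; in portable C it reduces to the following (htonl_model is an illustrative name, not part of the patch).

#include <stdint.h>

static uint32_t
htonl_model (uint32_t x)
{
  return ((x & 0x000000ffU) << 24)
         | ((x & 0x0000ff00U) << 8)
         | ((x & 0x00ff0000U) >> 8)
         | ((x & 0xff000000U) >> 24);
}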
diff --git a/sysdeps/sw_64/htons.S b/sysdeps/sw_64/htons.S
new file mode 100644
index 00000000..8a981be1
--- /dev/null
+++ b/sysdeps/sw_64/htons.S
@@ -0,0 +1,39 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(htons)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ .set noat
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set at
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ext5b a0, 7, t1 # t1 = bb00
+ ext0b a0, 1, v0 # v0 = 00aa
+ bis v0, t1, v0 # v0 = bbaa
+ ret
+
+ END(htons)
+
+weak_alias (htons, ntohs)
diff --git a/sysdeps/sw_64/ldiv.S b/sysdeps/sw_64/ldiv.S
new file mode 100644
index 00000000..7a77d6dd
--- /dev/null
+++ b/sysdeps/sw_64/ldiv.S
@@ -0,0 +1,222 @@
+
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Richard Henderson <rth@tamu.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include "div_libc.h"
+
+#undef FRAME
+#ifdef __sw_64_fix__
+#define FRAME 0
+#else
+#define FRAME 16
+#endif
+
+#undef X
+#undef Y
+#define X $17
+#define Y $18
+
+ .set noat
+
+ .align 4
+ .globl ldiv
+ .ent ldiv
+ldiv:
+ .frame sp, FRAME, ra
+#if FRAME > 0
+ ldi sp, -FRAME(sp)
+#endif
+#ifdef PROF
+ .set macro
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ beq Y, $divbyzero
+ mov Y,t6
+ nop
+ rfpcr $f10
+
+ _ITOFT2 X, $f0, 0, Y, $f1, 8
+
+ .align 4
+ fcvtld $f0, $f11
+ fcvtld $f1, $f12
+ fdivd $f11, $f12, $f0
+ unop
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl_z $f0, $f11
+ nop
+ wfpcr $f10
+ _FTOIT $f11, $0, 0
+
+$egress:
+// mull $0, Y, $1
+ mull $0, t6, $1
+ subl X, $1, $1
+
+ stl $0, 0($16)
+ stl $1, 8($16)
+ mov $16, $0
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+
+#define Q v0 /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ bis X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f11
+
+ _FTOIT $f11, Q, 8
+$fix_sign_in_ret2:
+ mull Q, Y, QY
+ nop
+ wfpcr $f10
+
+ .align 4
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ negl Q, t4
+ sellbs t5, t4, Q, Q
+ br $egress
+
+ .align 4
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 is true if result should be negative. */
+ xor X, Y, AT
+ cmplt AT, 0, t5
+ cmplt X, 0, AT
+ negl X, t0
+
+ selne AT, t0, X, X
+ cmplt Y, 0, AT
+ negl Y, t0
+
+ selne AT, t0, Y, Y
+ blbc t5, $fix_sign_in_ret1
+
+ fcvtdl_z $f0, $f11
+ _FTOIT $f11, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+$divbyzero:
+ mov a0, v0
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+ stl zero, 0(v0)
+ stl zero, 8(v0)
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .end ldiv
+
+weak_alias (ldiv, lldiv)
+weak_alias (ldiv, imaxdiv)
diff --git a/sysdeps/sw_64/lldiv.S b/sysdeps/sw_64/lldiv.S
new file mode 100644
index 00000000..8a8ef97a
--- /dev/null
+++ b/sysdeps/sw_64/lldiv.S
@@ -0,0 +1 @@
+/* lldiv is the same as ldiv on the Sw_64. */
diff --git a/sysdeps/sw_64/lshift.S b/sysdeps/sw_64/lshift.S
new file mode 100644
index 00000000..700e9d80
--- /dev/null
+++ b/sysdeps/sw_64/lshift.S
@@ -0,0 +1,107 @@
+ # Sw_64 1621 __mpn_lshift -- Shift a limb vector left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldl and stl can be paired with the other used
+ # instructions. But there are many restrictions in the 1621 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $17,8,$17
+ subl $31,$19,$7
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ srl $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subl $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldl $3,-8($17)
+ subl $16,8,$16
+ subl $17,8,$17
+ subl $20,1,$20
+ sll $4,$19,$5
+ srl $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stl $8,0($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldl $3,-8($17)
+ subl $16,32,$16
+ subl $18,4,$18
+ sll $4,$19,$5
+ srl $3,$7,$6
+
+ ldl $4,-16($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stl $8,24($16)
+ srl $4,$7,$2
+
+ ldl $3,-24($17)
+ sll $4,$19,$5
+ bis $1,$2,$8
+ stl $8,16($16)
+ srl $3,$7,$6
+
+ ldl $4,-32($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stl $8,8($16)
+ srl $4,$7,$2
+
+ subl $17,32,$17
+ bis $1,$2,$8
+ stl $8,0($16)
+
+ bgt $18,.Loop
+
+.Lend: sll $4,$19,$8
+ stl $8,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
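A hedged C model of __mpn_lshift for reference (illustrative helper name; assumes 0 < cnt < 64, as the assembly does): it walks from the most significant limb downward, just like the loops above, and returns the bits shifted out of the top.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

static mp_limb_t
ref_mpn_lshift (mp_limb_t *res, const mp_limb_t *src, size_t size,
                unsigned int cnt)
{
  unsigned int tnc = 64 - cnt;
  mp_limb_t high = src[size - 1];
  mp_limb_t retval = high >> tnc;     /* bits pushed out of the number */

  for (size_t i = size - 1; i > 0; i--)
    {
      mp_limb_t low = src[i - 1];
      res[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  res[0] = high << cnt;
  return retval;
}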
diff --git a/sysdeps/sw_64/mul_1.S b/sysdeps/sw_64/mul_1.S
new file mode 100644
index 00000000..127f4274
--- /dev/null
+++ b/sysdeps/sw_64/mul_1.S
@@ -0,0 +1,82 @@
+ # Sw_64 1621 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Sw_64
+ # architecture. 2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR. Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_mul_1
+ .ent __mpn_mul_1 2
+__mpn_mul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ bic $31,$31,$4 # clear cy_limb
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,Lend1 # jump if size was == 1
+ ldl $2,8($17) # $2 = s1_limb
+ subl $18,1,$18 # size--
+ stl $3,0($16)
+ beq $18,Lend2 # jump if size was == 2
+
+ .align 3
+Loop: mull $2,$19,$3 # $3 = prod_low
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,16($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ stl $3,8($16)
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $16,8,$16 # res_ptr++
+ bne $18,Loop
+
+Lend2: mull $2,$19,$3 # $3 = prod_low
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ stl $3,8($16)
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+Lend1: stl $3,0($16)
+ ret $31,($26),1
+
+ .end __mpn_mul_1
diff --git a/sysdeps/sw_64/reml.S b/sysdeps/sw_64/reml.S
new file mode 100644
index 00000000..56a550d9
--- /dev/null
+++ b/sysdeps/sw_64/reml.S
@@ -0,0 +1,93 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@twiddle.net>
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+/* 32-bit signed int remainder. This is not a normal C function. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ The FPU can handle the division for all input values except zero.
+ All we have to do is compute the remainder via multiply-and-subtract.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /*__reml->__remw 20161111*/
+#ifndef EXTEND
+#define EXTEND(S,D) sextl S, D
+#endif
+
+ .text
+ .align 4
+ .globl __remw
+ .type __remw, @funcnoplt
+ .usepv __remw, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__remw:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f2, 16(sp)
+ fstd $f3, 40(sp)
+ fstd $f4, 48(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f2, 16)
+ cfi_rel_offset ($f3, 40)
+ cfi_rel_offset ($f4, 48)
+
+ rfpcr $f2
+ EXTEND (X, RV)
+ EXTEND (Y, AT)
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
+ fcvtld $f0, $f3
+ fcvtld $f1, $f4
+ fdivd $f3, $f4, $f0
+ fcvtdl_z $f0, $f3
+
+ wfpcr $f2
+ _FTOIT $f3, RV, 24
+ fldd $f0, 0(sp)
+ mulw RV, Y, RV
+ fldd $f1, 8(sp)
+ fldd $f2, 16(sp)
+ fldd $f3, 40(sp)
+ fldd $f4, 48(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_def_cfa_offset (0)
+ subw X, RV, RV
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __remw, .-__remw
+
+ DO_DIVBYZERO
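The multiply-and-subtract step mentioned in the header comment corresponds to the C sketch below; remw_model is an illustrative name, and the intermediate int64_t exists only to keep the INT32_MIN / -1 corner well-defined in C.

#include <stdint.h>

static int32_t
remw_model (int32_t x, int32_t y)   /* assumes y != 0 */
{
  int64_t q = (int64_t) ((double) x / (double) y);   /* exact, truncated */
  return (int32_t) ((int64_t) x - q * (int64_t) y);  /* x - q*y, as mulw/subw do */
}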
diff --git a/sysdeps/sw_64/remlu.S b/sysdeps/sw_64/remlu.S
new file mode 100644
index 00000000..3c12f7bf
--- /dev/null
+++ b/sysdeps/sw_64/remlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D) zapnot S, 15, D
+#define __remw __remwu
+#include <reml.S>
diff --git a/sysdeps/sw_64/remq.S b/sysdeps/sw_64/remq.S
new file mode 100644
index 00000000..6db7f628
--- /dev/null
+++ b/sysdeps/sw_64/remq.S
@@ -0,0 +1,274 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long remainder. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ .text
+ .align 4
+ .globl __reml
+ .type __reml, @funcnoplt
+ .usepv __reml, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__reml:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+
+ rfpcr $f3
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ fldd $f1, 8(sp)
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+ fcvtdl_z $f0, $f4
+
+ wfpcr $f3
+ _FTOIT $f4, AT, 16
+ mull AT, Y, AT
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ cfi_restore ($f1)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ldi sp, FRAME(sp)
+ subl X, AT, RV
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t5, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t5, 24)
+
+#define Q t0 /* quotient */
+#define R RV /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ or X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+$fix_sign_in_ret2:
+ fldd $f0, 0(sp)
+ stl t3, 0(sp)
+ cfi_restore ($f0)
+ cfi_rel_offset (t3, 0)
+
+ mull Q, Y, QY
+ stl t4, 8(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+ bne t5, $fix_sign_out
+
+$fix_sign_out_ret:
+ ldl t3, 0(sp)
+ ldl t4, 8(sp)
+ ldl t5, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore (t5)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide the current
+ remainder (R) by Y and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 records the changes we had to make:
+ bit 0: set if X was negated. Note that the sign of the
+ remainder follows the sign of the dividend.
+ bit 2: set if Y was negated.
+ */
+ xor X, Y, t1
+ cmplt X, 0, t5
+ negl X, t0
+ selne t5, t0, X, X
+
+ cmplt Y, 0, AT
+ negl Y, t0
+ s4addl AT, t5, t5
+ selne AT, t0, Y, Y
+
+ bge t1, $fix_sign_in_ret1
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+ .align 4
+$fix_sign_out:
+ /* Now we get to undo what we did above. */
+ /* ??? Is this really faster than just increasing the size of
+ the stack frame and storing X and Y in memory? */
+ and t5, 4, AT
+ negl Y, t4
+ selne AT, t4, Y, Y
+
+ negl X, t4
+ sellbs t5, t4, X, X
+ negl RV, t4
+ sellbs t5, t4, RV, RV
+
+ br $fix_sign_out_ret
+
+ cfi_endproc
+ .size __reml, .-__reml
+
+ DO_DIVBYZERO
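For readers who do not want to trace the fixup path in Sw64 assembly, here is a rough C model of the scheme the theory-of-operation comment describes: take a double-precision quotient estimate, then correct it in the integer domain. This is a sketch only; the names reml_model and fixup_uns are invented for this note, the unit-step correction loop stands in for the assembly's logarithmic shift-and-subtract fixup, and y == 0 is assumed to have been rejected already (the DIVBYZERO path).

    #include <stdint.h>

    /* Walk an estimated quotient q of x / y (unsigned) to the exact value
       and return the remainder in [0, y).  The assembly does the same
       correction with a shift-and-subtract loop instead of unit steps.  */
    static uint64_t
    fixup_uns (uint64_t x, uint64_t y, uint64_t q)
    {
      while (q * y > x)        /* estimate was too high */
        q--;
      while (x - q * y >= y)   /* estimate was too low */
        q++;
      return x - q * y;
    }

    /* Signed remainder: estimate and fix up on magnitudes, then give the
       result the sign of the dividend, as C's % operator does.  */
    static int64_t
    reml_model (int64_t x, int64_t y)
    {
      uint64_t ux = x < 0 ? -(uint64_t) x : (uint64_t) x;
      uint64_t uy = y < 0 ? -(uint64_t) y : (uint64_t) y;
      uint64_t q = (uint64_t) ((double) ux / (double) uy);  /* FP estimate */
      uint64_t r = fixup_uns (ux, uy, q);
      return x < 0 ? -(int64_t) r : (int64_t) r;
    }

When |x| < 2**53 the double estimate is already exact and the fixup loops do nothing, which is the fast path the assembly takes after the sll/sra range check.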
diff --git a/sysdeps/sw_64/remqu.S b/sysdeps/sw_64/remqu.S
new file mode 100644
index 00000000..946e031b
--- /dev/null
+++ b/sysdeps/sw_64/remqu.S
@@ -0,0 +1,292 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long remainder. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
+ clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ .text
+ .align 4
+ .globl __remlu
+ .type __remlu, @funcnoplt
+ .usepv __remlu, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__remlu:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ subl Y, 1, AT
+ and Y, AT, AT
+ beq AT, $powerof2
+ fstd $f0, 0(sp)
+
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+
+ rfpcr $f3
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+
+ blt X, $x_is_neg
+ setfpec1
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if Y was mis-converted as signed value. */
+ fldd $f1, 8(sp)
+ blt Y, $y_is_neg
+
+ /* Check to see if X fit in the double as an exact value. */
+ srl X, 53, AT
+ bne AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert, compute remainder, clean up. */
+ fcvtdl_z $f0, $f4
+ wfpcr $f3
+ _FTOIT $f4, AT, 16
+ mull AT, Y, AT
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+
+ .align 4
+ subl X, AT, RV
+ ret $31, (RA), 1
+ .align 4
+ cfi_restore_state
+$x_is_neg:
+ /* If we get here, X is so big that bit 63 is set, which made the
+ conversion come out negative. Fix it up lest we not even get
+ a good estimate. */
+ ldih AT, 0x5f80 /* 2**64 as float. */
+ fstd $f2, 24(sp)
+ fstd $f6, 72(sp)
+ cfi_rel_offset ($f2, 24)
+ cfi_rel_offset ($f6, 72)
+ _ITOFS AT, $f2, 16
+ .align 4
+ faddd $f4, $f2, $f6
+ fdivd $f6, $f5, $f0
+
+ /* Ok, we've now got the divide issued. Continue with other checks. */
+# .align 4
+ fldd $f1, 8(sp)
+ unop
+ fldd $f2, 24(sp)
+ fldd $f6, 72(sp)
+ blt Y, $y_is_neg
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f6)
+ cfi_remember_state /* for y_is_neg */
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t3, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t3, 24)
+
+#define Q t0 /* quotient */
+#define R RV /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ mull Q, Y, QY
+
+ .align 4
+ stl t4, 8(sp)
+ excb
+ fldd $f0, 0(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+ cfi_restore ($f0)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t4, 8(sp)
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+
+ ldl t3, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide the current
+ remainder (R) by Y and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+ cfi_restore_state
+$y_is_neg:
+ /* If we get here, Y is so big that bit 63 is set. The results
+ from the divide will be completely wrong. Fortunately, the
+ quotient must be either 0 or 1, so the remainder must be X
+ or X-Y, so just compute it directly. */
+ cmpule Y, X, AT
+ nop
+ wfpcr $f3
+ subl X, Y, RV
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ seleq AT, X, RV, RV
+
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+ .align 4
+ cfi_def_cfa_offset (FRAME)
+$powerof2:
+ subl Y, 1, AT
+ beq Y, DIVBYZERO
+ and X, AT, RV
+ ldi sp, FRAME(sp)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __remlu, .-__remlu
+
+ DO_DIVBYZERO
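Two special cases in __remlu above are worth restating in plain C: the $powerof2 exit, which never touches the FP unit, and the $y_is_neg path, where bit 63 of the divisor is set so the quotient can only be 0 or 1. The sketch below is illustrative only (remlu_shortcuts is an invented name, and the branch ordering here does not mirror the assembly, which checks for a power of two before issuing the divide and handles the bit-63 case afterwards); y == 0 is assumed to take the DIVBYZERO path in the real code.

    #include <stdint.h>

    /* Shortcut structure of the unsigned remainder, for illustration.  */
    static uint64_t
    remlu_shortcuts (uint64_t x, uint64_t y)
    {
      if ((y & (y - 1)) == 0)     /* y is a power of two (y == 0 traps in the
                                     real code before reaching this point) */
        return x & (y - 1);
      if (y >> 63)                /* bit 63 set: quotient is 0 or 1 */
        return y <= x ? x - y : x;
      return x % y;               /* general case: FP estimate plus fixup */
    }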
diff --git a/sysdeps/sw_64/rshift.S b/sysdeps/sw_64/rshift.S
new file mode 100644
index 00000000..81b3d742
--- /dev/null
+++ b/sysdeps/sw_64/rshift.S
@@ -0,0 +1,105 @@
+ # Sw_64 1621 __mpn_rshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldl and stl can be paired with the other used
+ # instructions. But there are many restrictions in the 1621 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ addl $17,8,$17
+ subl $31,$19,$7
+ subl $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ sll $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subl $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldl $3,0($17)
+ addl $16,8,$16
+ addl $17,8,$17
+ subl $20,1,$20
+ srl $4,$19,$5
+ sll $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stl $8,-8($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldl $3,0($17)
+ addl $16,32,$16
+ subl $18,4,$18
+ srl $4,$19,$5
+ sll $3,$7,$6
+
+ ldl $4,8($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stl $8,-32($16)
+ sll $4,$7,$2
+
+ ldl $3,16($17)
+ srl $4,$19,$5
+ bis $1,$2,$8
+ stl $8,-24($16)
+ sll $3,$7,$6
+
+ ldl $4,24($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stl $8,-16($16)
+ sll $4,$7,$2
+
+ addl $17,32,$17
+ bis $1,$2,$8
+ stl $8,-8($16)
+
+ bgt $18,.Loop
+
+.Lend: srl $4,$19,$8
+ stl $8,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
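As a compact restatement of what __mpn_rshift computes, here is a rough C equivalent. It is a sketch, not part of the patch: the name mpn_rshift_ref and the mp_limb_t typedef are illustrative, 64-bit limbs and 0 < cnt < 64 are assumed, and no attempt is made at the software pipelining the assembly performs.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Shift the size-limb number at s1_ptr right by cnt bits, store it at
       res_ptr, and return the bits shifted out of the low limb,
       left-justified (the "function result" computed from the first limb
       in the prologue above).  */
    static mp_limb_t
    mpn_rshift_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                    long size, unsigned cnt)
    {
      mp_limb_t retval = s1_ptr[0] << (64 - cnt);
      for (long i = 0; i < size - 1; i++)
        res_ptr[i] = (s1_ptr[i] >> cnt) | (s1_ptr[i + 1] << (64 - cnt));
      res_ptr[size - 1] = s1_ptr[size - 1] >> cnt;
      return retval;
    }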
diff --git a/sysdeps/sw_64/sub_n.S b/sysdeps/sw_64/sub_n.S
new file mode 100644
index 00000000..d0d5a30c
--- /dev/null
+++ b/sysdeps/sw_64/sub_n.S
@@ -0,0 +1,118 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ ldl $3,0($17)
+ ldl $4,0($18)
+
+ subl $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
+
+ subl $19,$2,$19
+
+.Loop0: subl $2,1,$2
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ addl $17,8,$17
+ addl $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addl $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subl $19,4,$19
+
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ ldl $3,16($17)
+ addl $6,$0,$6
+ ldl $4,16($18)
+ cmpult $6,$0,$1
+ subl $5,$6,$6
+ cmpult $5,$6,$0
+ stl $6,8($16)
+ or $0,$1,$0
+
+ ldl $5,24($17)
+ addl $4,$0,$4
+ ldl $6,24($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,16($16)
+ or $0,$1,$0
+
+ ldl $3,32($17)
+ addl $6,$0,$6
+ ldl $4,32($18)
+ cmpult $6,$0,$1
+ subl $5,$6,$6
+ cmpult $5,$6,$0
+ stl $6,24($16)
+ or $0,$1,$0
+
+ addl $17,32,$17
+ addl $18,32,$18
+ addl $16,32,$16
+ bne $19,.Loop
+
+.Lend: addl $4,$0,$4
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_sub_n
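The borrow propagation in __mpn_sub_n is easier to follow against a plain C reference. The sketch below is illustrative only (mpn_sub_n_ref and the mp_limb_t typedef are names chosen for this note, assuming 64-bit limbs); the assembly computes the same thing four limbs per iteration.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* {res_ptr, size} = {s1_ptr, size} - {s2_ptr, size}; return the
       borrow (0 or 1) out of the most significant limb.  */
    static mp_limb_t
    mpn_sub_n_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                   const mp_limb_t *s2_ptr, long size)
    {
      mp_limb_t borrow = 0;
      for (long i = 0; i < size; i++)
        {
          mp_limb_t a = s1_ptr[i], b = s2_ptr[i];
          mp_limb_t d = a - b;
          mp_limb_t out = d - borrow;
          /* Borrow out if a < b, or if d was zero and a borrow was pending.  */
          borrow = (a < b) | (d < borrow);
          res_ptr[i] = out;
        }
      return borrow;
    }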
diff --git a/sysdeps/sw_64/submul_1.S b/sysdeps/sw_64/submul_1.S
new file mode 100644
index 00000000..2cad2bef
--- /dev/null
+++ b/sysdeps/sw_64/submul_1.S
@@ -0,0 +1,89 @@
+ # Sw_64 1621 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_submul_1
+ .ent __mpn_submul_1 2
+__mpn_submul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ subl $5,$3,$3
+ cmpult $5,$3,$4
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ addl $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $5,$0,$0 # combine carries
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_submul_1
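For reference, the operation __mpn_submul_1 performs can be written compactly in C. This is a sketch under stated assumptions, not the implementation: the name mpn_submul_1_ref and the mp_limb_t typedef are invented for this note, limbs are assumed to be 64 bits, and unsigned __int128 assumes a GCC/Clang-style compiler (it stands in for the mull/umulh instruction pair).

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Subtract {s1_ptr, size} * s2_limb from {res_ptr, size}; return the
       high limb of the product plus the propagated borrow, as the
       assembly returns in $0.  */
    static mp_limb_t
    mpn_submul_1_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                      long size, mp_limb_t s2_limb)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          unsigned __int128 p = (unsigned __int128) s1_ptr[i] * s2_limb;
          mp_limb_t lo = (mp_limb_t) p + cy;                 /* low product + carry in */
          cy = (mp_limb_t) (p >> 64) + (lo < (mp_limb_t) p); /* high product + carry out */
          mp_limb_t r = res_ptr[i];
          res_ptr[i] = r - lo;
          cy += r < lo;                                      /* borrow from the subtract */
        }
      return cy;
    }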
diff --git a/sysdeps/sw_64/sw6a/add_n.S b/sysdeps/sw_64/sw6a/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two fadds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two fadds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
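The carry handling in this pipelined __mpn_add_n reduces to the usual limb-wise add with carry. A minimal C sketch follows, assuming 64-bit limbs; mpn_add_n_ref and the mp_limb_t typedef are names invented for this note.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* {res_ptr, size} = {s1_ptr, size} + {s2_ptr, size}; return the carry
       (0 or 1) out of the most significant limb, the value the assembly
       leaves in $0 at .Lret.  */
    static mp_limb_t
    mpn_add_n_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                   const mp_limb_t *s2_ptr, long size)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          mp_limb_t a = s1_ptr[i], b = s2_ptr[i];
          mp_limb_t s = a + b;
          mp_limb_t out = s + cy;
          /* Carry out if a + b wrapped, or if adding the old carry wrapped.  */
          cy = (s < a) | (out < s);
          res_ptr[i] = out;
        }
      return cy;
    }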
diff --git a/sysdeps/sw_64/sw6a/addmul_1.S b/sysdeps/sw_64/sw6a/addmul_1.S
new file mode 100644
index 00000000..287e8573
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with pipeline expert
+ # . Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. If first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
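Stripped of the unrolling and dual-issue scheduling discussed in the header comment, __mpn_addmul_1 computes the following. The sketch is illustrative only: mpn_addmul_1_ref and the mp_limb_t typedef are names invented for this note, 64-bit limbs are assumed, and unsigned __int128 (a GCC/Clang extension) stands in for the mull/umulh pair; the assembly produces the same result eight limbs per iteration of its unrolled loop.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Add {s1_ptr, size} * s2_limb to {res_ptr, size}; return the carry
       out of the most significant limb.  */
    static mp_limb_t
    mpn_addmul_1_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                      long size, mp_limb_t s2_limb)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          unsigned __int128 p = (unsigned __int128) s1_ptr[i] * s2_limb
                                + res_ptr[i] + cy;   /* cannot overflow 128 bits */
          res_ptr[i] = (mp_limb_t) p;
          cy = (mp_limb_t) (p >> 64);
        }
      return cy;
    }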
diff --git a/sysdeps/sw_64/sw6a/lshift.S b/sysdeps/sw_64/sw6a/lshift.S
new file mode 100644
index 00000000..cc00593c
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
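As with rshift, a short C reference makes the intent of __mpn_lshift clear. This is a sketch only (mpn_lshift_ref and the mp_limb_t typedef are invented names, assuming 64-bit limbs and 0 < cnt < 64); it also explains why the entry code above points r16/r17 at the end of the vectors, since the limbs are processed from the most significant end downwards.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Shift the size-limb number at s1_ptr left by cnt bits, store it at
       res_ptr, and return the bits shifted out of the high limb,
       right-justified.  */
    static mp_limb_t
    mpn_lshift_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                    long size, unsigned cnt)
    {
      mp_limb_t retval = s1_ptr[size - 1] >> (64 - cnt);
      for (long i = size - 1; i > 0; i--)
        res_ptr[i] = (s1_ptr[i] << cnt) | (s1_ptr[i - 1] >> (64 - cnt));
      res_ptr[0] = s1_ptr[0] << cnt;
      return retval;
    }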
diff --git a/sysdeps/sw_64/sw6a/rshift.S b/sysdeps/sw_64/sw6a/rshift.S
new file mode 100644
index 00000000..416c3903
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw6a/sub_n.S b/sysdeps/sw_64/sw6a/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two fadds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two fadds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
diff --git a/sysdeps/sw_64/sw6b/add_n.S b/sysdeps/sw_64/sw6b/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two fadds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two fadds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
diff --git a/sysdeps/sw_64/sw6b/addmul_1.S b/sysdeps/sw_64/sw6b/addmul_1.S
new file mode 100644
index 00000000..a288f040
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with pipeline expert
+ # . Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. If first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
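
Stripped of the pipelining, each limb processed above goes through the same arithmetic: mull/umulh form the full 128-bit product, the low half absorbs the incoming carry and is added into the accumulator limb, and the carry out of each of those additions is folded into the high half, which becomes the next carry. A minimal C sketch of that computation (64-bit limbs; unsigned __int128 stands in for the mull/umulh pair; names are illustrative):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* res[i] += s1[i] * s2_limb for each limb; returns the final carry limb,
   which the assembly returns in $0.  */
mp_limb_t
ref_addmul_1 (mp_limb_t *res, const mp_limb_t *s1, size_t n, mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) s1[i] * s2_limb;
      mp_limb_t lo = (mp_limb_t) p;          /* mull  */
      mp_limb_t hi = (mp_limb_t) (p >> 64);  /* umulh */

      mp_limb_t t = lo + cy;                 /* add carry into low product */
      hi += t < lo;                          /* carry out of that add */
      mp_limb_t r = res[i] + t;              /* add into the accumulator */
      hi += r < t;                           /* carry out of that add */

      res[i] = r;
      cy = hi;                               /* next iteration's carry */
    }
  return cy;
}
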
diff --git a/sysdeps/sw_64/sw6b/lshift.S b/sysdeps/sw_64/sw6b/lshift.S
new file mode 100644
index 00000000..cc00593c
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a limb vector left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
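
Ignoring the warm-up and cool-down phases, the routine above walks the limb vector from its most significant end: each result limb combines a limb shifted left by cnt with the next lower limb shifted right by 64-cnt, and the bits pushed out of the top limb are the return value. A C sketch of that behaviour (64-bit limbs and 0 < cnt < 64 assumed; names are illustrative):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Shift {s, n} left by cnt bits into res; return the bits shifted out of
   the most significant limb (computed into $0 by the assembly).  */
mp_limb_t
ref_lshift (mp_limb_t *res, const mp_limb_t *s, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;                  /* the $20 = 64 - cnt value */
  mp_limb_t retval = s[n - 1] >> tnc;       /* bits leaving the top limb */

  for (size_t i = n - 1; i > 0; i--)        /* walk from the top down */
    res[i] = (s[i] << cnt) | (s[i - 1] >> tnc);
  res[0] = s[0] << cnt;

  return retval;
}
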
diff --git a/sysdeps/sw_64/sw6b/memcpy.S b/sysdeps/sw_64/sw6b/memcpy.S
new file mode 100644
index 00000000..938ebdfc
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/memcpy.S
@@ -0,0 +1,416 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ sw6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/*
+ * Much of the information about 21264 scheduling/coding comes from:
+ * Compiler Writer's Guide for the Sw_64 21264
+ * abbreviated as 'CWG' in other comments here
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ * E - either cluster
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ * $0 - destination address
+ * $1,$2, - scratch
+ */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+
+	.type $jumppointh,@object
+$jumppointh:
+ .gprel32 $both_0mod8
+ .gprel32 J$H01
+ .gprel32 J$H02
+ .gprel32 J$H03
+ .gprel32 J$H04
+ .gprel32 J$H05
+ .gprel32 J$H06
+ .gprel32 J$H07
+
+ENTRY(memcpy)
+ .prologue 1
+ ldgp $29, 0($27)
+ mov $16, $0 # E : copy dest to return
+ ble $18, $nomoredata # U : done with the copy?
+ cmplt $18, 8, $1
+ bne $1, $less_8
+ xor $16, $17, $1 # E : are source and dest alignments the same?
+ and $1, 7, $1 # E : are they the same mod 8?
+
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
+ /* source and dest are same mod 8 address */
+ and $16, 7, $1 # E : Are both 0mod8?
+ beq $1, $both_0mod8 # U : Yes
+ nop # E :
+
+ /*
+ * source and dest are same misalignment. move a byte at a time
+ * until a 0mod8 alignment for both is reached.
+ * At least one byte more to move
+ */
+
+ ldi $2, 8
+ subl $2, $1, $1
+
+$head_align:
+ addl $16, $1, $16
+ addl $17, $1, $17
+ subl $18, $1, $18
+ ldih $2, $jumppointh($29) !gprelhigh
+ s4addl $1, $2, $2
+ ldw $2, $jumppointh($2) !gprellow
+ addl $2, $29, $2
+ jmp ($2)
+
+$both_0mod8:
+ cmple $18, 127, $1 # E : Can we unroll the loop?
+ bne $1, $no_unroll # U :
+ and $16, 63, $1 # E : get mod64 alignment
+ beq $1, $do_unroll # U : no single quads to fiddle
+
+$single_head_quad:
+ ldl $1, 0($17) # L : get 8 bytes
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stl $1, 0($16) # L : store
+ addl $16, 8, $16 # E : dest += 8
+ and $16, 63, $1 # E : get mod64 alignment
+ bne $1, $single_head_quad # U : still not fully aligned
+
+$do_unroll:
+	ldih $1, 8($31)	# bigger than 512 KB
+ cmple $18, $1, $1
+ beq $1, $unroll_body_512
+ nop
+ nop
+ cmple $18, 63, $1 # E : Can we go through the unrolled loop?
+ bne $1, $tail_quads # U : Nope
+ nop # E :
+
+$unroll_body:
+ ldl $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldl $4, 8($17) # L : bytes 8..15
+ ldl $5, 16($17) # L : bytes 16..23
+ nop # E :
+ nop # E :
+
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addl $17, 32, $17 # E : src += 32 bytes
+ stl $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stl $4, 8($16) # L : bytes 8..15
+ stl $5, 16($16) # L : bytes 16..23
+ subl $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stl $3, 24($16) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldl $6, 0($17) # L : bytes 0..7
+ ldl $4, 8($17) # L : bytes 8..15
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+ nop # E :
+
+ ldl $5, 16($17) # L : bytes 16..23
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32
+ subl $18, 64, $18 # E : count -= 64
+
+ addl $17, 32, $17 # E : src += 32
+ stl $6, -32($16) # L : bytes 0..7
+ stl $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stl $5, -16($16) # L : bytes 16..23
+ stl $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body
+ nop
+ nop
+ nop
+ br $tail_quads
+
+$unroll_body_512:
+ fillcs 128*4($17)
+ e_fillcs 128*20($17)
+
+	fillcs 128*3($16)	# added by ZJ 2022-06-20: stl_nc -> stl
+ e_fillcs 128*7($16)
+
+ ldl $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldl $4, 8($17) # L : bytes 8..15
+ ldl $5, 16($17) # L : bytes 16..23
+ nop # E :
+ nop # E :
+
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addl $17, 32, $17 # E : src += 32 bytes
+ stl $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stl $4, 8($16) # L : bytes 8..15
+ stl $5, 16($16) # L : bytes 16..23
+ subl $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stl $3, 24($16) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldl $6, 0($17) # L : bytes 0..7
+ ldl $4, 8($17) # L : bytes 8..15
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+ nop # E :
+
+ ldl $5, 16($17) # L : bytes 16..23
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32
+ subl $18, 64, $18 # E : count -= 64
+
+ addl $17, 32, $17 # E : src += 32
+ stl $6, -32($16) # L : bytes 0..7
+ stl $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stl $5, -16($16) # L : bytes 16..23
+ stl $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body_512
+
+$tail_quads:
+$no_unroll:
+ .align 4
+ subl $18, 8, $18 # E : At least a quad left?
+ blt $18, $less_than_8 # U : Nope
+ nop # E :
+ nop # E :
+
+$move_a_quad:
+ ldl $1, 0($17) # L : fetch 8
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stl $1, 0($16) # L : store 8
+ addl $16, 8, $16 # E : dest += 8
+ bge $18, $move_a_quad # U :
+ nop # E :
+
+$less_than_8:
+ .align 4
+ addl $18, 8, $18 # E : add back for trailing bytes
+ ble $18, $nomoredata # U : All-done
+ nop # E :
+ nop # E :
+
+ /* Trailing bytes */
+$tail_bytes:
+ subl $18, 1, $18 # E : count--
+ ldbu $1, 0($17) # L : fetch a byte
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store a byte
+ addl $16, 1, $16 # E : dest++
+ bgt $18, $tail_bytes # U : more to be done?
+ nop # E :
+
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+$misaligned:
+ mov $0, $4 # E : dest temp
+ and $0, 7, $1 # E : dest alignment mod8
+	beq $1, $dest_0mod8	# U : life doesn't totally suck
+ nop
+
+$aligndest:
+ ble $18, $nomoredata # U :
+ ldbu $1, 0($17) # L : fetch a byte
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+
+ stb $1, 0($4) # L : store it
+ addl $4, 1, $4 # E : dest++
+ and $4, 7, $1 # E : dest 0mod8 yet?
+ bne $1, $aligndest # U : go until we are aligned.
+
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+ subl $18, 8, $18 # E : At least a quad left?
+ blt $18, $misalign_tail # U : Nope
+ ldl_u $3, 0($17) # L : seed (rotating load) of 8 bytes
+ ldih $1, 8($31)
+ subl $1, 8, $1
+ cmple $18, $1, $1
+	beq $1, $mis_quad_big	# bigger than 512 KB
+
+$mis_quad:
+ ldl_u $16, 8($17) # L : Fetch next 8
+ ext3b $3, $17, $3 # U : masking
+ ext7b $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ stl $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addl $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad # U : More quads to move
+ nop
+ nop
+ nop
+ br $misalign_tail
+
+$mis_quad_big:
+ fillcs 128*4($17)
+ e_fillcs 128*20($17)
+ ldl_u $16, 8($17) # L : Fetch next 8
+ ext3b $3, $17, $3 # U : masking
+ ext7b $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+	fillcs 128*9($17)	# added by ZJ 2022-06-20: stl_nc -> stl
+ e_fillcs 128*15($17)
+
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ stl $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addl $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad_big # U : More quads to move
+ nop
+ nop
+
+$misalign_tail:
+ addl $18, 8, $18 # E : account for tail stuff
+ ble $18, $nomoredata # U :
+ nop
+ nop
+
+$misalign_byte:
+ ldbu $1, 0($17) # L : fetch 1
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($4) # L : store
+ addl $4, 1, $4 # E : dest++
+ bgt $18, $misalign_byte # U : more to go?
+ nop
+ br $nomoredata
+
+$less_8:
+ ldbu $1, 0($17) # L : fetch 1
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store
+ addl $16, 1, $16 # E : dest++
+ bgt $18, $less_8 # U : more to go?
+ nop
+
+$nomoredata:
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+J$H01:
+ ldbu $1,-1($17)
+ stb $1,-1($16)
+ br $both_0mod8
+
+J$H02:
+ ldh $1,-2($17)
+ sth $1,-2($16)
+ br $both_0mod8
+
+J$H03:
+ ldh $1,-2($17)
+ ldbu $2,-3($17)
+ sth $1,-2($16)
+ stb $2,-3($16)
+ br $both_0mod8
+
+J$H04:
+ ldw $1,-4($17)
+ stw $1,-4($16)
+ br $both_0mod8
+
+J$H05:
+ ldw $1,-4($17)
+ ldbu $2,-5($17)
+ stw $1,-4($16)
+ stb $2,-5($16)
+ br $both_0mod8
+
+J$H06:
+ ldw $1,-4($17)
+ ldh $2,-6($17)
+ stw $1,-4($16)
+ sth $2,-6($16)
+ br $both_0mod8
+
+J$H07:
+ ldw $1,-4($17)
+ ldh $2,-6($17)
+ ldbu $3,-7($17)
+ stw $1,-4($16)
+ sth $2,-6($16)
+ stb $3,-7($16)
+ br $both_0mod8
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
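
For the common co-aligned case, the structure above reduces to: byte copies until both pointers reach an 8-byte boundary, aligned 8-byte copies (unrolled 64 bytes per trip, with fillcs/e_fillcs prefetches once the remaining count exceeds 512 KB), and finally the trailing bytes; the misaligned path instead merges two shifted ldl_u loads per aligned store. A simplified C sketch of the co-aligned path only, with no unrolling or prefetching (illustrative names, not the glibc implementation):

#include <stddef.h>
#include <stdint.h>

void *
ref_memcpy_coaligned (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  /* Head: byte copies until the destination is 8-byte aligned (the
     co-aligned precondition means the source becomes aligned too).  */
  while (n > 0 && ((uintptr_t) d & 7) != 0)
    { *d++ = *s++; n--; }

  /* Body: one aligned 64-bit word per iteration (ldl/stl).  */
  uint64_t *dq = (uint64_t *) d;
  const uint64_t *sq = (const uint64_t *) s;
  while (n >= 8)
    { *dq++ = *sq++; n -= 8; }
  d = (unsigned char *) dq;
  s = (const unsigned char *) sq;

  /* Tail: the remaining 0..7 bytes.  */
  while (n-- > 0)
    *d++ = *s++;

  return dst;
}
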
diff --git a/sysdeps/sw_64/sw6b/memset.S b/sysdeps/sw_64/sw6b/memset.S
new file mode 100644
index 00000000..0085ac70
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/memset.S
@@ -0,0 +1,312 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+ .arch sw6b
+ .set noat
+ .set noreorder
+
+ENTRY(memset)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ /*
+ * Serious stalling happens. The only way to mitigate this is to
+ * undertake a major re-write to interleave the constant materialization
+ * with other parts of the fall-through code. This is important, even
+ * though it makes maintenance tougher.
+ * Do this later.
+ */
+ and $17, 255, $1 # E : 00000000000000ch
+ ins0b $17, 1, $2 # U : 000000000000ch00
+ mov $16, $0 # E : return value
+ mov $17, $8 # E : Save the ch
+ ble $18, $end # U : zero length requested?
+
+ addl $18, $16, $6 # E : max address to write to
+ or $1, $2, $17 # E : 000000000000chch
+ ins0b $1, 2, $3 # U : 0000000000ch0000
+ ins0b $1, 3, $4 # U : 00000000ch000000
+
+ or $3, $4, $3 # E : 00000000chch0000
+ ins1b $17, 4, $5 # U : 0000chch00000000
+ xor $16, $6, $1 # E : will complete write be within one quadword?
+ ins1b $17, 6, $2 # U : chch000000000000
+
+ or $17, $3, $17 # E : 00000000chchchch
+ or $2, $5, $2 # E : chchchch00000000
+ bic $1, 7, $1 # E : fit within a single quadword?
+ and $16, 7, $3 # E : Target addr misalignment
+
+ or $17, $2, $17 # E : chchchchchchchch
+ beq $1, $within_quad # U :
+ nop # E :
+ beq $3, $aligned # U : target is 0mod8
+
+ /*
+ * Target address is misaligned, and won't fit within a quadword.
+ */
+
+#ifdef pixman_error
+	/* If the addr is unaligned in a multi-threaded program, this would cause
+	   thread unsafety, so use stb to store the trailing bytes.  */
+ ldl_u $4, 0($16) # L : Fetch first partial
+ mov $16, $5 # E : Save the address
+ ins3b $17, $16, $2 # U : Insert new bytes
+ subl $3, 8, $3 # E : Invert (for addressing uses)
+
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
+ subl $16, $3, $16 # E : $16 is new aligned destination
+ or $2, $4, $1 # E : Final bytes
+
+ nop
+ stl_u $1,0($5) # L : Store result
+ nop
+ nop
+#else
+$misaligned:
+ stb $8, 0($16)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $16, 1, $16
+ and $16, 7, $3 # E : Target addr misalignment
+ bne $3, $misaligned
+#endif
+
+ .align 4
+$aligned:
+ /*
+ * We are now guaranteed to be quad aligned, with at least
+ * one partial quad to write.
+ */
+
+ sra $18, 3, $3 # U : Number of remaining quads to write
+ and $18, 7, $18 # E : Number of trailing bytes to write
+ mov $16, $5 # E : Save dest address
+ beq $3, $no_quad # U : tail stuff only
+
+ /*
+ * It's worth the effort to unroll this and use wh64 if possible.
+ * At this point, entry values are:
+ * $16 Current destination address
+ * $5 A copy of $16
+ * $6 The max quadword address to write to
+ * $18 Number trailer bytes
+ * $3 Number quads to write
+ */
+# and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
+ and $16, 0x1f, $2 # E : Forward work (only useful for unrolled loop)
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
+ blt $4, $loop # U :
+
+ /*
+ * We know we've got at least 16 quads, minimum of one trip
+ * through unrolled loop. Do a quad at a time to get us 0mod64
+ * aligned.
+ */
+
+ nop # E :
+ nop # E :
+ nop # E :
+# beq $1, $bigalign # U :
+ beq $2, $bigalign # U :
+$alignmod32:
+ stl $17, 0($5) # L :
+ subl $3, 1, $3 # E : For consistency later
+ addl $1, 8, $1 # E : Increment towards zero for alignment
+# addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
+
+ nop
+ nop
+ addl $5, 8, $5 # E : Inc address
+ blt $1, $alignmod32 # U :
+
+
+$bigalign:
+	ldih $1, 8($31)	# bigger than 512 KB
+ cmple $18, $1, $1
+ beq $1, $do_wh64_512
+
+ /*
+ * $3 - number quads left to go
+ * $5 - target address (aligned 0mod64)
+ * $17 - mask of stuff to store
+ * Scratch registers available: $7, $2, $4, $1
+ * We know that we'll be taking a minimum of one trip through.
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
+ * Assumes the wh64 needs to be for 2 trips through the loop in the
+ * future. The wh64 is issued for the starting destination address for
+ * trip +2 through the loop, and if there are fewer than two trips left,
+ * the target address will be for the current trip. */
+
+$do_wh64:
+# wh64 ($4) # L1 : memory subsystem write hint
+ subl $3, 24, $2 # E : For determining future wh64 addresses
+ stl $17, 0($5) # L :
+ nop # E :
+
+# addl $5, 128, $4 # E : speculative target of next wh64
+ stl $17, 8($5) # L :
+ stl $17, 16($5) # L :
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+ stl $17, 24($5) # L :
+ stl $17, 32($5) # L :
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+ stl $17, 40($5) # L :
+ stl $17, 48($5) # L :
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+ stl $17, 56($5) # L :
+ addl $5, 64, $5 # E :
+ subl $3, 8, $3 # E :
+ bge $2, $do_wh64 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ nop
+ nop
+ nop
+ br $loop # U : Might have finished already
+
+$do_wh64_512:
+# wh64 ($4) # L1 : memory subsystem write hint
+ subl $3, 24, $2 # E : For determining future wh64 addresses
+
+ fillcs 128*1($5)
+ e_fillcs 128*5($5)
+
+# stl_nc $17, 0($5) # L :
+ stl $17, 0($5) # L :
+ nop # E :
+
+# addl $5, 128, $4 # E : speculative target of next wh64
+# stl_nc $17, 8($5) # L :
+ stl $17, 8($5) # L :
+# stl_nc $17, 16($5) # L :
+ stl $17, 16($5) # L :
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+# stl_nc $17, 24($5) # L :
+ stl $17, 24($5) # L :
+# stl_nc $17, 32($5) # L :
+ stl $17, 32($5) # L :
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+# stl_nc $17, 40($5) # L :
+ stl $17, 40($5) # L :
+# stl_nc $17, 48($5) # L :
+ stl $17, 48($5) # L :
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+# stl_nc $17, 56($5) # L :
+ stl $17, 56($5) # L :
+ addl $5, 64, $5 # E :
+ subl $3, 8, $3 # E :
+ bge $2, $do_wh64_512 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ .align 4
+ /*
+ * Simple loop for trailing quadwords, or for small amounts
+ * of data (where we can't use an unrolled loop and wh64)
+ */
+$loop:
+ stl $17, 0($5) # L :
+ subl $3, 1, $3 # E : Decrement number quads left
+ addl $5, 8, $5 # E : Inc address
+ bne $3, $loop # U : more?
+
+$no_quad:
+ /*
+ * Write 0..7 trailing bytes.
+ */
+ nop # E :
+ beq $18, $end # U : All done?
+
+#ifndef pixman_error
+/* If the addr is unaligned in a multi-threaded program, this would cause
+   thread unsafety, so use stb to store the trailing bytes. */
+$trailing:
+ stb $17, 0($5)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $5, 1, $5
+ br $trailing
+#else
+ ldl $7, 0($5) # L :
+ mask7b $7, $6, $2 # U : Mask final quad
+
+ ins7b $17, $6, $4 # U : New bits
+ or $2, $4, $1 # E : Put it all together
+ stl $1, 0($5) # L : And back to memory
+ ret $31,($26),1 # L0 :
+#endif
+
+$within_quad:
+#ifdef PIXMAN_ERROR
+	/* If the addr is unaligned in a multi-threaded program, this would cause
+	   thread unsafety, so use stb to store the trailing bytes.  */
+ ldl_u $1, 0($16) # L :
+ ins3b $17, $16, $2 # U : New bits
+ mask3b $1, $16, $4 # U : Clear old
+ or $2, $4, $2 # E : New result
+
+ mask3b $2, $6, $4 # U :
+ mask7b $1, $6, $2 # U :
+ or $2, $4, $1 # E :
+ stl_u $1, 0($16) # L :
+#else
+ stb $8, 0($16)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $16, 1, $16
+ br $within_quad
+#endif
+
+$end:
+ nop
+ nop
+ nop
+ ret $31,($26),1 # L0 :
+
+ END(memset)
+libc_hidden_builtin_def (memset)
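
The setup above replicates the fill byte into a full quadword with ins0b/ins1b and or, then stores whole quadwords between a byte-stored head and tail (the stb paths selected by the pixman_error conditionals, which avoid read-modify-write of partial words). A simplified C sketch of that structure (illustrative names; the byte replication is written as a multiply instead of the insert/or sequence, and the unrolled, prefetching body is reduced to a single store per trip):

#include <stddef.h>
#include <stdint.h>

void *
ref_memset (void *dst, int c, size_t n)
{
  unsigned char *d = dst;
  uint64_t pattern = (uint64_t) (unsigned char) c * 0x0101010101010101ULL;

  /* Head: stb until the destination is 8-byte aligned.  */
  while (n > 0 && ((uintptr_t) d & 7) != 0)
    { *d++ = (unsigned char) c; n--; }

  /* Body: aligned quadword stores (stl).  */
  uint64_t *dq = (uint64_t *) d;
  while (n >= 8)
    { *dq++ = pattern; n -= 8; }
  d = (unsigned char *) dq;

  /* Tail: stb for the remaining 0..7 bytes.  */
  while (n-- > 0)
    *d++ = (unsigned char) c;

  return dst;
}
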
diff --git a/sysdeps/sw_64/sw6b/rshift.S b/sysdeps/sw_64/sw6b/rshift.S
new file mode 100644
index 00000000..ec2a78b0
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a limb vector right.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw6b/stxcpy.S b/sysdeps/sw_64/sw6b/stxcpy.S
new file mode 100644
index 00000000..cf07eb8e
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/stxcpy.S
@@ -0,0 +1,314 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy a null-terminated string from SRC to DST.
+
+ This is an internal routine used by strcpy, stpcpy, and strcat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+
+ On output:
+ t8 = bitmask (with one bit set) indicating the last byte written
+ a0 = unaligned address of the last *word* written
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxcpy, @function
+ .globl __stxcpy
+ .usepv __stxcpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxcpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ ldi t2, -1 # E : build a mask against false zero
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
+ mask7b t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mask3b t0, a1, t0 # U : assemble the first output word
+ cmpgeb zero, t2, t10 # E : bits set iff null found
+ or t0, t3, t1 # E : (stall)
+ bne t10, $a_eos # U : (stall)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == a source word not containing a null. */
+ /* Nops here to separate store quads from load quads */
+
+$a_loop:
+ stl_u t1, 0(a0) # L :
+ addl a0, 8, a0 # E :
+ nop
+ nop
+
+ ldl_u t1, 0(a1) # L : Latency=3
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t1, t10 # E : (3 cycle stall)
+ beq t10, $a_loop # U : (stall for t10)
+
+ /* Take care of the final (partial) word store.
+ On entry to this basic block we have:
+ t1 == the source word containing the null
+ t10 == the cmpgeb mask that found it. */
+$a_eos:
+ negl t10, t6 # E : find low bit set
+ and t10, t6, t8 # E : (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldl_u t0, 0(a0) # L : Latency=3
+ subl t8, 1, t6 # E :
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
+ or t8, t6, t10 # E : (stall)
+
+ zap t0, t10, t0 # E : clear dst bytes <= null
+ or t0, t1, t1 # E : (stall)
+ nop
+ nop
+
+1: stl_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxcpy:
+ /* Are source and destination co-aligned? */
+ xor a0, a1, t0 # E :
+ unop # E :
+ and t0, 7, t0 # E : (stall)
+ bne t0, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldl_u t1, 0(a1) # L : load first src word
+ and a0, 7, t0 # E : take care not to load a word ...
+ addl a1, 8, a1 # E :
+	beq t0, stxcpy_aligned	# U : ... if we won't need it (stall)
+
+ ldl_u t0, 0(a0) # L :
+ br stxcpy_aligned # L0 : Latency=3
+ nop
+ nop
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, for masking back in, if needed else 0
+ t1 == the low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldl_u t2, 8(a1) # L :
+ addl a1, 8, a1 # E :
+ ext3b t1, a1, t1 # U : (stall on a1)
+ ext7b t2, a1, t4 # U : (stall on a1)
+
+ mask3b t0, a0, t0 # U :
+ or t1, t4, t1 # E :
+ mask7b t1, a0, t1 # U : (stall on t1)
+ or t0, t1, t1 # E : (stall on t1)
+
+ or t1, t6, t6 # E :
+ cmpgeb zero, t6, t10 # E : (stall)
+ ldi t6, -1 # E : for masking just below
+ bne t10, $u_final # U : (stall)
+
+ mask3b t6, a1, t6 # U : mask out the bits we have
+ or t6, t2, t2 # E : already extracted before (stall)
+ cmpgeb zero, t2, t10 # E : testing eos (stall)
+ bne t10, $u_late_head_exit # U : (stall)
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+
+ stl_u t1, 0(a0) # L : store first output word
+ addl a0, 8, a0 # E :
+ ext3b t2, a1, t0 # U : position ho-bits of lo word
+ ldl_u t2, 8(a1) # U : read next high-order source word
+
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t2, t10 # E : (stall for t2)
+ nop # E :
+ bne t10, $u_eos # U : (stall)
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 3
+$u_loop:
+ ext7b t2, a1, t1 # U : extract high bits for current word
+ addl a1, 8, a1 # E : (stall)
+ ext3b t2, a1, t3 # U : extract low bits for next time (stall)
+ addl a0, 8, a0 # E :
+
+ or t0, t1, t1 # E : current dst word now complete
+ ldl_u t2, 0(a1) # L : Latency=3 load high word for next time
+ stl_u t1, -8(a0) # L : save the current word (stall)
+ mov t3, t0 # E :
+
+ cmpgeb zero, t2, t10 # E : test new word for eos
+ beq t10, $u_loop # U : (stall)
+ nop
+ nop
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ ext7b t2, a1, t1 # U :
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
+ cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall)
+ bne t10, $u_final # U : (stall)
+
+$u_late_head_exit:
+ stl_u t1, 0(a0) # L : the null was in the high-order bits
+ addl a0, 8, a0 # E :
+ ext3b t2, a1, t1 # U :
+ cmpgeb zero, t1, t10 # E : (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t1 == assembled source word
+ t10 == cmpgeb mask that found the null. */
+$u_final:
+ negl t10, t6 # E : isolate low bit set
+ and t6, t10, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t0, 0(a0) # E :
+ subl t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
+
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
+ or t0, t1, t1 # E : (stall)
+ nop
+ nop
+
+1: stl_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldl_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldl_u t0, 0(a0) # L :
+ ldi t6, -1 # E :
+
+ mask3b t6, a0, t6 # U :
+ nop
+ nop
+ nop
+1:
+ subl a1, t4, a1 # E : sub dest misalignment from src addr
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+ cmplt t4, t5, t8 # E :
+ beq t8, $u_head # U :
+ ldi t2, -1 # E : mask out leading garbage in source
+
+ mask7b t2, t5, t2 # U :
+ ornot t1, t2, t3 # E : (stall)
+ cmpgeb zero, t3, t10 # E : is there a zero? (stall)
+ beq t10, $u_head # U : (stall)
+
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+
+ ldl_u t0, 0(a0) # L :
+ negl t10, t6 # E : build bitmask of bytes <= zero
+ and t6, t10, t8 # E : (stall)
+ and a1, 7, t5 # E :
+
+ subl t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ srl t8, t5, t8 # U : adjust final null return value
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
+
+ and t1, t2, t1 # E : to source validity mask
+ ext3b t2, a1, t2 # U :
+ ext3b t1, a1, t1 # U : (stall)
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
+
+ or t0, t1, t1 # e1 : and put it there
+ stl_u t1, 0(a0) # .. e0 : (stall)
+ ret (t9) # e1 :
+
+ cfi_endproc
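
The null detection that drives both the aligned and unaligned loops above is the cmpgeb test: comparing a zero register against a source word sets bit i of the mask exactly when byte i of the word is zero, and the negl/and pair in the tail code then isolates the lowest such bit, i.e. the first NUL. A small C model of those two steps (the helper names are illustrative, not part of the sources):

#include <stdint.h>

/* Model of "cmpgeb zero, word, mask": one mask bit per zero byte.  */
static inline unsigned
zero_byte_mask (uint64_t word)
{
  unsigned mask = 0;
  for (int i = 0; i < 8; i++)
    if (((word >> (8 * i)) & 0xff) == 0)
      mask |= 1u << i;
  return mask;
}

/* Model of "negl t10, t6; and t10, t6, t8": keep only the lowest set
   bit, which marks the byte position of the first NUL.  */
static inline unsigned
first_null_bit (unsigned mask)
{
  return mask & -mask;
}
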
diff --git a/sysdeps/sw_64/sw6b/stxncpy.S b/sysdeps/sw_64/sw6b/stxncpy.S
new file mode 100644
index 00000000..c47029ea
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/stxncpy.S
@@ -0,0 +1,392 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy no more than COUNT bytes of the null-terminated string from
+ SRC to DST.
+
+ This is an internal routine used by strncpy, stpncpy, and strncat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+ a2 = COUNT
+
+ Furthermore, COUNT may not be zero.
+
+ On output:
+ t0 = last word written
+ t8 = bitmask (with one bit set) indicating the last byte written
+ t10 = bitmask (with one bit set) indicating the byte position of
+ the end of the range specified by COUNT
+ a0 = unaligned address of the last *word* written
+ a2 = the number of full words left in COUNT
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxncpy, @function
+ .globl __stxncpy
+ .usepv __stxncpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxncpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ ldi t2, -1 # E : build a mask against false zero
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
+ mask7b t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mask3b t0, a1, t0 # U : assemble the first output word
+ cmpgeb zero, t2, t7 # E : bits set iff null found
+ or t0, t3, t0 # E : (stall)
+ beq a2, $a_eoc # U :
+
+ bne t7, $a_eos # U :
+ nop
+ nop
+ nop
+
+ /* On entry to this basic block:
+ t0 == a source word not containing a null. */
+
+ /*
+ * nops here to:
+ * separate store quads from load quads
+ * limit of 1 bcond/quad to permit training
+ */
+$a_loop:
+ stl_u t0, 0(a0) # L :
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ nop
+
+ ldl_u t0, 0(a1) # L :
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t0, t7 # E :
+ beq a2, $a_eoc # U :
+
+ beq t7, $a_loop # U :
+ nop
+ nop
+ nop
+
+ /* Take care of the final (partial) word store. At this point
+ the end-of-count bit is set in t7 iff it applies.
+
+ On entry to this basic block we have:
+ t0 == the source word containing the null
+ t7 == the cmpgeb mask that found it. */
+$a_eos:
+ negl t7, t8 # E : find low bit set
+ and t7, t8, t8 # E : (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldl_u t1, 0(a0) # L :
+ subl t8, 1, t6 # E :
+ or t8, t6, t7 # E : (stall)
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
+
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
+ or t0, t1, t0 # e1 : (stall)
+ nop
+ nop
+
+1: stl_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Add the end-of-count bit to the eos detection bitmask. */
+$a_eoc:
+ or t10, t7, t7 # E :
+ br $a_eos # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxncpy:
+ /* Are source and destination co-aligned? */
+ ldi t2, -1 # E :
+ xor a0, a1, t1 # E :
+ and a0, 7, t0 # E : find dest misalignment
+ nop # E :
+
+ srl t2, 1, t2 # U :
+ and t1, 7, t1 # E :
+ sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall)
+ nop # E :
+
+ addl a2, t0, a2 # E : bias count by dest misalignment
+ subl a2, 1, a2 # E : (stall)
+ and a2, 7, t2 # E : (stall)
+ ldi t10, 1 # E :
+
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
+ nop # E :
+ bne t1, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldl_u t1, 0(a1) # L : load first src word
+ addl a1, 8, a1 # E :
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
+ ldl_u t0, 0(a0) # L :
+
+ br stxncpy_aligned # U :
+ nop
+ nop
+ nop
+
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, unmasked
+ t1 == the shifted low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldl_u t2, 8(a1) # L : Latency=3 load second src word
+ addl a1, 8, a1 # E :
+ mask3b t0, a0, t0 # U : mask trailing garbage in dst
+ ext7b t2, a1, t4 # U : (3 cycle stall on t2)
+
+ or t1, t4, t1 # E : first aligned src word complete (stall)
+ mask7b t1, a0, t1 # U : mask leading garbage in src (stall)
+ or t0, t1, t0 # E : first output word complete (stall)
+ or t0, t6, t6 # E : mask original data for zero test (stall)
+
+ cmpgeb zero, t6, t7 # E :
+ beq a2, $u_eocfin # U :
+ ldi t6, -1 # E :
+ nop
+
+ bne t7, $u_final # U :
+ mask3b t6, a1, t6 # U : mask out bits already seen
+ stl_u t0, 0(a0) # L : store first output word
+ or t6, t2, t2 # E :
+
+ cmpgeb zero, t2, t7 # E : find nulls in second partial
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ bne t7, $u_late_head_exit # U :
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+ ext3b t2, a1, t1 # U : position hi-bits of lo word
+ beq a2, $u_eoc # U :
+ ldl_u t2, 8(a1) # L : read next high-order source word
+ addl a1, 8, a1 # E :
+
+ ext7b t2, a1, t0 # U : position lo-bits of hi word (stall)
+ cmpgeb zero, t2, t7 # E :
+ nop
+ bne t7, $u_eos # U :
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 4
+$u_loop:
+ or t0, t1, t0 # E : current dst word now complete
+ subl a2, 1, a2 # E : decrement word count
+ ext3b t2, a1, t1 # U : extract high bits for next time
+ addl a0, 8, a0 # E :
+
+ stl_u t0, -8(a0) # L : save the current word
+ beq a2, $u_eoc # U :
+ ldl_u t2, 8(a1) # L : Latency=3 load high word for next time
+ addl a1, 8, a1 # E :
+
+ ext7b t2, a1, t0 # U : extract low bits (2 cycle stall)
+ cmpgeb zero, t2, t7 # E : test new word for eos
+ nop
+ beq t7, $u_loop # U :
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ or t0, t1, t0 # E : first (partial) source word complete
+ nop
+ cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall)
+ bne t7, $u_final # U : (stall)
+
+ stl_u t0, 0(a0) # L : the null was in the high-order bits
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ nop
+
+$u_late_head_exit:
+ ext3b t2, a1, t0 # U :
+ cmpgeb zero, t0, t7 # E :
+ or t7, t10, t6 # E : (stall)
+ seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t0 == assembled source word
+ t7 == cmpgeb mask that found the null. */
+$u_final:
+ negl t7, t6 # E : isolate low bit set
+ and t6, t7, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t1, 0(a0) # L :
+ subl t8, 1, t6 # E :
+ or t6, t8, t7 # E : (stall)
+ zapnot t0, t7, t0 # U : kill source bytes > null
+
+ zap t1, t7, t1 # U : kill dest bytes <= null
+ or t0, t1, t0 # E : (stall)
+ nop
+ nop
+
+1: stl_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+
+ /* Got to end-of-count before end of string.
+ On entry to this basic block:
+ t1 == the shifted high-order bits from the previous source word */
+$u_eoc:
+ and a1, 7, t6 # E :
+ sll t10, t6, t6 # U : (stall)
+ and t6, 0xff, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t2, 8(a1) # L : load final src word
+ nop
+ ext7b t2, a1, t0 # U : extract low bits for last word (stall)
+ or t1, t0, t1 # E : (stall)
+
+1: cmpgeb zero, t1, t7 # E :
+ mov t1, t0
+
+$u_eocfin: # end-of-count, final word
+ or t10, t7, t7 # E :
+ br $u_final # L0 : Latency=3
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldl_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldl_u t0, 0(a0) # L :
+ ldi t6, -1 # E :
+
+ mask3b t6, a0, t6 # U :
+ nop
+ nop
+1: subl a1, t4, a1 # E : sub dest misalignment from src addr
+
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+
+ cmplt t4, t5, t8 # E :
+ ext3b t1, a1, t1 # U : shift src into place
+ ldi t2, -1 # E : for creating masks later
+ beq t8, $u_head # U : (stall)
+
+ mask7b t2, t5, t2 # U : begin src byte validity mask
+ cmpgeb zero, t1, t7 # E : is there a zero?
+ ext3b t2, a1, t2 # U :
+ or t7, t10, t5 # E : test for end-of-count too
+
+ cmpgeb zero, t2, t3 # E :
+ seleq a2, t5, t7, t7 # E : Latency=2, extra map slot
+ nop # E : keep with seleq
+ andnot t7, t3, t7 # E : (stall)
+
+ beq t7, $u_head # U :
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+ ldl_u t0, 0(a0) # L :
+ negl t7, t6 # E : build bitmask of bytes <= zero
+ mask7b t1, t4, t1 # U :
+
+ and t6, t7, t8 # E :
+ subl t8, 1, t6 # E : (stall)
+ or t6, t8, t7 # E : (stall)
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
+
+ zapnot t1, t7, t1 # U : to source validity mask
+ andnot t0, t2, t0 # E : zero place for source to reside
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
+ stl_u t0, 0(a0) # L : (stall)
+
+ ret (t9) # L0 : Latency=3
+
+ cfi_endproc
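
For readers unfamiliar with the unaligned-copy scheme the comments above describe, the following C fragment is a rough model of the technique (a sketch with invented names, not part of the patch): each aligned destination word is assembled from the high bytes of the previous source word and the low bytes of the current one, and a cmpgeb-style test looks for a zero byte in each freshly loaded word.

    /* Rough C model of the byte-shifting merge (ext3b/ext7b) and the
       zero-byte test (cmpgeb) used above.  Little-endian, 64-bit words;
       names are illustrative only.  */
    #include <stdint.h>

    /* Merge two consecutive aligned source words for a source that is
       'misalign' bytes (1..7) past an 8-byte boundary.  */
    static uint64_t
    merge_words (uint64_t lo_word, uint64_t hi_word, unsigned misalign)
    {
      unsigned s = misalign * 8;
      return (lo_word >> s) | (hi_word << (64 - s));
    }

    /* Nonzero iff some byte of W is zero; the usual C analogue of what
       cmpgeb zero, w, t7 computes bytewise in the assembly.  */
    static int
    has_zero_byte (uint64_t w)
    {
      return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
    }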
diff --git a/sysdeps/sw_64/sw6b/sub_n.S b/sysdeps/sw_64/sw6b/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
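
The software pipelining above obscures the underlying per-limb operation, so here is a plain C model of what __mpn_sub_n computes (a sketch with illustrative names, not glibc source). The two cmpult compares per limb correspond to c1 and c2 below, and or-ing them is safe because they can never both be set.

    #include <stddef.h>
    #include <stdint.h>

    /* res = s1 - s2 over n limbs, returning the final borrow.  */
    uint64_t
    mpn_sub_n_model (uint64_t *res, const uint64_t *s1,
                     const uint64_t *s2, size_t n)
    {
      uint64_t cy = 0;                      /* borrow, 0 or 1 */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = s2[i] + cy;          /* "cy add"   (addl)        */
          uint64_t c1 = t < cy;             /* carry out  (cmpult)      */
          uint64_t d = s1[i] - t;           /* "main sub" (subl)        */
          uint64_t c2 = s1[i] < d;          /* borrow out (cmpult)      */
          res[i] = d;
          cy = c1 | c2;                     /* combine (or); never both set */
        }
      return cy;
    }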
diff --git a/sysdeps/sw_64/sw8a/add_n.S b/sysdeps/sw_64/sw8a/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
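
The add_n routine uses the same carry scheme; the snippet below (again only a sketch with invented names) isolates why a single or per limb is enough to combine the two carry bits computed by the paired cmpult instructions.

    /* One limb of __mpn_add_n, matching the addl/cmpult/or pattern above.
       c1 can be 1 only when the s2 limb is all-ones and cy is 1, in which
       case t is 0 and c2 is necessarily 0, so c1 | c2 never drops a carry.  */
    #include <stdint.h>

    static uint64_t
    add_limb (uint64_t a, uint64_t b, uint64_t cy, uint64_t *r)
    {
      uint64_t t = b + cy;        /* "cy add":    addl $0,$25,$28    */
      uint64_t c1 = t < cy;       /* carry:       cmpult $28,$25,$8  */
      *r = a + t;                 /* "main add":  addl $4,$28,$20    */
      uint64_t c2 = *r < t;       /* carry:       cmpult $20,$28,$25 */
      return c1 | c2;             /* combine:     or $8,$25,$25      */
    }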
diff --git a/sysdeps/sw_64/sw8a/addmul_1.S b/sysdeps/sw_64/sw8a/addmul_1.S
new file mode 100644
index 00000000..95487c26
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with a pipeline expert.
+ # Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carries
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carries has to happen on one side {0} of the machine. Note the total
+ # use of U0, and the total use of L0 (after attending to the stores),
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. If first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have an ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
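
Stripped of the unrolling and dual-issue scheduling, the operation the routine above performs is res[i] += s1[i] * s2_limb with carry, where mull and umulh deliver the low and high halves of each 128-bit product. A compact C model (illustrative names, relying on the GCC/Clang unsigned __int128 extension) looks like this:

    #include <stddef.h>
    #include <stdint.h>

    /* res[0..n) += s1[0..n) * limb; returns the limb carried out the top.  */
    uint64_t
    mpn_addmul_1_model (uint64_t *res, const uint64_t *s1, size_t n,
                        uint64_t limb)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) s1[i] * limb;
          uint64_t lo = (uint64_t) p;          /* mull  */
          uint64_t hi = (uint64_t) (p >> 64);  /* umulh */
          lo += cy;
          hi += lo < cy;                       /* carry from adding cy     */
          uint64_t r = res[i] + lo;
          cy = hi + (r < lo);                  /* carry from adding res[i] */
          res[i] = r;
        }
      return cy;
    }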
diff --git a/sysdeps/sw_64/sw8a/lshift.S b/sysdeps/sw_64/sw8a/lshift.S
new file mode 100644
index 00000000..76f1fb0e
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a number left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
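
For reference, the unrolled shift above computes the following (a C sketch with invented names; the rshift.S that follows is the mirror image, walking up from the low limb with the shift directions swapped):

    #include <stddef.h>
    #include <stdint.h>

    /* Shift the n-limb number s1 left by cnt bits (1 <= cnt <= 63), store it
       at res, and return the bits shifted out of the top limb.  Like the
       assembly, it walks from the most significant limb downward, so the
       regions may overlap when res >= s1.  */
    uint64_t
    mpn_lshift_model (uint64_t *res, const uint64_t *s1, size_t n,
                      unsigned cnt)
    {
      unsigned tnc = 64 - cnt;
      uint64_t high = s1[n - 1];
      uint64_t retval = high >> tnc;          /* "compute function result" */
      for (size_t i = n - 1; i > 0; i--)
        {
          uint64_t low = s1[i - 1];
          res[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      res[0] = high << cnt;
      return retval;
    }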
diff --git a/sysdeps/sw_64/sw8a/rshift.S b/sysdeps/sw_64/sw8a/rshift.S
new file mode 100644
index 00000000..ec2a78b0
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a number right.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw8a/sub_n.S b/sysdeps/sw_64/sw8a/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
diff --git a/sysdeps/sw_64/udiv_qrnnd.S b/sysdeps/sw_64/udiv_qrnnd.S
new file mode 100644
index 00000000..054034cd
--- /dev/null
+++ b/sysdeps/sw_64/udiv_qrnnd.S
@@ -0,0 +1,159 @@
+ # Sw_64 1621 __udiv_qrnnd
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+ .set noreorder
+ .set noat
+
+ .text
+
+LEAF(__udiv_qrnnd, 0)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldi cnt,16
+ blt d,$largedivisor
+
+$loop1: cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ subl cnt,1,cnt
+ bgt cnt,$loop1
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+$largedivisor:
+ and n0,1,$4
+
+ srl n0,1,n0
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6
+ srl d,1,$5
+ addl $5,$6,$5
+
+$loop2: cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ subl cnt,1,cnt
+ bgt cnt,$loop2
+
+ addl n1,n1,n1
+ addl $4,n1,n1
+ bne $6,$Odd
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+$Odd:
+ /* q' in n0. r' in n1 */
+ addl n1,n0,n1
+
+ cmpult n1,n0,tmp # tmp := carry from addl
+ subl n1,d,AT
+ addl n0,tmp,n0
+ selne tmp,AT,n1,n1
+
+ cmpult n1,d,tmp
+ addl n0,1,AT
+ seleq tmp,AT,n0,n0
+ subl n1,d,AT
+ seleq tmp,AT,n1,n1
+
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __udiv_qrnnd
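
The $loop1 body above is four copies of one shift-and-subtract step, run 16 times for 64 quotient bits in total. A C model of that simple path (invented names; it assumes n1 < d and that the sign bit of d is clear, which is the case whenever blt d,$largedivisor does not branch; the $largedivisor path handles top-bit-set divisors by halving first) is:

    #include <stdint.h>

    /* Divide the two-limb value n1:n0 by d, returning the quotient and
       storing the remainder through rem.  Restoring shift-and-subtract,
       one quotient bit per iteration.  */
    uint64_t
    udiv_qrnnd_model (uint64_t *rem, uint64_t n1, uint64_t n0, uint64_t d)
    {
      for (int i = 0; i < 64; i++)
        {
          uint64_t top = n0 >> 63;      /* cmplt n0,0,tmp              */
          n1 = (n1 << 1) | top;         /* addl n1,n1 ; bis n1,tmp     */
          n0 <<= 1;                     /* addl n0,n0                  */
          if (d <= n1)                  /* cmpule d,n1,qb              */
            {
              n1 -= d;                  /* selne qb,tmp,n1,n1          */
              n0 |= 1;                  /* bis n0,qb,n0 : quotient bit */
            }
        }
      *rem = n1;
      return n0;                        /* quotient accumulated in n0 */
    }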
--
2.25.1