From 8045463341b2495da7b2e7dc308a023764315bbe Mon Sep 17 00:00:00 2001
From: swcompiler <lc@wxiat.com>
Date: Fri, 29 Nov 2024 14:15:45 +0800
Subject: [PATCH 11/23] Sw64: Integer Operation Support
---
sysdeps/sw_64/add_n.S | 118 +++++++++
sysdeps/sw_64/addmul_1.S | 89 +++++++
sysdeps/sw_64/bzero.S | 107 ++++++++
sysdeps/sw_64/div.S | 83 ++++++
sysdeps/sw_64/div_libc.h | 170 ++++++++++++
sysdeps/sw_64/divl.S | 96 +++++++
sysdeps/sw_64/divlu.S | 4 +
sysdeps/sw_64/divq.S | 290 +++++++++++++++++++++
sysdeps/sw_64/divqu.S | 292 +++++++++++++++++++++
sysdeps/sw_64/htonl.S | 43 +++
sysdeps/sw_64/htons.S | 39 +++
sysdeps/sw_64/ldiv.S | 222 ++++++++++++++++
sysdeps/sw_64/lldiv.S | 1 +
sysdeps/sw_64/lshift.S | 107 ++++++++
sysdeps/sw_64/mul_1.S | 82 ++++++
sysdeps/sw_64/reml.S | 93 +++++++
sysdeps/sw_64/remlu.S | 4 +
sysdeps/sw_64/remq.S | 274 ++++++++++++++++++++
sysdeps/sw_64/remqu.S | 292 +++++++++++++++++++++
sysdeps/sw_64/rshift.S | 105 ++++++++
sysdeps/sw_64/sub_n.S | 118 +++++++++
sysdeps/sw_64/submul_1.S | 89 +++++++
sysdeps/sw_64/sw6a/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw6a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw6a/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw6a/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw6a/sub_n.S | 147 +++++++++++
sysdeps/sw_64/sw6b/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw6b/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw6b/memcpy.S | 416 +++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/memset.S | 312 ++++++++++++++++++++++
sysdeps/sw_64/sw6b/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw6b/stxcpy.S | 314 ++++++++++++++++++++++
sysdeps/sw_64/sw6b/stxncpy.S | 392 ++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/sub_n.S | 147 +++++++++++
sysdeps/sw_64/sw8a/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw8a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw8a/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw8a/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw8a/sub_n.S | 147 +++++++++++
sysdeps/sw_64/udiv_qrnnd.S | 159 ++++++++++++
42 files changed, 7641 insertions(+)
create mode 100644 sysdeps/sw_64/add_n.S
create mode 100644 sysdeps/sw_64/addmul_1.S
create mode 100644 sysdeps/sw_64/bzero.S
create mode 100644 sysdeps/sw_64/div.S
create mode 100644 sysdeps/sw_64/div_libc.h
create mode 100644 sysdeps/sw_64/divl.S
create mode 100644 sysdeps/sw_64/divlu.S
create mode 100644 sysdeps/sw_64/divq.S
create mode 100644 sysdeps/sw_64/divqu.S
create mode 100644 sysdeps/sw_64/htonl.S
create mode 100644 sysdeps/sw_64/htons.S
create mode 100644 sysdeps/sw_64/ldiv.S
create mode 100644 sysdeps/sw_64/lldiv.S
create mode 100644 sysdeps/sw_64/lshift.S
create mode 100644 sysdeps/sw_64/mul_1.S
create mode 100644 sysdeps/sw_64/reml.S
create mode 100644 sysdeps/sw_64/remlu.S
create mode 100644 sysdeps/sw_64/remq.S
create mode 100644 sysdeps/sw_64/remqu.S
create mode 100644 sysdeps/sw_64/rshift.S
create mode 100644 sysdeps/sw_64/sub_n.S
create mode 100644 sysdeps/sw_64/submul_1.S
create mode 100644 sysdeps/sw_64/sw6a/add_n.S
create mode 100644 sysdeps/sw_64/sw6a/addmul_1.S
create mode 100644 sysdeps/sw_64/sw6a/lshift.S
create mode 100644 sysdeps/sw_64/sw6a/rshift.S
create mode 100644 sysdeps/sw_64/sw6a/sub_n.S
create mode 100644 sysdeps/sw_64/sw6b/add_n.S
create mode 100644 sysdeps/sw_64/sw6b/addmul_1.S
create mode 100644 sysdeps/sw_64/sw6b/lshift.S
create mode 100644 sysdeps/sw_64/sw6b/memcpy.S
create mode 100644 sysdeps/sw_64/sw6b/memset.S
create mode 100644 sysdeps/sw_64/sw6b/rshift.S
create mode 100644 sysdeps/sw_64/sw6b/stxcpy.S
create mode 100644 sysdeps/sw_64/sw6b/stxncpy.S
create mode 100644 sysdeps/sw_64/sw6b/sub_n.S
create mode 100644 sysdeps/sw_64/sw8a/add_n.S
create mode 100644 sysdeps/sw_64/sw8a/addmul_1.S
create mode 100644 sysdeps/sw_64/sw8a/lshift.S
create mode 100644 sysdeps/sw_64/sw8a/rshift.S
create mode 100644 sysdeps/sw_64/sw8a/sub_n.S
create mode 100644 sysdeps/sw_64/udiv_qrnnd.S
diff --git a/sysdeps/sw_64/add_n.S b/sysdeps/sw_64/add_n.S
new file mode 100644
index 00000000..8c5c8c08
--- /dev/null
+++ b/sysdeps/sw_64/add_n.S
@@ -0,0 +1,118 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ ldl $3,0($17)
+ ldl $4,0($18)
+
+ subl $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
+
+ subl $19,$2,$19
+
+.Loop0: subl $2,1,$2
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ addl $17,8,$17
+ addl $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addl $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subl $19,4,$19
+
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ ldl $3,16($17)
+ addl $6,$0,$6
+ ldl $4,16($18)
+ cmpult $6,$0,$1
+ addl $5,$6,$6
+ cmpult $6,$5,$0
+ stl $6,8($16)
+ or $0,$1,$0
+
+ ldl $5,24($17)
+ addl $4,$0,$4
+ ldl $6,24($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,16($16)
+ or $0,$1,$0
+
+ ldl $3,32($17)
+ addl $6,$0,$6
+ ldl $4,32($18)
+ cmpult $6,$0,$1
+ addl $5,$6,$6
+ cmpult $6,$5,$0
+ stl $6,24($16)
+ or $0,$1,$0
+
+ addl $17,32,$17
+ addl $18,32,$18
+ addl $16,32,$16
+ bne $19,.Loop
+
+.Lend: addl $4,$0,$4
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_add_n
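For reference, the carry handling in the loops above corresponds to the following C model of __mpn_add_n; the helper name ref_mpn_add_n and the uint64_t limb type are illustrative assumptions, not part of the patch. Each cmpult in the assembly plays the role of one of the "<" comparisons below.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Hypothetical C sketch: add two limb vectors of the same length and
   return the final carry, tracking the carry exactly as the assembly
   does with its addl/cmpult pairs.  */
static mp_limb_t
ref_mpn_add_n (mp_limb_t *res, const mp_limb_t *s1, const mp_limb_t *s2,
               size_t size)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      mp_limb_t b = s2[i] + cy;       /* may wrap around */
      mp_limb_t c1 = b < cy;          /* carry out of s2[i] + cy */
      mp_limb_t sum = s1[i] + b;
      mp_limb_t c2 = sum < s1[i];     /* carry out of s1[i] + b */
      res[i] = sum;
      cy = c1 | c2;
    }
  return cy;
}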
diff --git a/sysdeps/sw_64/addmul_1.S b/sysdeps/sw_64/addmul_1.S
new file mode 100644
index 00000000..138e3c69
--- /dev/null
+++ b/sysdeps/sw_64/addmul_1.S
@@ -0,0 +1,89 @@
+ # Sw_64 1621 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1 2
+__mpn_addmul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ addl $5,$3,$3
+ cmpult $3,$5,$4
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ addl $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $5,$0,$0 # combine carries
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_addmul_1
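A hedged C sketch of what __mpn_addmul_1 computes; the helper name and the use of GCC's unsigned __int128 are assumptions made for illustration, while the assembly instead pairs mull/umulh and propagates the carry by hand.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Multiply s1 by the single limb s2_limb, add the product into res,
   and return the carry-out limb.  */
static mp_limb_t
ref_mpn_addmul_1 (mp_limb_t *res, const mp_limb_t *s1, size_t size,
                  mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      unsigned __int128 acc =
        (unsigned __int128) s1[i] * s2_limb + res[i] + cy;
      res[i] = (mp_limb_t) acc;       /* low 64 bits */
      cy = (mp_limb_t) (acc >> 64);   /* high 64 bits become the new carry */
    }
  return cy;
}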
diff --git a/sysdeps/sw_64/bzero.S b/sysdeps/sw_64/bzero.S
new file mode 100644
index 00000000..1a020afd
--- /dev/null
+++ b/sysdeps/sw_64/bzero.S
@@ -0,0 +1,107 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Fill a block of memory with zeros. Optimized for the Sw_64 architecture:
+
+ - memory accessed as aligned quadwords only
+ - destination memory not read unless needed for good cache behaviour
+ - basic blocks arranged to optimize branch prediction for full-quadword
+ aligned memory blocks.
+ - partial head and tail quadwords constructed with byte-mask instructions
+
+*/
+
+
+#include <sysdep.h>
+
+ .set noat
+ .set noreorder
+
+ .text
+ .type __bzero, @function
+ .globl __bzero
+ .usepv __bzero, USEPV_PROF
+
+ cfi_startproc
+
+ /* On entry to this basic block:
+ t3 == loop counter
+ t4 == bytes in partial final word
+ a0 == possibly misaligned destination pointer */
+
+ .align 3
+bzero_loop:
+ beq t3, $tail #
+ blbc t3, 0f # skip single store if count even
+
+ stl_u zero, 0(a0) # e0 : store one word
+ subl t3, 1, t3 # .. e1 :
+ addl a0, 8, a0 # e0 :
+ beq t3, $tail # .. e1 :
+
+0: stl_u zero, 0(a0) # e0 : store two words
+ subl t3, 2, t3 # .. e1 :
+ stl_u zero, 8(a0) # e0 :
+ addl a0, 16, a0 # .. e1 :
+ bne t3, 0b # e1 :
+
+$tail: bne t4, 1f # is there a tail to do?
+ ret # no
+
+1: ldl_u t0, 0(a0) # yes, load original data
+ mask7b t0, t4, t0 #
+ stl_u t0, 0(a0) #
+ ret #
+
+__bzero:
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+#endif
+
+ mov a0, v0 # e0 : move return value in place
+ beq a1, $done # .. e1 : early exit for zero-length store
+ and a0, 7, t1 # e0 :
+ addl a1, t1, a1 # e1 : add dest misalignment to count
+ srl a1, 3, t3 # e0 : loop = count >> 3
+ and a1, 7, t4 # .. e1 : find number of bytes in tail
+ unop # :
+ beq t1, bzero_loop # e1 : aligned head, jump right in
+
+ ldl_u t0, 0(a0) # e0 : load original data to mask into
+ cmpult a1, 8, t2 # .. e1 : is this a sub-word set
+ bne t2, $oneq # e1 :
+
+ mask3b t0, a0, t0 # e0 : we span words. finish this partial
+ subl t3, 1, t3 # e0 :
+ addl a0, 8, a0 # .. e1 :
+ stl_u t0, -8(a0) # e0 :
+ br bzero_loop # .. e1 :
+
+ .align 3
+$oneq:
+ mask3b t0, a0, t2 # e0 :
+ mask7b t0, a1, t3 # e0 :
+ or t2, t3, t0 # e1 :
+ stl_u t0, 0(a0) # e0 :
+
+$done: ret
+
+ cfi_endproc
+weak_alias (__bzero, bzero)
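The head/body/tail split described in the comment at the top of this file can be summarized by the C sketch below. It is illustrative only: bzero_model is a made-up name, and plain byte and quadword stores stand in for the byte-masked ldl_u/mask3b/mask7b/stl_u sequences.

#include <stddef.h>
#include <stdint.h>

static void
bzero_model (unsigned char *dst, size_t len)
{
  /* Head: bytes up to the next 8-byte (quadword) boundary,
     handled with a masked store in the assembly.  */
  while (len > 0 && ((uintptr_t) dst & 7) != 0)
    { *dst++ = 0; len--; }

  /* Body: whole aligned quadwords, the stl_u loop.  */
  while (len >= 8)
    { *(uint64_t *) dst = 0; dst += 8; len -= 8; }

  /* Tail: the partial final quadword, handled with mask7b.  */
  while (len > 0)
    { *dst++ = 0; len--; }
}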
diff --git a/sysdeps/sw_64/div.S b/sysdeps/sw_64/div.S
new file mode 100644
index 00000000..6dbdcb7f
--- /dev/null
+++ b/sysdeps/sw_64/div.S
@@ -0,0 +1,83 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Richard Henderson <rth@tamu.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+#undef FRAME
+#ifdef __sw_64_fix__
+#define FRAME 0
+#else
+#define FRAME 16
+#endif
+
+ .set noat
+
+ .align 4
+ .globl div
+ .ent div
+div:
+ .frame sp, FRAME, ra
+#if FRAME > 0
+ ldi sp, -FRAME(sp)
+#endif
+#ifdef PROF
+ .set macro
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ beq $18, $divbyzero
+ rfpcr $f10
+ _ITOFT2 $17, $f0, 0, $18, $f1, 8
+ fcvtld $f0, $f11
+ fcvtld $f1, $f12
+ fdivd $f11, $f12, $f1
+ fcvtdl_z $f1, $f0
+ wfpcr $f10
+ _FTOIT $f0, $0, 0
+
+ mulw $0, $18, $1
+ subw $17, $1, $1
+
+ stw $0, 0(a0)
+ stw $1, 4(a0)
+ mov a0, v0
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+$divbyzero:
+ mov a0, v0
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+ stw zero, 0(v0)
+ stw zero, 4(v0)
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .end div
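At the C level the routine above behaves like the sketch below (hedged: div_model is only an illustration, and it assumes a nonzero divisor, whereas the real code traps via HMC_gentrap). A double holds every 32-bit operand exactly, so the truncated FP quotient equals the integer quotient, and the remainder falls out of the final mulw/subw pair.

#include <stdlib.h>

static div_t
div_model (int numer, int denom)   /* assumes denom != 0 */
{
  div_t r;
  /* Truncates toward zero; INT_MIN / -1 overflows here just as it
     does in C integer division.  */
  r.quot = (int) ((double) numer / (double) denom);
  r.rem = numer - r.quot * denom;
  return r;
}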
diff --git a/sysdeps/sw_64/div_libc.h b/sysdeps/sw_64/div_libc.h
new file mode 100644
index 00000000..2066924b
--- /dev/null
+++ b/sysdeps/sw_64/div_libc.h
@@ -0,0 +1,170 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Common bits for implementing software divide. */
+
+#include <sysdep.h>
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/hmcall.h>
+#else
+# include <machine/pal.h>
+#endif
+
+/* These are not normal C functions. Argument registers are t10 and t11;
+ the result goes in t12; the return address is in t9. Only t12 and AT
+ may be clobbered. */
+#define X t10
+#define Y t11
+#define RV t12
+#define RA t9
+
+/* The secureplt format does not allow the division routines to be called
+ via plt; there aren't enough registers free to be clobbered. Avoid
+ setting the symbol type to STT_FUNC, so that the linker won't be tempted
+ to create a plt entry. */
+#define funcnoplt notype
+
+/* None of these functions should use implicit anything. */
+ .set nomacro
+ .set noat
+
+/* Code fragment to invoke _mcount for profiling. This should be invoked
+ directly after allocation of the stack frame. */
+.macro CALL_MCOUNT
+#ifdef PROF
+ stl ra, 0(sp)
+ stl pv, 8(sp)
+ stl gp, 16(sp)
+ cfi_rel_offset (ra, 0)
+ cfi_rel_offset (pv, 8)
+ cfi_rel_offset (gp, 16)
+ br AT, 1f
+ .set macro
+1: ldgp gp, 0(AT)
+ mov RA, ra
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ ldl ra, 0(sp)
+ ldl pv, 8(sp)
+ ldl gp, 16(sp)
+ cfi_restore (ra)
+ cfi_restore (pv)
+ cfi_restore (gp)
+ /* Realign subsequent code with what we'd have without this
+ macro at all. This means aligned with one arithmetic insn
+ used within the bundle. */
+ .align 4
+ nop
+#endif
+.endm
+
+/* In order to make the below work, all top-level divide routines must
+ use the same frame size. */
+#define FRAME 96
+
+/* Code fragment to generate an integer divide-by-zero fault. When
+ building libc.so, we arrange for there to be one copy of this code
+ placed late in the dso, such that all branches are forward. When
+ building libc.a, we use multiple copies to avoid having an out of
+ range branch. Users should jump to DIVBYZERO. */
+
+.macro DO_DIVBYZERO
+#ifdef PIC
+#define DIVBYZERO __divbyzero
+ .section .gnu.linkonce.t.divbyzero, "ax", @progbits
+ .globl __divbyzero
+ .type __divbyzero, @function
+ .usepv __divbyzero, no
+ .hidden __divbyzero
+#else
+#define DIVBYZERO $divbyzero
+#endif
+
+ .align 4
+DIVBYZERO:
+ cfi_startproc
+ cfi_return_column (RA)
+ cfi_def_cfa_offset (FRAME)
+
+ mov a0, RV
+ unop
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+
+ mov RV, a0
+ clr RV
+ ldi sp, FRAME(sp)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size DIVBYZERO, .-DIVBYZERO
+.endm
+
+/* Like the sw6a instructions, but fall back to stack use on prior machines. */
+#ifdef __sw_64_sw6a__
+ .arch sw6a
+#endif
+#ifdef __sw_64_sw6b__
+ .arch sw6b
+#endif
+#ifdef __sw_64_sw8a__
+ .arch sw8a
+#endif
+
+.macro _ITOFS gr, fr, slot
+#ifdef __sw_64_fix__
+ ifmovs \gr, \fr
+#else
+ stw \gr, \slot(sp)
+ flds \fr, \slot(sp)
+#endif
+.endm
+
+.macro _ITOFT gr, fr, slot
+#ifdef __sw_64_fix__
+ ifmovd \gr, \fr
+#else
+ stl \gr, \slot(sp)
+ fldd \fr, \slot(sp)
+#endif
+.endm
+
+.macro _FTOIT fr, gr, slot
+#ifdef __sw_64_fix__
+ fimovd \fr, \gr
+#else
+ fstd \fr, \slot(sp)
+ ldl \gr, \slot(sp)
+#endif
+.endm
+
+/* Similarly, but move two registers. Schedules better for pre-sw6a. */
+
+.macro _ITOFT2 gr1, fr1, slot1, gr2, fr2, slot2
+#ifdef __sw_64_fix__
+ ifmovd \gr1, \fr1
+ ifmovd \gr2, \fr2
+#else
+ stl \gr1, \slot1(sp)
+ stl \gr2, \slot2(sp)
+ fldd \fr1, \slot1(sp)
+ fldd \fr2, \slot2(sp)
+#endif
+.endm
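The _ITOFT/_FTOIT macros above move raw bits between the integer and floating-point register files, either with a direct move (ifmovd/fimovd when __sw_64_fix__ is defined) or by bouncing through a stack slot. The union round-trip below is only a C illustration of that bit-for-bit reinterpretation, not code used anywhere in this patch.

#include <stdint.h>

static double
int_bits_to_double (int64_t bits)
{
  union { int64_t i; double d; } u;
  u.i = bits;    /* store the bits as an integer ...             */
  return u.d;    /* ... and reread them as a double, unchanged.  */
}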
diff --git a/sysdeps/sw_64/divl.S b/sysdeps/sw_64/divl.S
new file mode 100644
index 00000000..1192a0aa
--- /dev/null
+++ b/sysdeps/sw_64/divl.S
@@ -0,0 +1,96 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+/* 32-bit signed int divide. This is not a normal C function. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ The FPU can handle all input values except zero. Whee!
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+
+ /* Converted to SW instructions on 2016111216.  */
+
+#ifndef EXTEND
+#define EXTEND(S,D) sextl S, D
+#endif
+
+ .text
+ .align 4
+ .globl __divw
+ .type __divw, @funcnoplt
+ .usepv __divw, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divw:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f2, 16(sp)
+ fstd $f3, 40(sp)
+ fstd $f4, 48(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f2, 16)
+ cfi_rel_offset ($f3, 40)
+ cfi_rel_offset ($f4, 48)
+
+ rfpcr $f2
+ EXTEND (X, RV)
+ EXTEND (Y, AT)
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
+ fcvtld $f0, $f3
+ fcvtld $f1, $f4
+ fdivd $f3, $f4, $f1
+ fcvtdl_z $f1, $f0
+ wfpcr $f2
+ _FTOIT $f0, RV, 24
+
+ fldd $f0, 0(sp)
+ fldd $f1, 8(sp)
+ fldd $f2, 16(sp)
+ fldd $f3, 40(sp)
+ fldd $f4, 48(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_def_cfa_offset (0)
+ sextl RV, RV
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __divw, .-__divw
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/divlu.S b/sysdeps/sw_64/divlu.S
new file mode 100644
index 00000000..26e1842f
--- /dev/null
+++ b/sysdeps/sw_64/divlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D) zapnot S, 15, D
+#define __divw __divwu
+#include <divl.S>
diff --git a/sysdeps/sw_64/divq.S b/sysdeps/sw_64/divq.S
new file mode 100644
index 00000000..61ef58b4
--- /dev/null
+++ b/sysdeps/sw_64/divq.S
@@ -0,0 +1,290 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long divide. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /* Converted to SW instructions on 2016111216.  */
+ .text
+ .align 4
+ .globl __divl
+ .type __divl, @funcnoplt
+ .usepv __divl, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divl:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+ rfpcr $f3
+
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ fldd $f1, 8(sp)
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl_z $f0, $f4
+ excb
+
+ wfpcr $f3
+ _FTOIT $f4, RV, 16
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ cfi_restore ($f1)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ldi sp, FRAME(sp)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t5, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t5, 24)
+
+#define Q RV /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ or X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f4
+
+ _FTOIT $f4, Q, 8
+ .align 3
+$fix_sign_in_ret2:
+ fldd $f0, 0(sp)
+ stl t3, 0(sp)
+ cfi_restore ($f0)
+ cfi_rel_offset (t3, 0)
+
+ mull Q, Y, QY
+ excb
+ stl t4, 8(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+ bne t5, $fix_sign_out
+
+$fix_sign_out_ret:
+ ldl t3, 0(sp)
+ ldl t4, 8(sp)
+ ldl t5, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore (t5)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 records the changes we had to make:
+ bit 0: set if result should be negative.
+ bit 2: set if X was negated.
+ bit 3: set if Y was negated.
+ */
+ xor X, Y, AT
+ cmplt AT, 0, t5
+ cmplt X, 0, AT
+ negl X, t0
+
+ s4addl AT, t5, t5
+ selne AT, t0, X, X
+ cmplt Y, 0, AT
+ negl Y, t0
+
+ s8addl AT, t5, t5
+ selne AT, t0, Y, Y
+ unop
+ blbc t5, $fix_sign_in_ret1
+
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+ .align 4
+$fix_sign_out:
+ /* Now we get to undo what we did above. */
+ /* ??? Is this really faster than just increasing the size of
+ the stack frame and storing X and Y in memory? */
+ and t5, 8, AT
+ negl Y, t4
+ selne AT, t4, Y, Y
+
+ and t5, 4, AT
+ negl X, t4
+ selne AT, t4, X, X
+
+ negl RV, t4
+ sellbs t5, t4, RV, RV
+
+ br $fix_sign_out_ret
+
+ cfi_endproc
+ .size __divl, .-__divl
+
+ DO_DIVBYZERO
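The refinement described in the "theory of operation" comment can be pictured with the C sketch below. It is an illustration only: refine_quotient is a made-up helper, it uses GCC's unsigned __int128 to sidestep overflow, it assumes a nonzero divisor, and it works on unsigned values, which is why the assembly negates the operands in $fix_sign_in first. The assembly achieves the same correction within 64-bit registers using the scaled shift-and-subtract loops above.

#include <stdint.h>

/* Given an FP-derived estimate q0 of x / y, walk it onto the exact
   truncated quotient.  */
static uint64_t
refine_quotient (uint64_t x, uint64_t y, uint64_t q0)
{
  unsigned __int128 q = q0;

  while (q * y > x)             /* estimate too high: step down */
    q--;
  while ((q + 1) * y <= x)      /* estimate too low: step up */
    q++;

  return (uint64_t) q;          /* now q == x / y exactly */
}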
diff --git a/sysdeps/sw_64/divqu.S b/sysdeps/sw_64/divqu.S
new file mode 100644
index 00000000..7b39201e
--- /dev/null
+++ b/sysdeps/sw_64/divqu.S
@@ -0,0 +1,292 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long divide. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
+ clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /* Converted to SW instructions on 2016111216.  */
+ .text
+ .align 4
+ .globl __divlu
+ .type __divlu, @funcnoplt
+ .usepv __divlu, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divlu:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ beq Y, DIVBYZERO
+ fstd $f0, 0(sp)
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ stl t0,32(sp)
+ stl t1,40(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+
+ rfpcr $f3
+ /* Work around an error seen with -mieee for 0xffffffffffffffff/2
+ by clearing FPCR bits 58-59 before the divide.  */
+ rfpcr $f1
+ fimovd $f1,t0
+ ldi t1,3
+ sll t1,58,t1
+ bic t0,t1,t0
+ ifmovd t0,$f1
+ wfpcr $f1
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ blt X, $x_is_neg
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if Y was mis-converted as signed value. */
+ fldd $f1, 8(sp)
+ blt Y, $y_is_neg
+
+ /* Check to see if X fit in the double as an exact value. */
+ srl X, 53, AT
+ bne AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl $f0, $f4
+ wfpcr $f3
+ _FTOIT $f4, RV, 16
+
+ ldl t0,32(sp)
+ ldl t1,40(sp)
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+$x_is_neg:
+ /* If we get here, X is so big that bit 63 is set, which made the
+ conversion come out negative. Fix it up lest we not even get
+ a good estimate. */
+ ldih AT, 0x5f80 /* 2**64 as float. */
+ fstd $f2, 24(sp)
+ fstd $f6, 72(sp)
+ cfi_rel_offset ($f2, 24)
+ cfi_rel_offset ($f6, 72)
+ _ITOFS AT, $f2, 16
+
+ .align 4
+ faddd $f4, $f2, $f6
+ unop
+ fdivd $f6, $f5, $f0
+ unop
+
+ /* Ok, we've now the divide issued. Continue with other checks. */
+ fldd $f1, 8(sp)
+ unop
+ fldd $f2, 24(sp)
+ fldd $f6, 72(sp)
+ blt Y, $y_is_neg
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f6)
+ cfi_remember_state /* for y_is_neg */
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t2, 16(sp)
+ stl t3, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t3, 24)
+
+#define Q RV /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ fcvtdl $f0, $f4
+ _FTOIT $f4, Q, 8
+ mull Q, Y, QY
+
+ .align 4
+ stl t4, 8(sp)
+ excb
+ fldd $f0, 0(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+ cfi_restore ($f0)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t4, 8(sp)
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+
+ ldl t3, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+ cfi_restore_state
+$y_is_neg:
+ /* If we get here, Y is so big that bit 63 is set. The results
+ from the divide will be completely wrong. Fortunately, the
+ quotient must be either 0 or 1, so just compute it directly. */
+ cmpule Y, X, RV
+ excb
+ wfpcr $f3
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldl t0,32(sp)
+ ldl t1,40(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+ cfi_endproc
+ .size __divlu, .-__divlu
+
+ DO_DIVBYZERO
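One detail worth spelling out from the $y_is_neg path above: once the divisor has bit 63 set, the quotient can only be 0 or 1, so it is computed with a single compare instead of trusting the FP estimate. A minimal C sketch (hypothetical helper name; assumes bit 63 of y is set):

#include <stdint.h>

static uint64_t
udiv_top_bit_divisor (uint64_t x, uint64_t y)
{
  /* y >= 2**63, so x / y is 0 or 1; this mirrors "cmpule Y, X, RV".  */
  return x >= y ? 1 : 0;
}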
diff --git a/sysdeps/sw_64/htonl.S b/sysdeps/sw_64/htonl.S
new file mode 100644
index 00000000..7fc0aa24
--- /dev/null
+++ b/sysdeps/sw_64/htonl.S
@@ -0,0 +1,43 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(htonl)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ .set noat
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set at
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ins6b a0, 7, t0 # t0 = 0000000000AABBCC
+ ins1b a0, 3, t1 # t1 = 000000CCDD000000
+ or t1, t0, t1 # t1 = 000000CCDDAABBCC
+ srl t1, 16, t2 # t2 = 0000000000CCDDAA
+ zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00
+ zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA
+ addw t0, t3, v0 # v0 = ssssssssDDCCBBAA
+ ret
+
+ END(htonl)
+
+weak_alias (htonl, ntohl)
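The insert/shift/zapnot sequence above is simply a 32-bit byte swap; in portable C it reduces to the following (htonl_model is an illustrative name, not part of the patch).

#include <stdint.h>

static uint32_t
htonl_model (uint32_t x)
{
  return ((x & 0x000000ffU) << 24)
         | ((x & 0x0000ff00U) << 8)
         | ((x & 0x00ff0000U) >> 8)
         | ((x & 0xff000000U) >> 24);
}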
diff --git a/sysdeps/sw_64/htons.S b/sysdeps/sw_64/htons.S
new file mode 100644
index 00000000..8a981be1
--- /dev/null
+++ b/sysdeps/sw_64/htons.S
@@ -0,0 +1,39 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(htons)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ .set noat
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set at
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ext5b a0, 7, t1 # t1 = bb00
+ ext0b a0, 1, v0 # v0 = 00aa
+ bis v0, t1, v0 # v0 = bbaa
+ ret
+
+ END(htons)
+
+weak_alias (htons, ntohs)
diff --git a/sysdeps/sw_64/ldiv.S b/sysdeps/sw_64/ldiv.S
new file mode 100644
index 00000000..7a77d6dd
--- /dev/null
+++ b/sysdeps/sw_64/ldiv.S
@@ -0,0 +1,222 @@
+
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Richard Henderson <rth@tamu.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include "div_libc.h"
+
+#undef FRAME
+#ifdef __sw_64_fix__
+#define FRAME 0
+#else
+#define FRAME 16
+#endif
+
+#undef X
+#undef Y
+#define X $17
+#define Y $18
+
+ .set noat
+
+ .align 4
+ .globl ldiv
+ .ent ldiv
+ldiv:
+ .frame sp, FRAME, ra
+#if FRAME > 0
+ ldi sp, -FRAME(sp)
+#endif
+#ifdef PROF
+ .set macro
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ beq Y, $divbyzero
+ mov Y,t6
+ nop
+ rfpcr $f10
+
+ _ITOFT2 X, $f0, 0, Y, $f1, 8
+
+ .align 4
+ fcvtld $f0, $f11
+ fcvtld $f1, $f12
+ fdivd $f11, $f12, $f0
+ unop
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl_z $f0, $f11
+ nop
+ wfpcr $f10
+ _FTOIT $f11, $0, 0
+
+$egress:
+// mull $0, Y, $1
+ mull $0, t6, $1
+ subl X, $1, $1
+
+ stl $0, 0($16)
+ stl $1, 8($16)
+ mov $16, $0
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+
+#define Q v0 /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ bis X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f11
+
+ _FTOIT $f11, Q, 8
+$fix_sign_in_ret2:
+ mull Q, Y, QY
+ nop
+ wfpcr $f10
+
+ .align 4
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ negl Q, t4
+ sellbs t5, t4, Q, Q
+ br $egress
+
+ .align 4
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 is true if result should be negative. */
+ xor X, Y, AT
+ cmplt AT, 0, t5
+ cmplt X, 0, AT
+ negl X, t0
+
+ selne AT, t0, X, X
+ cmplt Y, 0, AT
+ negl Y, t0
+
+ selne AT, t0, Y, Y
+ blbc t5, $fix_sign_in_ret1
+
+ fcvtdl_z $f0, $f11
+ _FTOIT $f11, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+$divbyzero:
+ mov a0, v0
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+ stl zero, 0(v0)
+ stl zero, 8(v0)
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .end ldiv
+
+weak_alias (ldiv, lldiv)
+weak_alias (ldiv, imaxdiv)
diff --git a/sysdeps/sw_64/lldiv.S b/sysdeps/sw_64/lldiv.S
new file mode 100644
index 00000000..8a8ef97a
--- /dev/null
+++ b/sysdeps/sw_64/lldiv.S
@@ -0,0 +1 @@
+/* lldiv is the same as ldiv on the Sw_64. */
diff --git a/sysdeps/sw_64/lshift.S b/sysdeps/sw_64/lshift.S
new file mode 100644
index 00000000..700e9d80
--- /dev/null
+++ b/sysdeps/sw_64/lshift.S
@@ -0,0 +1,107 @@
+ # Sw_64 1621 __mpn_lshift -- Shift a limb vector left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldl and stl can be paired with the other used
+ # instructions. But there are many restrictions in the 1621 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $17,8,$17
+ subl $31,$19,$7
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ srl $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subl $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldl $3,-8($17)
+ subl $16,8,$16
+ subl $17,8,$17
+ subl $20,1,$20
+ sll $4,$19,$5
+ srl $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stl $8,0($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldl $3,-8($17)
+ subl $16,32,$16
+ subl $18,4,$18
+ sll $4,$19,$5
+ srl $3,$7,$6
+
+ ldl $4,-16($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stl $8,24($16)
+ srl $4,$7,$2
+
+ ldl $3,-24($17)
+ sll $4,$19,$5
+ bis $1,$2,$8
+ stl $8,16($16)
+ srl $3,$7,$6
+
+ ldl $4,-32($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stl $8,8($16)
+ srl $4,$7,$2
+
+ subl $17,32,$17
+ bis $1,$2,$8
+ stl $8,0($16)
+
+ bgt $18,.Loop
+
+.Lend: sll $4,$19,$8
+ stl $8,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
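A hedged C model of __mpn_lshift for reference (illustrative helper name; assumes 0 < cnt < 64, as the assembly does): it walks from the most significant limb downward, just like the loops above, and returns the bits shifted out of the top.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

static mp_limb_t
ref_mpn_lshift (mp_limb_t *res, const mp_limb_t *src, size_t size,
                unsigned int cnt)
{
  unsigned int tnc = 64 - cnt;
  mp_limb_t high = src[size - 1];
  mp_limb_t retval = high >> tnc;     /* bits pushed out of the number */

  for (size_t i = size - 1; i > 0; i--)
    {
      mp_limb_t low = src[i - 1];
      res[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  res[0] = high << cnt;
  return retval;
}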
diff --git a/sysdeps/sw_64/mul_1.S b/sysdeps/sw_64/mul_1.S
new file mode 100644
index 00000000..127f4274
--- /dev/null
+++ b/sysdeps/sw_64/mul_1.S
@@ -0,0 +1,82 @@
+ # Sw_64 1621 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Sw_64
+ # architecture. 2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR. Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_mul_1
+ .ent __mpn_mul_1 2
+__mpn_mul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ bic $31,$31,$4 # clear cy_limb
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,Lend1 # jump if size was == 1
+ ldl $2,8($17) # $2 = s1_limb
+ subl $18,1,$18 # size--
+ stl $3,0($16)
+ beq $18,Lend2 # jump if size was == 2
+
+ .align 3
+Loop: mull $2,$19,$3 # $3 = prod_low
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,16($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ stl $3,8($16)
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $16,8,$16 # res_ptr++
+ bne $18,Loop
+
+Lend2: mull $2,$19,$3 # $3 = prod_low
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ stl $3,8($16)
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+Lend1: stl $3,0($16)
+ ret $31,($26),1
+
+ .end __mpn_mul_1
diff --git a/sysdeps/sw_64/reml.S b/sysdeps/sw_64/reml.S
new file mode 100644
index 00000000..56a550d9
--- /dev/null
+++ b/sysdeps/sw_64/reml.S
@@ -0,0 +1,93 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@twiddle.net>
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+/* 32-bit signed int remainder. This is not a normal C function. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ The FPU can handle the division for all input values except zero.
+ All we have to do is compute the remainder via multiply-and-subtract.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /*__reml->__remw 20161111*/
+#ifndef EXTEND
+#define EXTEND(S,D) sextl S, D
+#endif
+
+ .text
+ .align 4
+ .globl __remw
+ .type __remw, @funcnoplt
+ .usepv __remw, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__remw:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f2, 16(sp)
+ fstd $f3, 40(sp)
+ fstd $f4, 48(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f2, 16)
+ cfi_rel_offset ($f3, 40)
+ cfi_rel_offset ($f4, 48)
+
+ rfpcr $f2
+ EXTEND (X, RV)
+ EXTEND (Y, AT)
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
+ fcvtld $f0, $f3
+ fcvtld $f1, $f4
+ fdivd $f3, $f4, $f0
+ fcvtdl_z $f0, $f3
+
+ wfpcr $f2
+ _FTOIT $f3, RV, 24
+ fldd $f0, 0(sp)
+ mulw RV, Y, RV
+ fldd $f1, 8(sp)
+ fldd $f2, 16(sp)
+ fldd $f3, 40(sp)
+ fldd $f4, 48(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_def_cfa_offset (0)
+ subw X, RV, RV
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __remw, .-__remw
+
+ DO_DIVBYZERO
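The multiply-and-subtract step mentioned in the header comment corresponds to the C sketch below; remw_model is an illustrative name, and the intermediate int64_t exists only to keep the INT32_MIN / -1 corner well-defined in C.

#include <stdint.h>

static int32_t
remw_model (int32_t x, int32_t y)   /* assumes y != 0 */
{
  int64_t q = (int64_t) ((double) x / (double) y);   /* exact, truncated */
  return (int32_t) ((int64_t) x - q * (int64_t) y);  /* x - q*y, as mulw/subw do */
}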
diff --git a/sysdeps/sw_64/remlu.S b/sysdeps/sw_64/remlu.S
new file mode 100644
index 00000000..3c12f7bf
--- /dev/null
+++ b/sysdeps/sw_64/remlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D) zapnot S, 15, D
+#define __remw __remwu
+#include <reml.S>
diff --git a/sysdeps/sw_64/remq.S b/sysdeps/sw_64/remq.S
new file mode 100644
index 00000000..6db7f628
--- /dev/null
+++ b/sysdeps/sw_64/remq.S
@@ -0,0 +1,274 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long remainder. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ .text
+ .align 4
+ .globl __reml
+ .type __reml, @funcnoplt
+ .usepv __reml, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__reml:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+
+ rfpcr $f3
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ fldd $f1, 8(sp)
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+ fcvtdl_z $f0, $f4
+
+ wfpcr $f3
+ _FTOIT $f4, AT, 16
+ mull AT, Y, AT
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ cfi_restore ($f1)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ldi sp, FRAME(sp)
+ subl X, AT, RV
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t5, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t5, 24)
+
+#define Q t0 /* quotient */
+#define R RV /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ or X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+$fix_sign_in_ret2:
+ fldd $f0, 0(sp)
+ stl t3, 0(sp)
+ cfi_restore ($f0)
+ cfi_rel_offset (t3, 0)
+
+ mull Q, Y, QY
+ stl t4, 8(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+ bne t5, $fix_sign_out
+
+$fix_sign_out_ret:
+ ldl t3, 0(sp)
+ ldl t4, 8(sp)
+ ldl t5, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore (t5)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide the current
+ remainder (R) by Y and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 records the changes we had to make:
+ bit 0: set if X was negated. Note that the sign of the
+ remainder follows the sign of the dividend.
+ bit 2: set if Y was negated.
+ */
+ xor X, Y, t1
+ cmplt X, 0, t5
+ negl X, t0
+ selne t5, t0, X, X
+
+ cmplt Y, 0, AT
+ negl Y, t0
+ s4addl AT, t5, t5
+ selne AT, t0, Y, Y
+
+ bge t1, $fix_sign_in_ret1
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+ .align 4
+$fix_sign_out:
+ /* Now we get to undo what we did above. */
+ /* ??? Is this really faster than just increasing the size of
+ the stack frame and storing X and Y in memory? */
+ and t5, 4, AT
+ negl Y, t4
+ selne AT, t4, Y, Y
+
+ negl X, t4
+ sellbs t5, t4, X, X
+ negl RV, t4
+ sellbs t5, t4, RV, RV
+
+ br $fix_sign_out_ret
+
+ cfi_endproc
+ .size __reml, .-__reml
+
+ DO_DIVBYZERO
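For readers who do not want to trace the fixup path in Sw64 assembly, here is a rough C model of the scheme the theory-of-operation comment describes: take a double-precision quotient estimate, then correct it in the integer domain. This is a sketch only; the names reml_model and fixup_uns are invented for this note, the unit-step correction loop stands in for the assembly's logarithmic shift-and-subtract fixup, and y == 0 is assumed to have been rejected already (the DIVBYZERO path).

    #include <stdint.h>

    /* Walk an estimated quotient q of x / y (unsigned) to the exact value
       and return the remainder in [0, y).  The assembly does the same
       correction with a shift-and-subtract loop instead of unit steps.  */
    static uint64_t
    fixup_uns (uint64_t x, uint64_t y, uint64_t q)
    {
      while (q * y > x)        /* estimate was too high */
        q--;
      while (x - q * y >= y)   /* estimate was too low */
        q++;
      return x - q * y;
    }

    /* Signed remainder: estimate and fix up on magnitudes, then give the
       result the sign of the dividend, as C's % operator does.  */
    static int64_t
    reml_model (int64_t x, int64_t y)
    {
      uint64_t ux = x < 0 ? -(uint64_t) x : (uint64_t) x;
      uint64_t uy = y < 0 ? -(uint64_t) y : (uint64_t) y;
      uint64_t q = (uint64_t) ((double) ux / (double) uy);  /* FP estimate */
      uint64_t r = fixup_uns (ux, uy, q);
      return x < 0 ? -(int64_t) r : (int64_t) r;
    }

When |x| < 2**53 the double estimate is already exact and the fixup loops do nothing, which is the fast path the assembly takes after the sll/sra range check.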
diff --git a/sysdeps/sw_64/remqu.S b/sysdeps/sw_64/remqu.S
new file mode 100644
index 00000000..946e031b
--- /dev/null
+++ b/sysdeps/sw_64/remqu.S
@@ -0,0 +1,292 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long remainder. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
+ clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+ results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ .text
+ .align 4
+ .globl __remlu
+ .type __remlu, @funcnoplt
+ .usepv __remlu, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__remlu:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ subl Y, 1, AT
+ and Y, AT, AT
+ beq AT, $powerof2
+ fstd $f0, 0(sp)
+
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+
+ rfpcr $f3
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+
+ blt X, $x_is_neg
+ setfpec1
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if Y was mis-converted as signed value. */
+ fldd $f1, 8(sp)
+ blt Y, $y_is_neg
+
+ /* Check to see if X fit in the double as an exact value. */
+ srl X, 53, AT
+ bne AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert, compute remainder, clean up. */
+ fcvtdl_z $f0, $f4
+ wfpcr $f3
+ _FTOIT $f4, AT, 16
+ mull AT, Y, AT
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+
+ .align 4
+ subl X, AT, RV
+ ret $31, (RA), 1
+ .align 4
+ cfi_restore_state
+$x_is_neg:
+ /* If we get here, X is so big that bit 63 is set, which made the
+ conversion come out negative. Fix it up lest we not even get
+ a good estimate. */
+ ldih AT, 0x5f80 /* 2**64 as float. */
+ fstd $f2, 24(sp)
+ fstd $f6, 72(sp)
+ cfi_rel_offset ($f2, 24)
+ cfi_rel_offset ($f6, 72)
+ _ITOFS AT, $f2, 16
+ .align 4
+ faddd $f4, $f2, $f6
+ fdivd $f6, $f5, $f0
+
+ /* Ok, we've now got the divide issued. Continue with other checks. */
+# .align 4
+ fldd $f1, 8(sp)
+ unop
+ fldd $f2, 24(sp)
+ fldd $f6, 72(sp)
+ blt Y, $y_is_neg
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f6)
+ cfi_remember_state /* for y_is_neg */
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t3, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t3, 24)
+
+#define Q t0 /* quotient */
+#define R RV /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ mull Q, Y, QY
+
+ .align 4
+ stl t4, 8(sp)
+ excb
+ fldd $f0, 0(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+ cfi_restore ($f0)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t4, 8(sp)
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+
+ ldl t3, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide the current
+ remainder (R) by Y and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+ cfi_restore_state
+$y_is_neg:
+ /* If we get here, Y is so big that bit 63 is set. The results
+ from the divide will be completely wrong. Fortunately, the
+ quotient must be either 0 or 1, so the remainder must be X
+ or X-Y, so just compute it directly. */
+ cmpule Y, X, AT
+ nop
+ wfpcr $f3
+ subl X, Y, RV
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ seleq AT, X, RV, RV
+
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+ .align 4
+ cfi_def_cfa_offset (FRAME)
+$powerof2:
+ subl Y, 1, AT
+ beq Y, DIVBYZERO
+ and X, AT, RV
+ ldi sp, FRAME(sp)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __remlu, .-__remlu
+
+ DO_DIVBYZERO
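Two special cases in __remlu above are worth restating in plain C: the $powerof2 exit, which never touches the FP unit, and the $y_is_neg path, where bit 63 of the divisor is set so the quotient can only be 0 or 1. The sketch below is illustrative only (remlu_shortcuts is an invented name, and the branch ordering here does not mirror the assembly, which checks for a power of two before issuing the divide and handles the bit-63 case afterwards); y == 0 is assumed to take the DIVBYZERO path in the real code.

    #include <stdint.h>

    /* Shortcut structure of the unsigned remainder, for illustration.  */
    static uint64_t
    remlu_shortcuts (uint64_t x, uint64_t y)
    {
      if ((y & (y - 1)) == 0)     /* y is a power of two (y == 0 traps in the
                                     real code before reaching this point) */
        return x & (y - 1);
      if (y >> 63)                /* bit 63 set: quotient is 0 or 1 */
        return y <= x ? x - y : x;
      return x % y;               /* general case: FP estimate plus fixup */
    }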
diff --git a/sysdeps/sw_64/rshift.S b/sysdeps/sw_64/rshift.S
new file mode 100644
index 00000000..81b3d742
--- /dev/null
+++ b/sysdeps/sw_64/rshift.S
@@ -0,0 +1,105 @@
+ # Sw_64 1621 __mpn_rshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldl and stl can be paired with the other used
+ # instructions. But there are many restrictions in the 1621 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ addl $17,8,$17
+ subl $31,$19,$7
+ subl $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ sll $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subl $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldl $3,0($17)
+ addl $16,8,$16
+ addl $17,8,$17
+ subl $20,1,$20
+ srl $4,$19,$5
+ sll $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stl $8,-8($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldl $3,0($17)
+ addl $16,32,$16
+ subl $18,4,$18
+ srl $4,$19,$5
+ sll $3,$7,$6
+
+ ldl $4,8($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stl $8,-32($16)
+ sll $4,$7,$2
+
+ ldl $3,16($17)
+ srl $4,$19,$5
+ bis $1,$2,$8
+ stl $8,-24($16)
+ sll $3,$7,$6
+
+ ldl $4,24($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stl $8,-16($16)
+ sll $4,$7,$2
+
+ addl $17,32,$17
+ bis $1,$2,$8
+ stl $8,-8($16)
+
+ bgt $18,.Loop
+
+.Lend: srl $4,$19,$8
+ stl $8,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
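As a compact restatement of what __mpn_rshift computes, here is a rough C equivalent. It is a sketch, not part of the patch: the name mpn_rshift_ref and the mp_limb_t typedef are illustrative, 64-bit limbs and 0 < cnt < 64 are assumed, and no attempt is made at the software pipelining the assembly performs.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Shift the size-limb number at s1_ptr right by cnt bits, store it at
       res_ptr, and return the bits shifted out of the low limb,
       left-justified (the "function result" computed from the first limb
       in the prologue above).  */
    static mp_limb_t
    mpn_rshift_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                    long size, unsigned cnt)
    {
      mp_limb_t retval = s1_ptr[0] << (64 - cnt);
      for (long i = 0; i < size - 1; i++)
        res_ptr[i] = (s1_ptr[i] >> cnt) | (s1_ptr[i + 1] << (64 - cnt));
      res_ptr[size - 1] = s1_ptr[size - 1] >> cnt;
      return retval;
    }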
diff --git a/sysdeps/sw_64/sub_n.S b/sysdeps/sw_64/sub_n.S
new file mode 100644
index 00000000..d0d5a30c
--- /dev/null
+++ b/sysdeps/sw_64/sub_n.S
@@ -0,0 +1,118 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ ldl $3,0($17)
+ ldl $4,0($18)
+
+ subl $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
+
+ subl $19,$2,$19
+
+.Loop0: subl $2,1,$2
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ addl $17,8,$17
+ addl $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addl $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subl $19,4,$19
+
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ ldl $3,16($17)
+ addl $6,$0,$6
+ ldl $4,16($18)
+ cmpult $6,$0,$1
+ subl $5,$6,$6
+ cmpult $5,$6,$0
+ stl $6,8($16)
+ or $0,$1,$0
+
+ ldl $5,24($17)
+ addl $4,$0,$4
+ ldl $6,24($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,16($16)
+ or $0,$1,$0
+
+ ldl $3,32($17)
+ addl $6,$0,$6
+ ldl $4,32($18)
+ cmpult $6,$0,$1
+ subl $5,$6,$6
+ cmpult $5,$6,$0
+ stl $6,24($16)
+ or $0,$1,$0
+
+ addl $17,32,$17
+ addl $18,32,$18
+ addl $16,32,$16
+ bne $19,.Loop
+
+.Lend: addl $4,$0,$4
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_sub_n
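The borrow propagation in __mpn_sub_n is easier to follow against a plain C reference. The sketch below is illustrative only (mpn_sub_n_ref and the mp_limb_t typedef are names chosen for this note, assuming 64-bit limbs); the assembly computes the same thing four limbs per iteration.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* {res_ptr, size} = {s1_ptr, size} - {s2_ptr, size}; return the
       borrow (0 or 1) out of the most significant limb.  */
    static mp_limb_t
    mpn_sub_n_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                   const mp_limb_t *s2_ptr, long size)
    {
      mp_limb_t borrow = 0;
      for (long i = 0; i < size; i++)
        {
          mp_limb_t a = s1_ptr[i], b = s2_ptr[i];
          mp_limb_t d = a - b;
          mp_limb_t out = d - borrow;
          /* Borrow out if a < b, or if d was zero and a borrow was pending.  */
          borrow = (a < b) | (d < borrow);
          res_ptr[i] = out;
        }
      return borrow;
    }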
diff --git a/sysdeps/sw_64/submul_1.S b/sysdeps/sw_64/submul_1.S
new file mode 100644
index 00000000..2cad2bef
--- /dev/null
+++ b/sysdeps/sw_64/submul_1.S
@@ -0,0 +1,89 @@
+ # Sw_64 1621 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_submul_1
+ .ent __mpn_submul_1 2
+__mpn_submul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ subl $5,$3,$3
+ cmpult $5,$3,$4
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ addl $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $5,$0,$0 # combine carries
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_submul_1
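For reference, the operation __mpn_submul_1 performs can be written compactly in C. This is a sketch under stated assumptions, not the implementation: the name mpn_submul_1_ref and the mp_limb_t typedef are invented for this note, limbs are assumed to be 64 bits, and unsigned __int128 assumes a GCC/Clang-style compiler (it stands in for the mull/umulh instruction pair).

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Subtract {s1_ptr, size} * s2_limb from {res_ptr, size}; return the
       high limb of the product plus the propagated borrow, as the
       assembly returns in $0.  */
    static mp_limb_t
    mpn_submul_1_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                      long size, mp_limb_t s2_limb)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          unsigned __int128 p = (unsigned __int128) s1_ptr[i] * s2_limb;
          mp_limb_t lo = (mp_limb_t) p + cy;                 /* low product + carry in */
          cy = (mp_limb_t) (p >> 64) + (lo < (mp_limb_t) p); /* high product + carry out */
          mp_limb_t r = res_ptr[i];
          res_ptr[i] = r - lo;
          cy += r < lo;                                      /* borrow from the subtract */
        }
      return cy;
    }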
diff --git a/sysdeps/sw_64/sw6a/add_n.S b/sysdeps/sw_64/sw6a/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two fadds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two fadds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
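The carry handling in this pipelined __mpn_add_n reduces to the usual limb-wise add with carry. A minimal C sketch follows, assuming 64-bit limbs; mpn_add_n_ref and the mp_limb_t typedef are names invented for this note.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* {res_ptr, size} = {s1_ptr, size} + {s2_ptr, size}; return the carry
       (0 or 1) out of the most significant limb, the value the assembly
       leaves in $0 at .Lret.  */
    static mp_limb_t
    mpn_add_n_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                   const mp_limb_t *s2_ptr, long size)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          mp_limb_t a = s1_ptr[i], b = s2_ptr[i];
          mp_limb_t s = a + b;
          mp_limb_t out = s + cy;
          /* Carry out if a + b wrapped, or if adding the old carry wrapped.  */
          cy = (s < a) | (out < s);
          res_ptr[i] = out;
        }
      return cy;
    }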
diff --git a/sysdeps/sw_64/sw6a/addmul_1.S b/sysdeps/sw_64/sw6a/addmul_1.S
new file mode 100644
index 00000000..287e8573
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with pipeline expert
+ # . Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. If first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
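Stripped of the unrolling and dual-issue scheduling discussed in the header comment, __mpn_addmul_1 computes the following. The sketch is illustrative only: mpn_addmul_1_ref and the mp_limb_t typedef are names invented for this note, 64-bit limbs are assumed, and unsigned __int128 (a GCC/Clang extension) stands in for the mull/umulh pair; the assembly produces the same result eight limbs per iteration of its unrolled loop.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Add {s1_ptr, size} * s2_limb to {res_ptr, size}; return the carry
       out of the most significant limb.  */
    static mp_limb_t
    mpn_addmul_1_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                      long size, mp_limb_t s2_limb)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < size; i++)
        {
          unsigned __int128 p = (unsigned __int128) s1_ptr[i] * s2_limb
                                + res_ptr[i] + cy;   /* cannot overflow 128 bits */
          res_ptr[i] = (mp_limb_t) p;
          cy = (mp_limb_t) (p >> 64);
        }
      return cy;
    }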
diff --git a/sysdeps/sw_64/sw6a/lshift.S b/sysdeps/sw_64/sw6a/lshift.S
new file mode 100644
index 00000000..cc00593c
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
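As with rshift, a short C reference makes the intent of __mpn_lshift clear. This is a sketch only (mpn_lshift_ref and the mp_limb_t typedef are invented names, assuming 64-bit limbs and 0 < cnt < 64); it also explains why the entry code above points r16/r17 at the end of the vectors, since the limbs are processed from the most significant end downwards.

    #include <stdint.h>
    typedef uint64_t mp_limb_t;

    /* Shift the size-limb number at s1_ptr left by cnt bits, store it at
       res_ptr, and return the bits shifted out of the high limb,
       right-justified.  */
    static mp_limb_t
    mpn_lshift_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                    long size, unsigned cnt)
    {
      mp_limb_t retval = s1_ptr[size - 1] >> (64 - cnt);
      for (long i = size - 1; i > 0; i--)
        res_ptr[i] = (s1_ptr[i] << cnt) | (s1_ptr[i - 1] >> (64 - cnt));
      res_ptr[0] = s1_ptr[0] << cnt;
      return retval;
    }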
diff --git a/sysdeps/sw_64/sw6a/rshift.S b/sysdeps/sw_64/sw6a/rshift.S
new file mode 100644
index 00000000..416c3903
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw6a/sub_n.S b/sysdeps/sw_64/sw6a/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two fadds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two fadds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
diff --git a/sysdeps/sw_64/sw6b/add_n.S b/sysdeps/sw_64/sw6b/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two fadds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two fadds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two fadds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two fadds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
diff --git a/sysdeps/sw_64/sw6b/addmul_1.S b/sysdeps/sw_64/sw6b/addmul_1.S
new file mode 100644
index 00000000..a288f040
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with pipeline expert
+ # . Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. If first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
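
Stripped of the pipelining, each limb processed above goes through the same arithmetic: mull/umulh form the full 128-bit product, the low half absorbs the incoming carry and is added into the accumulator limb, and the carry out of each of those additions is folded into the high half, which becomes the next carry. A minimal C sketch of that computation (64-bit limbs; unsigned __int128 stands in for the mull/umulh pair; names are illustrative):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* res[i] += s1[i] * s2_limb for each limb; returns the final carry limb,
   which the assembly returns in $0.  */
mp_limb_t
ref_addmul_1 (mp_limb_t *res, const mp_limb_t *s1, size_t n, mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) s1[i] * s2_limb;
      mp_limb_t lo = (mp_limb_t) p;          /* mull  */
      mp_limb_t hi = (mp_limb_t) (p >> 64);  /* umulh */

      mp_limb_t t = lo + cy;                 /* add carry into low product */
      hi += t < lo;                          /* carry out of that add */
      mp_limb_t r = res[i] + t;              /* add into the accumulator */
      hi += r < t;                           /* carry out of that add */

      res[i] = r;
      cy = hi;                               /* next iteration's carry */
    }
  return cy;
}
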
diff --git a/sysdeps/sw_64/sw6b/lshift.S b/sysdeps/sw_64/sw6b/lshift.S
new file mode 100644
index 00000000..cc00593c
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a limb vector left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
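
Ignoring the warm-up and cool-down phases, the routine above walks the limb vector from its most significant end: each result limb combines a limb shifted left by cnt with the next lower limb shifted right by 64-cnt, and the bits pushed out of the top limb are the return value. A C sketch of that behaviour (64-bit limbs and 0 < cnt < 64 assumed; names are illustrative):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Shift {s, n} left by cnt bits into res; return the bits shifted out of
   the most significant limb (computed into $0 by the assembly).  */
mp_limb_t
ref_lshift (mp_limb_t *res, const mp_limb_t *s, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;                  /* the $20 = 64 - cnt value */
  mp_limb_t retval = s[n - 1] >> tnc;       /* bits leaving the top limb */

  for (size_t i = n - 1; i > 0; i--)        /* walk from the top down */
    res[i] = (s[i] << cnt) | (s[i - 1] >> tnc);
  res[0] = s[0] << cnt;

  return retval;
}
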
diff --git a/sysdeps/sw_64/sw6b/memcpy.S b/sysdeps/sw_64/sw6b/memcpy.S
new file mode 100644
index 00000000..938ebdfc
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/memcpy.S
@@ -0,0 +1,416 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ sw6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/*
+ * Much of the information about 21264 scheduling/coding comes from:
+ * Compiler Writer's Guide for the Sw_64 21264
+ * abbreviated as 'CWG' in other comments here
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ * E - either cluster
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ * $0 - destination address
+ * $1,$2, - scratch
+ */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+
+	.type $jumppointh,@object
+$jumppointh:
+ .gprel32 $both_0mod8
+ .gprel32 J$H01
+ .gprel32 J$H02
+ .gprel32 J$H03
+ .gprel32 J$H04
+ .gprel32 J$H05
+ .gprel32 J$H06
+ .gprel32 J$H07
+
+ENTRY(memcpy)
+ .prologue 1
+ ldgp $29, 0($27)
+ mov $16, $0 # E : copy dest to return
+ ble $18, $nomoredata # U : done with the copy?
+ cmplt $18, 8, $1
+ bne $1, $less_8
+ xor $16, $17, $1 # E : are source and dest alignments the same?
+ and $1, 7, $1 # E : are they the same mod 8?
+
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
+ /* source and dest are same mod 8 address */
+ and $16, 7, $1 # E : Are both 0mod8?
+ beq $1, $both_0mod8 # U : Yes
+ nop # E :
+
+ /*
+ * source and dest are same misalignment. move a byte at a time
+ * until a 0mod8 alignment for both is reached.
+ * At least one byte more to move
+ */
+
+ ldi $2, 8
+ subl $2, $1, $1
+
+$head_align:
+ addl $16, $1, $16
+ addl $17, $1, $17
+ subl $18, $1, $18
+ ldih $2, $jumppointh($29) !gprelhigh
+ s4addl $1, $2, $2
+ ldw $2, $jumppointh($2) !gprellow
+ addl $2, $29, $2
+ jmp ($2)
+
+$both_0mod8:
+ cmple $18, 127, $1 # E : Can we unroll the loop?
+ bne $1, $no_unroll # U :
+ and $16, 63, $1 # E : get mod64 alignment
+ beq $1, $do_unroll # U : no single quads to fiddle
+
+$single_head_quad:
+ ldl $1, 0($17) # L : get 8 bytes
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stl $1, 0($16) # L : store
+ addl $16, 8, $16 # E : dest += 8
+ and $16, 63, $1 # E : get mod64 alignment
+ bne $1, $single_head_quad # U : still not fully aligned
+
+$do_unroll:
+	ldih $1, 8($31)	# bigger than 512 KB
+ cmple $18, $1, $1
+ beq $1, $unroll_body_512
+ nop
+ nop
+ cmple $18, 63, $1 # E : Can we go through the unrolled loop?
+ bne $1, $tail_quads # U : Nope
+ nop # E :
+
+$unroll_body:
+ ldl $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldl $4, 8($17) # L : bytes 8..15
+ ldl $5, 16($17) # L : bytes 16..23
+ nop # E :
+ nop # E :
+
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addl $17, 32, $17 # E : src += 32 bytes
+ stl $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stl $4, 8($16) # L : bytes 8..15
+ stl $5, 16($16) # L : bytes 16..23
+ subl $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stl $3, 24($16) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldl $6, 0($17) # L : bytes 0..7
+ ldl $4, 8($17) # L : bytes 8..15
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+ nop # E :
+
+ ldl $5, 16($17) # L : bytes 16..23
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32
+ subl $18, 64, $18 # E : count -= 64
+
+ addl $17, 32, $17 # E : src += 32
+ stl $6, -32($16) # L : bytes 0..7
+ stl $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stl $5, -16($16) # L : bytes 16..23
+ stl $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body
+ nop
+ nop
+ nop
+ br $tail_quads
+
+$unroll_body_512:
+ fillcs 128*4($17)
+ e_fillcs 128*20($17)
+
+	fillcs 128*3($16)	# added by ZJ 2022-06-20: stl_nc -> stl
+ e_fillcs 128*7($16)
+
+ ldl $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldl $4, 8($17) # L : bytes 8..15
+ ldl $5, 16($17) # L : bytes 16..23
+ nop # E :
+ nop # E :
+
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addl $17, 32, $17 # E : src += 32 bytes
+ stl $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stl $4, 8($16) # L : bytes 8..15
+ stl $5, 16($16) # L : bytes 16..23
+ subl $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stl $3, 24($16) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldl $6, 0($17) # L : bytes 0..7
+ ldl $4, 8($17) # L : bytes 8..15
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+ nop # E :
+
+ ldl $5, 16($17) # L : bytes 16..23
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32
+ subl $18, 64, $18 # E : count -= 64
+
+ addl $17, 32, $17 # E : src += 32
+ stl $6, -32($16) # L : bytes 0..7
+ stl $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stl $5, -16($16) # L : bytes 16..23
+ stl $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body_512
+
+$tail_quads:
+$no_unroll:
+ .align 4
+ subl $18, 8, $18 # E : At least a quad left?
+ blt $18, $less_than_8 # U : Nope
+ nop # E :
+ nop # E :
+
+$move_a_quad:
+ ldl $1, 0($17) # L : fetch 8
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stl $1, 0($16) # L : store 8
+ addl $16, 8, $16 # E : dest += 8
+ bge $18, $move_a_quad # U :
+ nop # E :
+
+$less_than_8:
+ .align 4
+ addl $18, 8, $18 # E : add back for trailing bytes
+ ble $18, $nomoredata # U : All-done
+ nop # E :
+ nop # E :
+
+ /* Trailing bytes */
+$tail_bytes:
+ subl $18, 1, $18 # E : count--
+ ldbu $1, 0($17) # L : fetch a byte
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store a byte
+ addl $16, 1, $16 # E : dest++
+ bgt $18, $tail_bytes # U : more to be done?
+ nop # E :
+
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+$misaligned:
+ mov $0, $4 # E : dest temp
+ and $0, 7, $1 # E : dest alignment mod8
+	beq $1, $dest_0mod8	# U : life doesn't totally suck
+ nop
+
+$aligndest:
+ ble $18, $nomoredata # U :
+ ldbu $1, 0($17) # L : fetch a byte
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+
+ stb $1, 0($4) # L : store it
+ addl $4, 1, $4 # E : dest++
+ and $4, 7, $1 # E : dest 0mod8 yet?
+ bne $1, $aligndest # U : go until we are aligned.
+
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+ subl $18, 8, $18 # E : At least a quad left?
+ blt $18, $misalign_tail # U : Nope
+ ldl_u $3, 0($17) # L : seed (rotating load) of 8 bytes
+ ldih $1, 8($31)
+ subl $1, 8, $1
+ cmple $18, $1, $1
+	beq $1, $mis_quad_big	# bigger than 512 KB
+
+$mis_quad:
+ ldl_u $16, 8($17) # L : Fetch next 8
+ ext3b $3, $17, $3 # U : masking
+ ext7b $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ stl $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addl $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad # U : More quads to move
+ nop
+ nop
+ nop
+ br $misalign_tail
+
+$mis_quad_big:
+ fillcs 128*4($17)
+ e_fillcs 128*20($17)
+ ldl_u $16, 8($17) # L : Fetch next 8
+ ext3b $3, $17, $3 # U : masking
+ ext7b $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+	fillcs 128*9($17)	# added by ZJ 2022-06-20: stl_nc -> stl
+ e_fillcs 128*15($17)
+
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ stl $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addl $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad_big # U : More quads to move
+ nop
+ nop
+
+$misalign_tail:
+ addl $18, 8, $18 # E : account for tail stuff
+ ble $18, $nomoredata # U :
+ nop
+ nop
+
+$misalign_byte:
+ ldbu $1, 0($17) # L : fetch 1
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($4) # L : store
+ addl $4, 1, $4 # E : dest++
+ bgt $18, $misalign_byte # U : more to go?
+ nop
+ br $nomoredata
+
+$less_8:
+ ldbu $1, 0($17) # L : fetch 1
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store
+ addl $16, 1, $16 # E : dest++
+ bgt $18, $less_8 # U : more to go?
+ nop
+
+$nomoredata:
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+J$H01:
+ ldbu $1,-1($17)
+ stb $1,-1($16)
+ br $both_0mod8
+
+J$H02:
+ ldh $1,-2($17)
+ sth $1,-2($16)
+ br $both_0mod8
+
+J$H03:
+ ldh $1,-2($17)
+ ldbu $2,-3($17)
+ sth $1,-2($16)
+ stb $2,-3($16)
+ br $both_0mod8
+
+J$H04:
+ ldw $1,-4($17)
+ stw $1,-4($16)
+ br $both_0mod8
+
+J$H05:
+ ldw $1,-4($17)
+ ldbu $2,-5($17)
+ stw $1,-4($16)
+ stb $2,-5($16)
+ br $both_0mod8
+
+J$H06:
+ ldw $1,-4($17)
+ ldh $2,-6($17)
+ stw $1,-4($16)
+ sth $2,-6($16)
+ br $both_0mod8
+
+J$H07:
+ ldw $1,-4($17)
+ ldh $2,-6($17)
+ ldbu $3,-7($17)
+ stw $1,-4($16)
+ sth $2,-6($16)
+ stb $3,-7($16)
+ br $both_0mod8
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
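
For the common co-aligned case, the structure above reduces to: byte copies until both pointers reach an 8-byte boundary, aligned 8-byte copies (unrolled 64 bytes per trip, with fillcs/e_fillcs prefetches once the remaining count exceeds 512 KB), and finally the trailing bytes; the misaligned path instead merges two shifted ldl_u loads per aligned store. A simplified C sketch of the co-aligned path only, with no unrolling or prefetching (illustrative names, not the glibc implementation):

#include <stddef.h>
#include <stdint.h>

void *
ref_memcpy_coaligned (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  /* Head: byte copies until the destination is 8-byte aligned (the
     co-aligned precondition means the source becomes aligned too).  */
  while (n > 0 && ((uintptr_t) d & 7) != 0)
    { *d++ = *s++; n--; }

  /* Body: one aligned 64-bit word per iteration (ldl/stl).  */
  uint64_t *dq = (uint64_t *) d;
  const uint64_t *sq = (const uint64_t *) s;
  while (n >= 8)
    { *dq++ = *sq++; n -= 8; }
  d = (unsigned char *) dq;
  s = (const unsigned char *) sq;

  /* Tail: the remaining 0..7 bytes.  */
  while (n-- > 0)
    *d++ = *s++;

  return dst;
}
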
diff --git a/sysdeps/sw_64/sw6b/memset.S b/sysdeps/sw_64/sw6b/memset.S
new file mode 100644
index 00000000..0085ac70
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/memset.S
@@ -0,0 +1,312 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+ .arch sw6b
+ .set noat
+ .set noreorder
+
+ENTRY(memset)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ /*
+ * Serious stalling happens. The only way to mitigate this is to
+ * undertake a major re-write to interleave the constant materialization
+ * with other parts of the fall-through code. This is important, even
+ * though it makes maintenance tougher.
+ * Do this later.
+ */
+ and $17, 255, $1 # E : 00000000000000ch
+ ins0b $17, 1, $2 # U : 000000000000ch00
+ mov $16, $0 # E : return value
+ mov $17, $8 # E : Save the ch
+ ble $18, $end # U : zero length requested?
+
+ addl $18, $16, $6 # E : max address to write to
+ or $1, $2, $17 # E : 000000000000chch
+ ins0b $1, 2, $3 # U : 0000000000ch0000
+ ins0b $1, 3, $4 # U : 00000000ch000000
+
+ or $3, $4, $3 # E : 00000000chch0000
+ ins1b $17, 4, $5 # U : 0000chch00000000
+ xor $16, $6, $1 # E : will complete write be within one quadword?
+ ins1b $17, 6, $2 # U : chch000000000000
+
+ or $17, $3, $17 # E : 00000000chchchch
+ or $2, $5, $2 # E : chchchch00000000
+ bic $1, 7, $1 # E : fit within a single quadword?
+ and $16, 7, $3 # E : Target addr misalignment
+
+ or $17, $2, $17 # E : chchchchchchchch
+ beq $1, $within_quad # U :
+ nop # E :
+ beq $3, $aligned # U : target is 0mod8
+
+ /*
+ * Target address is misaligned, and won't fit within a quadword.
+ */
+
+#ifdef pixman_error
+	/* If the addr is unaligned in a multi-threaded program, this would cause
+	   thread unsafety, so use stb to store the trailing bytes.  */
+ ldl_u $4, 0($16) # L : Fetch first partial
+ mov $16, $5 # E : Save the address
+ ins3b $17, $16, $2 # U : Insert new bytes
+ subl $3, 8, $3 # E : Invert (for addressing uses)
+
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
+ subl $16, $3, $16 # E : $16 is new aligned destination
+ or $2, $4, $1 # E : Final bytes
+
+ nop
+ stl_u $1,0($5) # L : Store result
+ nop
+ nop
+#else
+$misaligned:
+ stb $8, 0($16)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $16, 1, $16
+ and $16, 7, $3 # E : Target addr misalignment
+ bne $3, $misaligned
+#endif
+
+ .align 4
+$aligned:
+ /*
+ * We are now guaranteed to be quad aligned, with at least
+ * one partial quad to write.
+ */
+
+ sra $18, 3, $3 # U : Number of remaining quads to write
+ and $18, 7, $18 # E : Number of trailing bytes to write
+ mov $16, $5 # E : Save dest address
+ beq $3, $no_quad # U : tail stuff only
+
+ /*
+ * It's worth the effort to unroll this and use wh64 if possible.
+ * At this point, entry values are:
+ * $16 Current destination address
+ * $5 A copy of $16
+ * $6 The max quadword address to write to
+ * $18 Number trailer bytes
+ * $3 Number quads to write
+ */
+# and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
+ and $16, 0x1f, $2 # E : Forward work (only useful for unrolled loop)
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
+ blt $4, $loop # U :
+
+ /*
+ * We know we've got at least 16 quads, minimum of one trip
+ * through unrolled loop. Do a quad at a time to get us 0mod64
+ * aligned.
+ */
+
+ nop # E :
+ nop # E :
+ nop # E :
+# beq $1, $bigalign # U :
+ beq $2, $bigalign # U :
+$alignmod32:
+ stl $17, 0($5) # L :
+ subl $3, 1, $3 # E : For consistency later
+ addl $1, 8, $1 # E : Increment towards zero for alignment
+# addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
+
+ nop
+ nop
+ addl $5, 8, $5 # E : Inc address
+ blt $1, $alignmod32 # U :
+
+
+$bigalign:
+	ldih $1, 8($31)	# bigger than 512 KB
+ cmple $18, $1, $1
+ beq $1, $do_wh64_512
+
+ /*
+ * $3 - number quads left to go
+ * $5 - target address (aligned 0mod64)
+ * $17 - mask of stuff to store
+ * Scratch registers available: $7, $2, $4, $1
+ * We know that we'll be taking a minimum of one trip through.
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
+ * Assumes the wh64 needs to be for 2 trips through the loop in the
+ * future. The wh64 is issued for the starting destination address for
+ * trip +2 through the loop, and if there are fewer than two trips left,
+ * the target address will be for the current trip. */
+
+$do_wh64:
+# wh64 ($4) # L1 : memory subsystem write hint
+ subl $3, 24, $2 # E : For determining future wh64 addresses
+ stl $17, 0($5) # L :
+ nop # E :
+
+# addl $5, 128, $4 # E : speculative target of next wh64
+ stl $17, 8($5) # L :
+ stl $17, 16($5) # L :
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+ stl $17, 24($5) # L :
+ stl $17, 32($5) # L :
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+ stl $17, 40($5) # L :
+ stl $17, 48($5) # L :
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+ stl $17, 56($5) # L :
+ addl $5, 64, $5 # E :
+ subl $3, 8, $3 # E :
+ bge $2, $do_wh64 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ nop
+ nop
+ nop
+ br $loop # U : Might have finished already
+
+$do_wh64_512:
+# wh64 ($4) # L1 : memory subsystem write hint
+ subl $3, 24, $2 # E : For determining future wh64 addresses
+
+ fillcs 128*1($5)
+ e_fillcs 128*5($5)
+
+# stl_nc $17, 0($5) # L :
+ stl $17, 0($5) # L :
+ nop # E :
+
+# addl $5, 128, $4 # E : speculative target of next wh64
+# stl_nc $17, 8($5) # L :
+ stl $17, 8($5) # L :
+# stl_nc $17, 16($5) # L :
+ stl $17, 16($5) # L :
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+# stl_nc $17, 24($5) # L :
+ stl $17, 24($5) # L :
+# stl_nc $17, 32($5) # L :
+ stl $17, 32($5) # L :
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+# stl_nc $17, 40($5) # L :
+ stl $17, 40($5) # L :
+# stl_nc $17, 48($5) # L :
+ stl $17, 48($5) # L :
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+# stl_nc $17, 56($5) # L :
+ stl $17, 56($5) # L :
+ addl $5, 64, $5 # E :
+ subl $3, 8, $3 # E :
+ bge $2, $do_wh64_512 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ .align 4
+ /*
+ * Simple loop for trailing quadwords, or for small amounts
+ * of data (where we can't use an unrolled loop and wh64)
+ */
+$loop:
+ stl $17, 0($5) # L :
+ subl $3, 1, $3 # E : Decrement number quads left
+ addl $5, 8, $5 # E : Inc address
+ bne $3, $loop # U : more?
+
+$no_quad:
+ /*
+ * Write 0..7 trailing bytes.
+ */
+ nop # E :
+ beq $18, $end # U : All done?
+
+#ifndef pixman_error
+/* If the addr is unaligned in a multi-threaded program, this would cause
+   thread unsafety, so use stb to store the trailing bytes. */
+$trailing:
+ stb $17, 0($5)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $5, 1, $5
+ br $trailing
+#else
+ ldl $7, 0($5) # L :
+ mask7b $7, $6, $2 # U : Mask final quad
+
+ ins7b $17, $6, $4 # U : New bits
+ or $2, $4, $1 # E : Put it all together
+ stl $1, 0($5) # L : And back to memory
+ ret $31,($26),1 # L0 :
+#endif
+
+$within_quad:
+#ifdef PIXMAN_ERROR
+	/* If the addr is unaligned in a multi-threaded program, this would cause
+	   thread unsafety, so use stb to store the trailing bytes.  */
+ ldl_u $1, 0($16) # L :
+ ins3b $17, $16, $2 # U : New bits
+ mask3b $1, $16, $4 # U : Clear old
+ or $2, $4, $2 # E : New result
+
+ mask3b $2, $6, $4 # U :
+ mask7b $1, $6, $2 # U :
+ or $2, $4, $1 # E :
+ stl_u $1, 0($16) # L :
+#else
+ stb $8, 0($16)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $16, 1, $16
+ br $within_quad
+#endif
+
+$end:
+ nop
+ nop
+ nop
+ ret $31,($26),1 # L0 :
+
+ END(memset)
+libc_hidden_builtin_def (memset)
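
The setup above replicates the fill byte into a full quadword with ins0b/ins1b and or, then stores whole quadwords between a byte-stored head and tail (the stb paths selected by the pixman_error conditionals, which avoid read-modify-write of partial words). A simplified C sketch of that structure (illustrative names; the byte replication is written as a multiply instead of the insert/or sequence, and the unrolled, prefetching body is reduced to a single store per trip):

#include <stddef.h>
#include <stdint.h>

void *
ref_memset (void *dst, int c, size_t n)
{
  unsigned char *d = dst;
  uint64_t pattern = (uint64_t) (unsigned char) c * 0x0101010101010101ULL;

  /* Head: stb until the destination is 8-byte aligned.  */
  while (n > 0 && ((uintptr_t) d & 7) != 0)
    { *d++ = (unsigned char) c; n--; }

  /* Body: aligned quadword stores (stl).  */
  uint64_t *dq = (uint64_t *) d;
  while (n >= 8)
    { *dq++ = pattern; n -= 8; }
  d = (unsigned char *) dq;

  /* Tail: stb for the remaining 0..7 bytes.  */
  while (n-- > 0)
    *d++ = (unsigned char) c;

  return dst;
}
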
diff --git a/sysdeps/sw_64/sw6b/rshift.S b/sysdeps/sw_64/sw6b/rshift.S
new file mode 100644
index 00000000..ec2a78b0
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a limb vector right.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw6b/stxcpy.S b/sysdeps/sw_64/sw6b/stxcpy.S
new file mode 100644
index 00000000..cf07eb8e
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/stxcpy.S
@@ -0,0 +1,314 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy a null-terminated string from SRC to DST.
+
+ This is an internal routine used by strcpy, stpcpy, and strcat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+
+ On output:
+ t8 = bitmask (with one bit set) indicating the last byte written
+ a0 = unaligned address of the last *word* written
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxcpy, @function
+ .globl __stxcpy
+ .usepv __stxcpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxcpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ ldi t2, -1 # E : build a mask against false zero
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
+ mask7b t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mask3b t0, a1, t0 # U : assemble the first output word
+ cmpgeb zero, t2, t10 # E : bits set iff null found
+ or t0, t3, t1 # E : (stall)
+ bne t10, $a_eos # U : (stall)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == a source word not containing a null. */
+ /* Nops here to separate store quads from load quads */
+
+$a_loop:
+ stl_u t1, 0(a0) # L :
+ addl a0, 8, a0 # E :
+ nop
+ nop
+
+ ldl_u t1, 0(a1) # L : Latency=3
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t1, t10 # E : (3 cycle stall)
+ beq t10, $a_loop # U : (stall for t10)
+
+ /* Take care of the final (partial) word store.
+ On entry to this basic block we have:
+ t1 == the source word containing the null
+ t10 == the cmpgeb mask that found it. */
+$a_eos:
+ negl t10, t6 # E : find low bit set
+ and t10, t6, t8 # E : (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldl_u t0, 0(a0) # L : Latency=3
+ subl t8, 1, t6 # E :
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
+ or t8, t6, t10 # E : (stall)
+
+ zap t0, t10, t0 # E : clear dst bytes <= null
+ or t0, t1, t1 # E : (stall)
+ nop
+ nop
+
+1: stl_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxcpy:
+ /* Are source and destination co-aligned? */
+ xor a0, a1, t0 # E :
+ unop # E :
+ and t0, 7, t0 # E : (stall)
+ bne t0, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldl_u t1, 0(a1) # L : load first src word
+ and a0, 7, t0 # E : take care not to load a word ...
+ addl a1, 8, a1 # E :
+	beq t0, stxcpy_aligned	# U : ... if we won't need it (stall)
+
+ ldl_u t0, 0(a0) # L :
+ br stxcpy_aligned # L0 : Latency=3
+ nop
+ nop
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, for masking back in, if needed else 0
+ t1 == the low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldl_u t2, 8(a1) # L :
+ addl a1, 8, a1 # E :
+ ext3b t1, a1, t1 # U : (stall on a1)
+ ext7b t2, a1, t4 # U : (stall on a1)
+
+ mask3b t0, a0, t0 # U :
+ or t1, t4, t1 # E :
+ mask7b t1, a0, t1 # U : (stall on t1)
+ or t0, t1, t1 # E : (stall on t1)
+
+ or t1, t6, t6 # E :
+ cmpgeb zero, t6, t10 # E : (stall)
+ ldi t6, -1 # E : for masking just below
+ bne t10, $u_final # U : (stall)
+
+ mask3b t6, a1, t6 # U : mask out the bits we have
+ or t6, t2, t2 # E : already extracted before (stall)
+ cmpgeb zero, t2, t10 # E : testing eos (stall)
+ bne t10, $u_late_head_exit # U : (stall)
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+
+ stl_u t1, 0(a0) # L : store first output word
+ addl a0, 8, a0 # E :
+ ext3b t2, a1, t0 # U : position ho-bits of lo word
+ ldl_u t2, 8(a1) # U : read next high-order source word
+
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t2, t10 # E : (stall for t2)
+ nop # E :
+ bne t10, $u_eos # U : (stall)
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 3
+$u_loop:
+ ext7b t2, a1, t1 # U : extract high bits for current word
+ addl a1, 8, a1 # E : (stall)
+ ext3b t2, a1, t3 # U : extract low bits for next time (stall)
+ addl a0, 8, a0 # E :
+
+ or t0, t1, t1 # E : current dst word now complete
+ ldl_u t2, 0(a1) # L : Latency=3 load high word for next time
+ stl_u t1, -8(a0) # L : save the current word (stall)
+ mov t3, t0 # E :
+
+ cmpgeb zero, t2, t10 # E : test new word for eos
+ beq t10, $u_loop # U : (stall)
+ nop
+ nop
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ ext7b t2, a1, t1 # U :
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
+ cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall)
+ bne t10, $u_final # U : (stall)
+
+$u_late_head_exit:
+ stl_u t1, 0(a0) # L : the null was in the high-order bits
+ addl a0, 8, a0 # E :
+ ext3b t2, a1, t1 # U :
+ cmpgeb zero, t1, t10 # E : (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t1 == assembled source word
+ t10 == cmpgeb mask that found the null. */
+$u_final:
+ negl t10, t6 # E : isolate low bit set
+ and t6, t10, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t0, 0(a0) # E :
+ subl t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
+
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
+ or t0, t1, t1 # E : (stall)
+ nop
+ nop
+
+1: stl_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldl_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldl_u t0, 0(a0) # L :
+ ldi t6, -1 # E :
+
+ mask3b t6, a0, t6 # U :
+ nop
+ nop
+ nop
+1:
+ subl a1, t4, a1 # E : sub dest misalignment from src addr
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+ cmplt t4, t5, t8 # E :
+ beq t8, $u_head # U :
+ ldi t2, -1 # E : mask out leading garbage in source
+
+ mask7b t2, t5, t2 # U :
+ ornot t1, t2, t3 # E : (stall)
+ cmpgeb zero, t3, t10 # E : is there a zero? (stall)
+ beq t10, $u_head # U : (stall)
+
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+
+ ldl_u t0, 0(a0) # L :
+ negl t10, t6 # E : build bitmask of bytes <= zero
+ and t6, t10, t8 # E : (stall)
+ and a1, 7, t5 # E :
+
+ subl t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ srl t8, t5, t8 # U : adjust final null return value
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
+
+ and t1, t2, t1 # E : to source validity mask
+ ext3b t2, a1, t2 # U :
+ ext3b t1, a1, t1 # U : (stall)
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
+
+ or t0, t1, t1 # e1 : and put it there
+ stl_u t1, 0(a0) # .. e0 : (stall)
+ ret (t9) # e1 :
+
+ cfi_endproc
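
The null detection that drives both the aligned and unaligned loops above is the cmpgeb test: comparing a zero register against a source word sets bit i of the mask exactly when byte i of the word is zero, and the negl/and pair in the tail code then isolates the lowest such bit, i.e. the first NUL. A small C model of those two steps (the helper names are illustrative, not part of the sources):

#include <stdint.h>

/* Model of "cmpgeb zero, word, mask": one mask bit per zero byte.  */
static inline unsigned
zero_byte_mask (uint64_t word)
{
  unsigned mask = 0;
  for (int i = 0; i < 8; i++)
    if (((word >> (8 * i)) & 0xff) == 0)
      mask |= 1u << i;
  return mask;
}

/* Model of "negl t10, t6; and t10, t6, t8": keep only the lowest set
   bit, which marks the byte position of the first NUL.  */
static inline unsigned
first_null_bit (unsigned mask)
{
  return mask & -mask;
}
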
diff --git a/sysdeps/sw_64/sw6b/stxncpy.S b/sysdeps/sw_64/sw6b/stxncpy.S
new file mode 100644
index 00000000..c47029ea
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/stxncpy.S
@@ -0,0 +1,392 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy no more than COUNT bytes of the null-terminated string from
+ SRC to DST.
+
+ This is an internal routine used by strncpy, stpncpy, and strncat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+ a2 = COUNT
+
+ Furthermore, COUNT may not be zero.
+
+ On output:
+ t0 = last word written
+ t8 = bitmask (with one bit set) indicating the last byte written
+ t10 = bitmask (with one bit set) indicating the byte position of
+ the end of the range specified by COUNT
+ a0 = unaligned address of the last *word* written
+ a2 = the number of full words left in COUNT
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxncpy, @function
+ .globl __stxncpy
+ .usepv __stxncpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxncpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ ldi t2, -1 # E : build a mask against false zero
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
+ mask7b t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mask3b t0, a1, t0 # U : assemble the first output word
+ cmpgeb zero, t2, t7 # E : bits set iff null found
+ or t0, t3, t0 # E : (stall)
+ beq a2, $a_eoc # U :
+
+ bne t7, $a_eos # U :
+ nop
+ nop
+ nop
+
+ /* On entry to this basic block:
+ t0 == a source word not containing a null. */
+
+ /*
+ * nops here to:
+ * separate store quads from load quads
+ * limit of 1 bcond/quad to permit training
+ */
+$a_loop:
+ stl_u t0, 0(a0) # L :
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ nop
+
+ ldl_u t0, 0(a1) # L :
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t0, t7 # E :
+ beq a2, $a_eoc # U :
+
+ beq t7, $a_loop # U :
+ nop
+ nop
+ nop
+
+ /* Take care of the final (partial) word store. At this point
+ the end-of-count bit is set in t7 iff it applies.
+
+ On entry to this basic block we have:
+ t0 == the source word containing the null
+ t7 == the cmpgeb mask that found it. */
+$a_eos:
+ negl t7, t8 # E : find low bit set
+ and t7, t8, t8 # E : (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldl_u t1, 0(a0) # L :
+ subl t8, 1, t6 # E :
+ or t8, t6, t7 # E : (stall)
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
+
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
+ or t0, t1, t0 # e1 : (stall)
+ nop
+ nop
+
+1: stl_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Add the end-of-count bit to the eos detection bitmask. */
+$a_eoc:
+ or t10, t7, t7 # E :
+ br $a_eos # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxncpy:
+ /* Are source and destination co-aligned? */
+ ldi t2, -1 # E :
+ xor a0, a1, t1 # E :
+ and a0, 7, t0 # E : find dest misalignment
+ nop # E :
+
+ srl t2, 1, t2 # U :
+ and t1, 7, t1 # E :
+ sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall)
+ nop # E :
+
+ addl a2, t0, a2 # E : bias count by dest misalignment
+ subl a2, 1, a2 # E : (stall)
+ and a2, 7, t2 # E : (stall)
+ ldi t10, 1 # E :
+
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
+ nop # E :
+ bne t1, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldl_u t1, 0(a1) # L : load first src word
+ addl a1, 8, a1 # E :
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
+ ldl_u t0, 0(a0) # L :
+
+ br stxncpy_aligned # U :
+ nop
+ nop
+ nop
+
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, unmasked
+ t1 == the shifted low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldl_u t2, 8(a1) # L : Latency=3 load second src word
+ addl a1, 8, a1 # E :
+ mask3b t0, a0, t0 # U : mask trailing garbage in dst
+ ext7b t2, a1, t4 # U : (3 cycle stall on t2)
+
+ or t1, t4, t1 # E : first aligned src word complete (stall)
+ mask7b t1, a0, t1 # U : mask leading garbage in src (stall)
+ or t0, t1, t0 # E : first output word complete (stall)
+ or t0, t6, t6 # E : mask original data for zero test (stall)
+
+ cmpgeb zero, t6, t7 # E :
+ beq a2, $u_eocfin # U :
+ ldi t6, -1 # E :
+ nop
+
+ bne t7, $u_final # U :
+ mask3b t6, a1, t6 # U : mask out bits already seen
+ stl_u t0, 0(a0) # L : store first output word
+ or t6, t2, t2 # E :
+
+ cmpgeb zero, t2, t7 # E : find nulls in second partial
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ bne t7, $u_late_head_exit # U :
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+ ext3b t2, a1, t1 # U : position hi-bits of lo word
+ beq a2, $u_eoc # U :
+ ldl_u t2, 8(a1) # L : read next high-order source word
+ addl a1, 8, a1 # E :
+
+ ext7b t2, a1, t0 # U : position lo-bits of hi word (stall)
+ cmpgeb zero, t2, t7 # E :
+ nop
+ bne t7, $u_eos # U :
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 4
+$u_loop:
+ or t0, t1, t0 # E : current dst word now complete
+ subl a2, 1, a2 # E : decrement word count
+ ext3b t2, a1, t1 # U : extract high bits for next time
+ addl a0, 8, a0 # E :
+
+ stl_u t0, -8(a0) # L : save the current word
+ beq a2, $u_eoc # U :
+ ldl_u t2, 8(a1) # L : Latency=3 load high word for next time
+ addl a1, 8, a1 # E :
+
+ ext7b t2, a1, t0 # U : extract low bits (2 cycle stall)
+ cmpgeb zero, t2, t7 # E : test new word for eos
+ nop
+ beq t7, $u_loop # U :
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ or t0, t1, t0 # E : first (partial) source word complete
+ nop
+ cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall)
+ bne t7, $u_final # U : (stall)
+
+ stl_u t0, 0(a0) # L : the null was in the high-order bits
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ nop
+
+$u_late_head_exit:
+ ext3b t2, a1, t0 # U :
+ cmpgeb zero, t0, t7 # E :
+ or t7, t10, t6 # E : (stall)
+ seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t0 == assembled source word
+ t7 == cmpgeb mask that found the null. */
+$u_final:
+ negl t7, t6 # E : isolate low bit set
+ and t6, t7, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t1, 0(a0) # L :
+ subl t8, 1, t6 # E :
+ or t6, t8, t7 # E : (stall)
+ zapnot t0, t7, t0 # U : kill source bytes > null
+
+ zap t1, t7, t1 # U : kill dest bytes <= null
+ or t0, t1, t0 # E : (stall)
+ nop
+ nop
+
+1: stl_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+
+ /* Got to end-of-count before end of string.
+ On entry to this basic block:
+ t1 == the shifted high-order bits from the previous source word */
+$u_eoc:
+ and a1, 7, t6 # E :
+ sll t10, t6, t6 # U : (stall)
+ and t6, 0xff, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t2, 8(a1) # L : load final src word
+ nop
+ ext7b t2, a1, t0 # U : extract low bits for last word (stall)
+ or t1, t0, t1 # E : (stall)
+
+1: cmpgeb zero, t1, t7 # E :
+ mov t1, t0
+
+$u_eocfin: # end-of-count, final word
+ or t10, t7, t7 # E :
+ br $u_final # L0 : Latency=3
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldl_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldl_u t0, 0(a0) # L :
+ ldi t6, -1 # E :
+
+ mask3b t6, a0, t6 # U :
+ nop
+ nop
+1: subl a1, t4, a1 # E : sub dest misalignment from src addr
+
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+
+ cmplt t4, t5, t8 # E :
+ ext3b t1, a1, t1 # U : shift src into place
+ ldi t2, -1 # E : for creating masks later
+ beq t8, $u_head # U : (stall)
+
+ mask7b t2, t5, t2 # U : begin src byte validity mask
+ cmpgeb zero, t1, t7 # E : is there a zero?
+ ext3b t2, a1, t2 # U :
+ or t7, t10, t5 # E : test for end-of-count too
+
+ cmpgeb zero, t2, t3 # E :
+ seleq a2, t5, t7, t7 # E : Latency=2, extra map slot
+ nop # E : keep with seleq
+ andnot t7, t3, t7 # E : (stall)
+
+ beq t7, $u_head # U :
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+ ldl_u t0, 0(a0) # L :
+ negl t7, t6 # E : build bitmask of bytes <= zero
+ mask7b t1, t4, t1 # U :
+
+ and t6, t7, t8 # E :
+ subl t8, 1, t6 # E : (stall)
+ or t6, t8, t7 # E : (stall)
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
+
+ zapnot t1, t7, t1 # U : to source validity mask
+ andnot t0, t2, t0 # E : zero place for source to reside
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
+ stl_u t0, 0(a0) # L : (stall)
+
+ ret (t9) # L0 : Latency=3
+
+ cfi_endproc
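
For readers unfamiliar with the unaligned-copy scheme the comments above describe, the following C fragment is a rough model of the technique (a sketch with invented names, not part of the patch): each aligned destination word is assembled from the high bytes of the previous source word and the low bytes of the current one, and a cmpgeb-style test looks for a zero byte in each freshly loaded word.

    /* Rough C model of the byte-shifting merge (ext3b/ext7b) and the
       zero-byte test (cmpgeb) used above.  Little-endian, 64-bit words;
       names are illustrative only.  */
    #include <stdint.h>

    /* Merge two consecutive aligned source words for a source that is
       'misalign' bytes (1..7) past an 8-byte boundary.  */
    static uint64_t
    merge_words (uint64_t lo_word, uint64_t hi_word, unsigned misalign)
    {
      unsigned s = misalign * 8;
      return (lo_word >> s) | (hi_word << (64 - s));
    }

    /* Nonzero iff some byte of W is zero; the usual C analogue of what
       cmpgeb zero, w, t7 computes bytewise in the assembly.  */
    static int
    has_zero_byte (uint64_t w)
    {
      return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
    }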
diff --git a/sysdeps/sw_64/sw6b/sub_n.S b/sysdeps/sw_64/sw6b/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
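
The software pipelining above obscures the underlying per-limb operation, so here is a plain C model of what __mpn_sub_n computes (a sketch with illustrative names, not glibc source). The two cmpult compares per limb correspond to c1 and c2 below, and or-ing them is safe because they can never both be set.

    #include <stddef.h>
    #include <stdint.h>

    /* res = s1 - s2 over n limbs, returning the final borrow.  */
    uint64_t
    mpn_sub_n_model (uint64_t *res, const uint64_t *s1,
                     const uint64_t *s2, size_t n)
    {
      uint64_t cy = 0;                      /* borrow, 0 or 1 */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = s2[i] + cy;          /* "cy add"   (addl)        */
          uint64_t c1 = t < cy;             /* carry out  (cmpult)      */
          uint64_t d = s1[i] - t;           /* "main sub" (subl)        */
          uint64_t c2 = s1[i] < d;          /* borrow out (cmpult)      */
          res[i] = d;
          cy = c1 | c2;                     /* combine (or); never both set */
        }
      return cy;
    }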
diff --git a/sysdeps/sw_64/sw8a/add_n.S b/sysdeps/sw_64/sw8a/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
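
The add_n routine uses the same carry scheme; the snippet below (again only a sketch with invented names) isolates why a single or per limb is enough to combine the two carry bits computed by the paired cmpult instructions.

    /* One limb of __mpn_add_n, matching the addl/cmpult/or pattern above.
       c1 can be 1 only when the s2 limb is all-ones and cy is 1, in which
       case t is 0 and c2 is necessarily 0, so c1 | c2 never drops a carry.  */
    #include <stdint.h>

    static uint64_t
    add_limb (uint64_t a, uint64_t b, uint64_t cy, uint64_t *r)
    {
      uint64_t t = b + cy;        /* "cy add":    addl $0,$25,$28    */
      uint64_t c1 = t < cy;       /* carry:       cmpult $28,$25,$8  */
      *r = a + t;                 /* "main add":  addl $4,$28,$20    */
      uint64_t c2 = *r < t;       /* carry:       cmpult $20,$28,$25 */
      return c1 | c2;             /* combine:     or $8,$25,$25      */
    }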
diff --git a/sysdeps/sw_64/sw8a/addmul_1.S b/sysdeps/sw_64/sw8a/addmul_1.S
new file mode 100644
index 00000000..95487c26
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with a pipeline expert.
+ # Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carries
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carries has to happen on one side {0} of the machine. Note the total
+ # use of U0, and the total use of L0 (after attending to the stores),
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. If first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have an ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
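
Stripped of the unrolling and dual-issue scheduling, the operation the routine above performs is res[i] += s1[i] * s2_limb with carry, where mull and umulh deliver the low and high halves of each 128-bit product. A compact C model (illustrative names, relying on the GCC/Clang unsigned __int128 extension) looks like this:

    #include <stddef.h>
    #include <stdint.h>

    /* res[0..n) += s1[0..n) * limb; returns the limb carried out the top.  */
    uint64_t
    mpn_addmul_1_model (uint64_t *res, const uint64_t *s1, size_t n,
                        uint64_t limb)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) s1[i] * limb;
          uint64_t lo = (uint64_t) p;          /* mull  */
          uint64_t hi = (uint64_t) (p >> 64);  /* umulh */
          lo += cy;
          hi += lo < cy;                       /* carry from adding cy     */
          uint64_t r = res[i] + lo;
          cy = hi + (r < lo);                  /* carry from adding res[i] */
          res[i] = r;
        }
      return cy;
    }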
diff --git a/sysdeps/sw_64/sw8a/lshift.S b/sysdeps/sw_64/sw8a/lshift.S
new file mode 100644
index 00000000..76f1fb0e
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a number left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
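
For reference, the unrolled shift above computes the following (a C sketch with invented names; the rshift.S that follows is the mirror image, walking up from the low limb with the shift directions swapped):

    #include <stddef.h>
    #include <stdint.h>

    /* Shift the n-limb number s1 left by cnt bits (1 <= cnt <= 63), store it
       at res, and return the bits shifted out of the top limb.  Like the
       assembly, it walks from the most significant limb downward, so the
       regions may overlap when res >= s1.  */
    uint64_t
    mpn_lshift_model (uint64_t *res, const uint64_t *s1, size_t n,
                      unsigned cnt)
    {
      unsigned tnc = 64 - cnt;
      uint64_t high = s1[n - 1];
      uint64_t retval = high >> tnc;          /* "compute function result" */
      for (size_t i = n - 1; i > 0; i--)
        {
          uint64_t low = s1[i - 1];
          res[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      res[0] = high << cnt;
      return retval;
    }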
diff --git a/sysdeps/sw_64/sw8a/rshift.S b/sysdeps/sw_64/sw8a/rshift.S
new file mode 100644
index 00000000..ec2a78b0
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a number right.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw8a/sub_n.S b/sysdeps/sw_64/sw8a/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
diff --git a/sysdeps/sw_64/udiv_qrnnd.S b/sysdeps/sw_64/udiv_qrnnd.S
new file mode 100644
index 00000000..054034cd
--- /dev/null
+++ b/sysdeps/sw_64/udiv_qrnnd.S
@@ -0,0 +1,159 @@
+ # Sw_64 1621 __udiv_qrnnd
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+ .set noreorder
+ .set noat
+
+ .text
+
+LEAF(__udiv_qrnnd, 0)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldi cnt,16
+ blt d,$largedivisor
+
+$loop1: cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ subl cnt,1,cnt
+ bgt cnt,$loop1
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+$largedivisor:
+ and n0,1,$4
+
+ srl n0,1,n0
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6
+ srl d,1,$5
+ addl $5,$6,$5
+
+$loop2: cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ subl cnt,1,cnt
+ bgt cnt,$loop2
+
+ addl n1,n1,n1
+ addl $4,n1,n1
+ bne $6,$Odd
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+$Odd:
+ /* q' in n0. r' in n1 */
+ addl n1,n0,n1
+
+ cmpult n1,n0,tmp # tmp := carry from addl
+ subl n1,d,AT
+ addl n0,tmp,n0
+ selne tmp,AT,n1,n1
+
+ cmpult n1,d,tmp
+ addl n0,1,AT
+ seleq tmp,AT,n0,n0
+ subl n1,d,AT
+ seleq tmp,AT,n1,n1
+
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __udiv_qrnnd
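
The $loop1 body above is four copies of one shift-and-subtract step, run 16 times for 64 quotient bits in total. A C model of that simple path (invented names; it assumes n1 < d and that the sign bit of d is clear, which is the case whenever blt d,$largedivisor does not branch; the $largedivisor path handles top-bit-set divisors by halving first) is:

    #include <stdint.h>

    /* Divide the two-limb value n1:n0 by d, returning the quotient and
       storing the remainder through rem.  Restoring shift-and-subtract,
       one quotient bit per iteration.  */
    uint64_t
    udiv_qrnnd_model (uint64_t *rem, uint64_t n1, uint64_t n0, uint64_t d)
    {
      for (int i = 0; i < 64; i++)
        {
          uint64_t top = n0 >> 63;      /* cmplt n0,0,tmp              */
          n1 = (n1 << 1) | top;         /* addl n1,n1 ; bis n1,tmp     */
          n0 <<= 1;                     /* addl n0,n0                  */
          if (d <= n1)                  /* cmpule d,n1,qb              */
            {
              n1 -= d;                  /* selne qb,tmp,n1,n1          */
              n0 |= 1;                  /* bis n0,qb,n0 : quotient bit */
            }
        }
      *rem = n1;
      return n0;                        /* quotient accumulated in n0 */
    }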
--
2.25.1